From 31c92d3d164350019fd8a528eee24e8469058788 Mon Sep 17 00:00:00 2001 From: Chester Ismay Date: Sun, 21 Jun 2026 09:46:52 -0700 Subject: [PATCH 1/4] tidy_summary: compute Q1/Q3 with linear interpolation (configurable) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit polars `.quantile()` defaults to "nearest", which diverges from R's quantile() (type 7), NumPy, and the quartiles drawn by Plotly/ggplot2 boxplots. Add an `interpolation="linear"` parameter (new default) so tidy_summary's five-number summary matches the standard textbook/R convention and the boxplots shown beside it. Pass interpolation="nearest" to restore the old behavior. Queued for the next release (0.1.1) — not yet published. Co-Authored-By: Claude Opus 4.8 --- moderndive/modeling.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/moderndive/modeling.py b/moderndive/modeling.py index c4834f5..b3aeb14 100644 --- a/moderndive/modeling.py +++ b/moderndive/modeling.py @@ -362,13 +362,23 @@ def get_regression_summaries(model, digits: int = 3) -> pl.DataFrame: return table.with_columns(pl.col(float_cols).round(digits)) -def tidy_summary(data, columns: list[str] | None = None, digits: int = 3) -> pl.DataFrame: +def tidy_summary( + data, + columns: list[str] | None = None, + digits: int = 3, + interpolation: str = "linear", +) -> pl.DataFrame: """Per-variable summary statistics for the selected columns. Mirrors the R ``moderndive::tidy_summary`` column layout: ``column, n, group, type, min, Q1, mean, median, Q3, max, sd``. Numeric columns get the five-number summary + mean/sd; non-numeric columns report ``n`` and ``type`` with the numeric fields left null. + + ``interpolation`` selects how ``Q1``/``Q3`` are computed when a quartile falls + between two observations. The default ``"linear"`` matches R's ``quantile()`` + (type 7), NumPy, and the quartiles drawn by Plotly/ggplot2 boxplots; pass any + other polars quantile method (e.g. 
``"nearest"``) to override. """ df = data if isinstance(data, pl.DataFrame) else pl.from_pandas(data) columns = columns or df.columns @@ -395,10 +405,10 @@ def tidy_summary(data, columns: list[str] | None = None, digits: int = 3) -> pl. s = series.drop_nulls() row.update( min=round(float(s.min()), digits), - Q1=round(float(s.quantile(0.25)), digits), + Q1=round(float(s.quantile(0.25, interpolation=interpolation)), digits), mean=round(float(s.mean()), digits), median=round(float(s.median()), digits), - Q3=round(float(s.quantile(0.75)), digits), + Q3=round(float(s.quantile(0.75, interpolation=interpolation)), digits), max=round(float(s.max()), digits), sd=round(float(s.std()), digits), ) From 3c0f89084aa639b6f7560194456093f09450b43f Mon Sep 17 00:00:00 2001 From: Chester Ismay Date: Sun, 21 Jun 2026 09:53:51 -0700 Subject: [PATCH 2/4] docs: add CHANGELOG Unreleased entry for tidy_summary linear interpolation Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 283a416..6d57472 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # Changelog +## Unreleased + +### Changed + +- `tidy_summary()` now computes `Q1`/`Q3` with **linear** quantile interpolation + by default (matching R's `quantile()` type 7, NumPy, and the quartiles drawn by + Plotly/ggplot2 boxplots) instead of polars' default `"nearest"`. A new + `interpolation=` parameter exposes the choice; pass `interpolation="nearest"` + to restore the previous behavior. 
+ ## 0.1.0 (2026-06-20) Initial release of the Python companion to **ModernDive: Statistical Inference From f1a41c2db7109a40b5e07be6e258ab1fe010ad5e Mon Sep 17 00:00:00 2001 From: Chester Ismay Date: Sun, 21 Jun 2026 10:09:05 -0700 Subject: [PATCH 3/4] chisq_test: Yates correction by default (correct=True), matching R MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R's chisq.test and this package's prop_test both default to Yates' continuity correction; chisq_test previously hardcoded correction=False, an inconsistency. Add a `correct: bool = True` parameter (applied only to 2x2 tables, like R and scipy); pass correct=False for the uncorrected Pearson statistic that matches the simulation-based calculate(stat="Chisq"). 322 tests pass at 100% coverage. Queued for 0.1.1 — not yet released. Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 5 +++++ moderndive/infer/wrappers.py | 10 +++++++++- tests/test_infer_parity.py | 8 +++++++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d57472..9924eab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,11 @@ Plotly/ggplot2 boxplots) instead of polars' default `"nearest"`. A new `interpolation=` parameter exposes the choice; pass `interpolation="nearest"` to restore the previous behavior. +- `chisq_test()` now applies **Yates' continuity correction by default** for the + test of independence (`correct=True`), matching R's `chisq.test` and the + package's `prop_test`. As in R, the correction only affects 2x2 tables. Pass + `correct=False` for the uncorrected Pearson statistic (e.g. to match the + simulation-based `calculate(stat="Chisq")`). 
## 0.1.0 (2026-06-20) diff --git a/moderndive/infer/wrappers.py b/moderndive/infer/wrappers.py index a1de893..272f168 100644 --- a/moderndive/infer/wrappers.py +++ b/moderndive/infer/wrappers.py @@ -181,6 +181,7 @@ def chisq_test( response: str | None = None, explanatory: str | None = None, p: dict | None = None, + correct: bool = True, ) -> pl.DataFrame: """Tidy chi-squared test. @@ -188,6 +189,13 @@ def chisq_test( response and a ``p={level: probability, ...}`` mapping, it is a **goodness-of-fit** test against those hypothesized proportions. Returns ``statistic``, ``chisq_df``, ``p_value``. + + ``correct`` applies Yates' continuity correction to the test of independence, + matching R's ``chisq.test`` default (``correct=TRUE``) and ``prop_test``; like + R, the correction only affects 2x2 tables (one degree of freedom). Pass + ``correct=False`` for the uncorrected Pearson statistic (e.g. to match the + simulation-based ``calculate(stat="Chisq")``). It does not apply to the + goodness-of-fit case. """ from scipy import stats @@ -210,7 +218,7 @@ def chisq_test( ) sub = data.select(resp, expl).drop_nulls() table = sub.to_pandas().pivot_table(index=resp, columns=expl, aggfunc="size", fill_value=0) - chi2, pval, dof, _ = stats.chi2_contingency(table.to_numpy(), correction=False) + chi2, pval, dof, _ = stats.chi2_contingency(table.to_numpy(), correction=correct) return pl.DataFrame( {"statistic": [float(chi2)], "chisq_df": [int(dof)], "p_value": [float(pval)]} ) diff --git a/tests/test_infer_parity.py b/tests/test_infer_parity.py index 5c75e64..339f094 100644 --- a/tests/test_infer_parity.py +++ b/tests/test_infer_parity.py @@ -127,9 +127,15 @@ def test_t_test_one_sample_tidy_columns(): def test_chisq_test_df_and_stat(): + # Default applies Yates' continuity correction (matches R's chisq.test and + # prop_test); on this weak 2x2 association the corrected statistic is ~0. 
out = chisq_test(_yawn(), formula="yawn ~ group") assert out["chisq_df"][0] == 1 - assert out["statistic"][0] > 0 + assert out["statistic"][0] >= 0 + # The uncorrected Pearson statistic is strictly positive and larger. + raw = chisq_test(_yawn(), formula="yawn ~ group", correct=False) + assert raw["statistic"][0] > 0 + assert raw["statistic"][0] > out["statistic"][0] # --- bias-corrected CI ---------------------------------------------------- From 2d194c5452b28ce02aa1c670ddbfb23fbd0d8e44 Mon Sep 17 00:00:00 2001 From: Chester Ismay Date: Mon, 22 Jun 2026 08:01:20 -0700 Subject: [PATCH 4/4] make tidy_summary/chisq_test changes non-breaking (preserve 0.1.0 defaults) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both additions previously flipped a default, changing results for existing 0.1.0 users. Revert the defaults to 0.1.0 behavior and keep the new behavior opt-in: - tidy_summary(interpolation=): default back to "nearest"; pass "linear" for R's quantile() type 7 / Plotly/ggplot2 boxplot quartiles. - chisq_test(correct=): default back to False (uncorrected Pearson, matching calculate(stat="Chisq")); pass correct=True for R's chisq.test/prop_test. Both are now purely additive (new opt-in parameters) and safe for a 0.1.1 patch. Also document a CHANGELOG/RELEASING convention requiring any breaking change to get a dedicated "⚠️ Breaking changes" section and a non-patch version bump. 322 tests, 100% coverage, ruff clean. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_017CTL1QSTg1DmDUpqYuPEog --- CHANGELOG.md | 33 +++++++++++++++++++++------------ RELEASING.md | 4 ++++ moderndive/infer/wrappers.py | 12 ++++++------ moderndive/modeling.py | 9 +++++---- tests/test_infer_parity.py | 14 +++++++------- 5 files changed, 43 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9924eab..90f9082 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,19 +1,28 @@ # Changelog + + ## Unreleased -### Changed - -- `tidy_summary()` now computes `Q1`/`Q3` with **linear** quantile interpolation - by default (matching R's `quantile()` type 7, NumPy, and the quartiles drawn by - Plotly/ggplot2 boxplots) instead of polars' default `"nearest"`. A new - `interpolation=` parameter exposes the choice; pass `interpolation="nearest"` - to restore the previous behavior. -- `chisq_test()` now applies **Yates' continuity correction by default** for the - test of independence (`correct=True`), matching R's `chisq.test` and the - package's `prop_test`. As in R, the correction only affects 2x2 tables. Pass - `correct=False` for the uncorrected Pearson statistic (e.g. to match the - simulation-based `calculate(stat="Chisq")`). +### Added + +- `tidy_summary()` gains an `interpolation=` parameter controlling how `Q1`/`Q3` + are computed. The default is unchanged from 0.1.0 (`"nearest"`); pass + `interpolation="linear"` for R's `quantile()` type 7 — also NumPy's default and + the quartiles drawn by Plotly/ggplot2 boxplots. **Non-breaking** (default + preserved). +- `chisq_test()` gains a `correct=` parameter for Yates' continuity correction on + the test of independence. The default is unchanged from 0.1.0 (`correct=False`, + the uncorrected Pearson statistic, matching the simulation-based + `calculate(stat="Chisq")`); pass `correct=True` to match R's + `chisq.test`/`prop_test`. As in R, the correction only affects 2x2 tables. + **Non-breaking** (default preserved). 
## 0.1.0 (2026-06-20) diff --git a/RELEASING.md b/RELEASING.md index c556294..a02c005 100644 --- a/RELEASING.md +++ b/RELEASING.md @@ -46,6 +46,10 @@ unzip -l dist/*.whl | grep -c parquet # sanity: bundled datasets are present ## Cutting a release 1. **Pick the version** (PyPI versions are immutable — you can't re-upload one). + Any change that can alter existing users' results is **breaking**: it needs a + dedicated `### ⚠️ Breaking changes` section in `CHANGELOG.md` (what changed, + how to restore the old behavior, why) and a **minor/major** bump — never a + patch. Prefer adding an opt-in parameter with the old default to avoid breaking. 2. **Bump `version`** in `pyproject.toml`. 3. **Update `CHANGELOG.md`**: rename the `## Unreleased` section to `## <version> (YYYY-MM-DD)` and start a fresh empty `## Unreleased` above it. diff --git a/moderndive/infer/wrappers.py b/moderndive/infer/wrappers.py index 272f168..6d60e63 100644 --- a/moderndive/infer/wrappers.py +++ b/moderndive/infer/wrappers.py @@ -181,7 +181,7 @@ def chisq_test( response: str | None = None, explanatory: str | None = None, p: dict | None = None, - correct: bool = True, + correct: bool = False, ) -> pl.DataFrame: """Tidy chi-squared test. @@ -190,11 +190,11 @@ def chisq_test( test against those hypothesized proportions. Returns ``statistic``, ``chisq_df``, ``p_value``. - ``correct`` applies Yates' continuity correction to the test of independence, - matching R's ``chisq.test`` default (``correct=TRUE``) and ``prop_test``; like - R, the correction only affects 2x2 tables (one degree of freedom). Pass - ``correct=False`` for the uncorrected Pearson statistic (e.g. to match the - simulation-based ``calculate(stat="Chisq")``). It does not apply to the + ``correct`` applies Yates' continuity correction to the test of independence. + It defaults to ``False`` — the uncorrected Pearson statistic, matching + moderndive 0.1.0 and the simulation-based ``calculate(stat="Chisq")``. 
Pass + ``correct=True`` to match R's ``chisq.test``/``prop_test`` default; like R, the + correction only affects 2x2 tables (one degree of freedom) and never the goodness-of-fit case. """ from scipy import stats diff --git a/moderndive/modeling.py b/moderndive/modeling.py index b3aeb14..72c3d75 100644 --- a/moderndive/modeling.py +++ b/moderndive/modeling.py @@ -366,7 +366,7 @@ def tidy_summary( data, columns: list[str] | None = None, digits: int = 3, - interpolation: str = "linear", + interpolation: str = "nearest", ) -> pl.DataFrame: """Per-variable summary statistics for the selected columns. @@ -376,9 +376,10 @@ def tidy_summary( report ``n`` and ``type`` with the numeric fields left null. ``interpolation`` selects how ``Q1``/``Q3`` are computed when a quartile falls - between two observations. The default ``"linear"`` matches R's ``quantile()`` - (type 7), NumPy, and the quartiles drawn by Plotly/ggplot2 boxplots; pass any - other polars quantile method (e.g. ``"nearest"``) to override. + between two observations. The default ``"nearest"`` matches moderndive 0.1.0 + (polars' default). Pass ``interpolation="linear"`` for R's ``quantile()`` type + 7 — also NumPy's default and the quartiles drawn by Plotly/ggplot2 boxplots — + or any other polars quantile method. """ df = data if isinstance(data, pl.DataFrame) else pl.from_pandas(data) columns = columns or df.columns diff --git a/tests/test_infer_parity.py b/tests/test_infer_parity.py index 339f094..1cd32c2 100644 --- a/tests/test_infer_parity.py +++ b/tests/test_infer_parity.py @@ -127,15 +127,15 @@ def test_t_test_one_sample_tidy_columns(): def test_chisq_test_df_and_stat(): - # Default applies Yates' continuity correction (matches R's chisq.test and - # prop_test); on this weak 2x2 association the corrected statistic is ~0. + # Default is the uncorrected Pearson statistic (matches moderndive 0.1.0 and + # the simulation-based calculate(stat="Chisq")) — strictly positive here. 
out = chisq_test(_yawn(), formula="yawn ~ group") assert out["chisq_df"][0] == 1 - assert out["statistic"][0] >= 0 - # The uncorrected Pearson statistic is strictly positive and larger. - raw = chisq_test(_yawn(), formula="yawn ~ group", correct=False) - assert raw["statistic"][0] > 0 - assert raw["statistic"][0] > out["statistic"][0] + assert out["statistic"][0] > 0 + # Opt into Yates' continuity correction (R's chisq.test default); on this weak + # 2x2 association the corrected statistic is smaller. + corrected = chisq_test(_yawn(), formula="yawn ~ group", correct=True) + assert corrected["statistic"][0] < out["statistic"][0] # --- bias-corrected CI ----------------------------------------------------