Skip to content

Calibration

neureptrace.calibration

aggregate_reliability_bins(csv_paths)

Aggregate reliability-bin CSVs emitted by neureptrace.mne_time_decode.

Source code in src/neureptrace/calibration.py
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
def aggregate_reliability_bins(csv_paths: list[Path]) -> pd.DataFrame:
    """Pool reliability-bin CSVs emitted by ``neureptrace.mne_time_decode``.

    Every CSV is read with pandas and passed through
    ``_validate_reliability_bins``. Tables without a ``decoder`` or
    ``emission_mode`` column are labelled ``"overall"`` and ``"calibrated"``
    respectively. Rows that share decoder, emission mode, time and bin
    edges are then merged: sample counts are summed and accuracy/confidence
    are averaged with weights proportional to each row's ``n_samples``.

    Args:
        csv_paths: Paths of the reliability-bin CSV files to combine.

    Returns:
        One row per group with the grouping columns followed by
        ``n_samples``, ``accuracy``, ``confidence`` and ``gap`` (accuracy
        minus confidence). Groups whose total sample count is zero carry
        NaN in the three metric columns.

    Raises:
        ValueError: If ``csv_paths`` is empty.
    """
    if not csv_paths:
        raise ValueError("At least one calibration-bin CSV path is required.")

    # Labels applied when a file does not distinguish these dimensions itself.
    fallback_labels = {"decoder": "overall", "emission_mode": "calibrated"}

    tables = []
    for path in csv_paths:
        table = _validate_reliability_bins(pd.read_csv(path), path)
        for column, label in fallback_labels.items():
            if column not in table.columns:
                table[column] = label
        table["source_file"] = path.name
        tables.append(table)

    key_columns = ["decoder", "emission_mode", "time", "bin", "bin_left", "bin_right"]
    pooled = pd.concat(tables, ignore_index=True)
    missing = float("nan")

    records = []
    for key, members in pooled.groupby(key_columns, sort=True):
        total = int(members["n_samples"].sum())
        record = dict(zip(key_columns, key, strict=True))
        record["n_samples"] = total
        if not total:
            # No samples landed in this bin anywhere, so the metrics are undefined.
            record["accuracy"] = missing
            record["confidence"] = missing
            record["gap"] = missing
        else:
            share = members["n_samples"] / total
            # NaN metrics are zero-filled before the weighted sum so that a
            # missing value in one row does not turn the product into NaN.
            mean_accuracy = float((members["accuracy"].fillna(0.0) * share).sum())
            mean_confidence = float((members["confidence"].fillna(0.0) * share).sum())
            record["accuracy"] = mean_accuracy
            record["confidence"] = mean_confidence
            record["gap"] = mean_accuracy - mean_confidence
        records.append(record)
    return pd.DataFrame(records)

build_calibration_report(summary_csv, *, baseline_window=(-0.1, 0.0), effect_window=(0.1, 0.8))

Build a Markdown report that foregrounds calibration metrics.

Source code in src/neureptrace/calibration.py
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
def build_calibration_report(
    summary_csv: Path,
    *,
    baseline_window: tuple[float, float] = (-0.1, 0.0),
    effect_window: tuple[float, float] = (0.1, 0.8),
) -> str:
    """Render the calibration metrics of a summary CSV as Markdown.

    The CSV is loaded with pandas and reduced by
    ``summarize_calibration_metrics``. The report opens with a title and a
    bullet list naming the source file and both time windows, followed by
    one table row per summarized entry. An "Emission mode" column is
    included only when the summary has an ``emission_mode`` column.

    Args:
        summary_csv: Path of the summary CSV to report on.
        baseline_window: Pair of bounds forwarded to
            ``summarize_calibration_metrics`` and echoed in the report.
        effect_window: Pair of bounds forwarded to
            ``summarize_calibration_metrics`` and echoed in the report.

    Returns:
        The Markdown document as a single string ending with a newline.
    """
    metrics = summarize_calibration_metrics(
        pd.read_csv(summary_csv),
        baseline_window=baseline_window,
        effect_window=effect_window,
    )
    with_mode = "emission_mode" in metrics.columns

    label_headers = ["Decoder", "Emission mode"] if with_mode else ["Decoder"]
    metric_headers = [
        "Subjects",
        "Effect ECE",
        "Effect Brier",
        "Effect log loss",
        "Effect accuracy",
        "Baseline accuracy",
        "Best ECE time (s)",
        "Best ECE",
        "Accuracy at best ECE",
    ]
    # Summary fields rendered through _format_float, in table-column order
    # (the subject count precedes them and is printed as-is).
    float_fields = (
        "effect_ece_mean",
        "effect_brier_mean",
        "effect_log_loss_mean",
        "effect_accuracy_mean",
        "baseline_accuracy_mean",
        "best_ece_time",
        "best_ece",
        "accuracy_at_best_ece",
    )

    def table_row(cells):
        """Join already-formatted cells into one Markdown table line."""
        return "| " + " | ".join(cells) + " |"

    report = [
        "# NeuRepTrace Calibration Report",
        "",
        f"- Summary CSV: `{summary_csv}`",
        f"- Baseline window: {_format_float(baseline_window[0])} to {_format_float(baseline_window[1])} s",
        f"- Effect window: {_format_float(effect_window[0])} to {_format_float(effect_window[1])} s",
        "",
        table_row(label_headers + metric_headers),
        # Label columns use the default alignment; metric columns are right-aligned.
        table_row(["---"] * len(label_headers) + ["---:"] * len(metric_headers)),
    ]

    for entry in metrics.itertuples(index=False):
        cells = [f"{entry.decoder}"]
        if with_mode:
            cells.append(f"{entry.emission_mode}")
        cells.append(f"{entry.n_subjects}")
        cells.extend(f"{_format_float(getattr(entry, field))}" for field in float_fields)
        report.append(table_row(cells))

    report.append("")
    return "\n".join(report)

summarize_calibration_metrics(summary, *, baseline_window=(-0.1, 0.0), effect_window=(0.1, 0.8))

Summarize accuracy and calibration metrics over benchmark time windows.

Source code in src/neureptrace/calibration.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
def summarize_calibration_metrics(
    summary: pd.DataFrame,
    *,
    baseline_window: tuple[float, float] = (-0.1, 0.0),
    effect_window: tuple[float, float] = (0.1, 0.8),
) -> pd.DataFrame:
    """Summarize accuracy and calibration metrics over benchmark time windows.

    The input is validated with ``_validate_calibration_summary`` and split
    by the columns reported by ``_present_group_columns``. When no grouping
    column is present the whole table is treated as one group labelled with
    ``decoder == "overall"``.

    Args:
        summary: Per-time-point metrics table. This function reads the
            columns ``time``, ``n_subjects``, ``accuracy_mean``,
            ``log_loss_mean``, ``brier_mean`` and ``ece_mean``.
        baseline_window: ``(start, stop)`` bounds handed to ``_window_mean``
            for the baseline accuracy.
        effect_window: ``(start, stop)`` bounds handed to ``_window_mean``
            for the effect metrics. The best-ECE search uses the rows whose
            ``time`` lies within these bounds, both ends inclusive.

    Returns:
        One row per group with the group labels, ``n_subjects`` (the
        maximum found in the group), the windowed means
        ``baseline_accuracy_mean``, ``effect_accuracy_mean``,
        ``effect_log_loss_mean``, ``effect_brier_mean`` and
        ``effect_ece_mean``, and the time and metrics of the effect-window
        row with the lowest ``ece_mean`` (``best_ece_time``, ``best_ece``,
        ``accuracy_at_best_ece``, ``brier_at_best_ece``,
        ``log_loss_at_best_ece``). Rows are sorted ascending by effect ECE,
        then effect Brier score, then effect log loss.

    Raises:
        ValueError: If a group has no time points inside ``effect_window``.
    """
    summary = _validate_calibration_summary(summary)

    group_columns = _present_group_columns(summary)
    group_items = summary.groupby(group_columns, sort=True) if group_columns else [("overall", summary)]
    rows = []
    for keys, frame in group_items:
        # Depending on the pandas version, a single grouping column yields a
        # scalar key or a one-element tuple; normalize to a tuple.
        key_values = keys if isinstance(keys, tuple) else (keys,)
        group_values = dict(zip(group_columns, key_values, strict=True)) if group_columns else {"decoder": "overall"}
        in_effect_window = (frame["time"] >= effect_window[0]) & (frame["time"] <= effect_window[1])
        # Renumber the slice so the label lookup below always resolves to
        # exactly one row. With duplicate index labels in the caller's
        # frame, ``.loc`` would return a DataFrame and the float()
        # conversions further down would fail.
        effect = frame[in_effect_window].reset_index(drop=True)
        if effect.empty:
            raise ValueError(f"No time points found in effect window [{effect_window[0]}, {effect_window[1]}].")
        best_ece = effect.loc[effect["ece_mean"].idxmin()]
        rows.append(
            {
                **group_values,
                "n_subjects": int(frame["n_subjects"].max()),
                "baseline_accuracy_mean": _window_mean(frame, "accuracy_mean", *baseline_window),
                "effect_accuracy_mean": _window_mean(frame, "accuracy_mean", *effect_window),
                "effect_log_loss_mean": _window_mean(frame, "log_loss_mean", *effect_window),
                "effect_brier_mean": _window_mean(frame, "brier_mean", *effect_window),
                "effect_ece_mean": _window_mean(frame, "ece_mean", *effect_window),
                "best_ece_time": float(best_ece["time"]),
                "best_ece": float(best_ece["ece_mean"]),
                "accuracy_at_best_ece": float(best_ece["accuracy_mean"]),
                "brier_at_best_ece": float(best_ece["brier_mean"]),
                "log_loss_at_best_ece": float(best_ece["log_loss_mean"]),
            }
        )

    # Best-calibrated groups first; Brier score and log loss break ties.
    return pd.DataFrame(rows).sort_values(["effect_ece_mean", "effect_brier_mean", "effect_log_loss_mean"]).reset_index(drop=True)