Skip to content

Metrics

neureptrace.metrics

brier_score_multiclass(probabilities, labels)

Compute multiclass Brier score using one-hot targets.

Source code in src/neureptrace/metrics/__init__.py
179
180
181
182
183
184
185
186
def brier_score_multiclass(probabilities: np.ndarray, labels: np.ndarray) -> float:
    """Return the mean multiclass Brier score against one-hot targets.

    Inputs are checked and coerced by ``validate_probability_inputs``. For
    each sample the squared differences between the predicted probabilities
    and the one-hot encoding of its label are summed over classes; the
    result is the plain average of those per-sample sums.
    """
    probabilities, labels = validate_probability_inputs(probabilities, labels)
    assert labels is not None

    # Build the one-hot target matrix with a single fancy-index assignment.
    sample_rows = np.arange(labels.shape[0])
    one_hot = np.zeros_like(probabilities, dtype=float)
    one_hot[sample_rows, labels] = 1.0

    per_sample_loss = np.square(probabilities - one_hot).sum(axis=1)
    return float(per_sample_loss.mean())

compare_prepost_windows(frame, metric_column, pre_window, post_window, time_column='time', group_columns=())

Compare a metric between inclusive pre and post time windows.

Source code in src/neureptrace/metrics/prepost.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def compare_prepost_windows(
    frame: pd.DataFrame,
    metric_column: str,
    pre_window: Window,
    post_window: Window,
    time_column: str = "time",
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Contrast a metric between an inclusive pre window and post window.

    Each window is summarized with ``summarize_window_metric``; the summary
    columns are then tagged with ``pre``/``post`` so both can live in one
    table. With grouping columns the two summaries are outer-merged on those
    columns; without them they are placed side by side. A
    ``<metric_column>_post_minus_pre`` column holds the difference of the
    two means.
    """
    group_columns = _normalize_columns(group_columns)

    def _renamed_summary(window, phase):
        # Summarize one window and tag its columns with the phase name.
        summary = summarize_window_metric(
            frame,
            metric_column,
            window,
            time_column=time_column,
            group_columns=group_columns,
        )
        renames = {
            "window_start": f"{phase}_window_start",
            "window_stop": f"{phase}_window_stop",
            "n_rows": f"n_{phase}_rows",
        }
        for statistic in ("mean", "std", "sem"):
            renames[f"{metric_column}_{statistic}"] = f"{metric_column}_{phase}_{statistic}"
        return summary.rename(columns=renames)

    pre_summary = _renamed_summary(pre_window, "pre")
    post_summary = _renamed_summary(post_window, "post")

    if not group_columns:
        # No keys to join on: align the summaries positionally.
        side_by_side = [pre_summary.reset_index(drop=True), post_summary.reset_index(drop=True)]
        merged = pd.concat(side_by_side, axis=1)
    else:
        merged = pre_summary.merge(post_summary, on=group_columns, how="outer")

    post_mean = merged[f"{metric_column}_post_mean"]
    pre_mean = merged[f"{metric_column}_pre_mean"]
    merged[f"{metric_column}_post_minus_pre"] = post_mean - pre_mean
    return _sorted_frame(merged.to_dict("records"), group_columns)

confusion_category_enrichment(frame, *, metadata_frame, true_column='true_label', predicted_column='predicted_label', category_columns=None, group_columns=(), participant_column=None, metadata_label_columns=DEFAULT_METADATA_LABEL_COLUMNS, n_permutations=10000, seed=0)

Test whether off-diagonal errors stay within label metadata categories.

Source code in src/neureptrace/metrics/confusion.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def confusion_category_enrichment(
    frame: pd.DataFrame,
    *,
    metadata_frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    category_columns: Sequence[str] | str | None = None,
    group_columns: Sequence[str] = (),
    participant_column: str | None = None,
    metadata_label_columns: Sequence[str] = DEFAULT_METADATA_LABEL_COLUMNS,
    n_permutations: int | None = 10_000,
    seed: int | None = 0,
) -> pd.DataFrame:
    """Test whether off-diagonal errors stay within label metadata categories.

    The prediction table is prepared by ``_prediction_frame`` and then
    processed one group at a time; each group's rows come from
    ``_summarize_category_enrichment_for_group``, which receives
    ``n_permutations`` and ``seed`` unchanged. An empty frame is returned
    when either the label-metadata lookup or the resolved category columns
    are empty.
    """
    group_columns = _normalize_columns(group_columns)
    working = _prediction_frame(
        frame,
        true_column=true_column,
        predicted_column=predicted_column,
        group_columns=group_columns,
        participant_column=participant_column,
    )
    metadata_by_label = _metadata_by_label(metadata_frame, metadata_label_columns)
    category_columns = _normalize_category_columns(metadata_frame, category_columns, metadata_label_columns)
    if not (metadata_by_label and category_columns):
        # Nothing to categorize against.
        return pd.DataFrame()

    enrichment_rows: list[dict[str, object]] = [
        row
        for group_key, group_frame in _iter_frame_groups(working, group_columns)
        for row in _summarize_category_enrichment_for_group(
            group_frame,
            _group_row(group_columns, group_key),
            metadata_by_label,
            category_columns,
            n_permutations=n_permutations,
            seed=seed,
        )
    ]
    return pd.DataFrame(enrichment_rows).reset_index(drop=True)

confusion_category_matrix(frame, *, metadata_frame, true_column='true_label', predicted_column='predicted_label', category_columns=None, group_columns=(), participant_column=None, metadata_label_columns=DEFAULT_METADATA_LABEL_COLUMNS)

Summarize directional category-to-category error counts and lifts.

Source code in src/neureptrace/metrics/confusion.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
def confusion_category_matrix(
    frame: pd.DataFrame,
    *,
    metadata_frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    category_columns: Sequence[str] | str | None = None,
    group_columns: Sequence[str] = (),
    participant_column: str | None = None,
    metadata_label_columns: Sequence[str] = DEFAULT_METADATA_LABEL_COLUMNS,
) -> pd.DataFrame:
    """Summarize directional category-to-category error counts and lifts.

    Per-group rows are produced by ``_summarize_category_matrix_for_group``
    and returned ordered by descending finite ``category_confusion_lift``
    (non-finite lifts last), then descending ``count``, then the category
    column name and the true/predicted category as strings. An empty frame
    is returned when either the label-metadata lookup or the resolved
    category columns are empty.
    """
    group_columns = _normalize_columns(group_columns)
    working = _prediction_frame(
        frame,
        true_column=true_column,
        predicted_column=predicted_column,
        group_columns=group_columns,
        participant_column=participant_column,
    )
    metadata_by_label = _metadata_by_label(metadata_frame, metadata_label_columns)
    category_columns = _normalize_category_columns(metadata_frame, category_columns, metadata_label_columns)
    if not (metadata_by_label and category_columns):
        return pd.DataFrame()

    matrix_rows: list[dict[str, object]] = []
    for group_key, group_frame in _iter_frame_groups(working, group_columns):
        group_rows = _summarize_category_matrix_for_group(
            group_frame,
            _group_row(group_columns, group_key),
            metadata_by_label,
            category_columns,
        )
        matrix_rows.extend(group_rows)

    def _ordering(row):
        # Negate so larger lifts/counts sort first; non-finite lifts go last.
        lift = float(row["category_confusion_lift"])
        lift_rank = -lift if np.isfinite(lift) else np.inf
        return (
            lift_rank,
            -int(row["count"]),
            str(row["category_column"]),
            str(row["true_category"]),
            str(row["predicted_category"]),
        )

    matrix_rows.sort(key=_ordering)
    return pd.DataFrame(matrix_rows).reset_index(drop=True)

confusion_counts(frame, true_column='true_label', predicted_column='predicted_label', group_columns=())

Count true/predicted label pairs in a trial-level prediction table.

Source code in src/neureptrace/metrics/confusion.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
def confusion_counts(
    frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Tally how often each (true, predicted) label pair occurs.

    The label columns are renamed to ``true_label`` / ``predicted_label``
    and rows are counted per combination of the grouping columns and both
    labels. Missing values are kept as their own groups (``dropna=False``),
    and the tally is returned in a ``count`` column.
    """
    group_columns = _normalize_columns(group_columns)
    _require_columns(frame, [true_column, predicted_column, *group_columns])

    selected_columns = [*group_columns, true_column, predicted_column]
    canonical_names = {true_column: "true_label", predicted_column: "predicted_label"}
    working = frame[selected_columns].rename(columns=canonical_names)

    pair_keys = [*group_columns, "true_label", "predicted_label"]
    pair_sizes = working.groupby(pair_keys, dropna=False, sort=True).size()
    counts = pair_sizes.reset_index(name="count")
    return counts.reset_index(drop=True)

confusion_pair_summary(frame, true_column='true_label', predicted_column='predicted_label', group_columns=(), participant_column=None, metadata_frame=None, metadata_label_columns=DEFAULT_METADATA_LABEL_COLUMNS, label_prefix='label')

Summarize off-diagonal errors as unordered, bidirectional label pairs.

Expected counts preserve the true-label and predicted-label error marginals. Metadata columns, when supplied, are copied for both labels and get an additional same_<metadata_column> flag when both sides are known.

Source code in src/neureptrace/metrics/confusion.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
def confusion_pair_summary(
    frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    group_columns: Sequence[str] = (),
    participant_column: str | None = None,
    metadata_frame: pd.DataFrame | None = None,
    metadata_label_columns: Sequence[str] = DEFAULT_METADATA_LABEL_COLUMNS,
    label_prefix: str = "label",
) -> pd.DataFrame:
    """Summarize off-diagonal errors as unordered, bidirectional label pairs.

    Expected counts preserve the true-label and predicted-label error
    marginals. Metadata columns, when supplied, are copied for both labels
    and get an additional ``same_<metadata_column>`` flag when both sides
    are known.

    Rows are returned ordered by descending ``total_confusions``, then by
    descending finite ``mean_directional_rate`` (non-finite rates last),
    then by the ``<label_prefix>_a`` and ``<label_prefix>_b`` labels.
    """
    group_columns = _normalize_columns(group_columns)
    working = _prediction_frame(
        frame,
        true_column=true_column,
        predicted_column=predicted_column,
        group_columns=group_columns,
        participant_column=participant_column,
    )
    metadata_by_label = _metadata_by_label(metadata_frame, metadata_label_columns)

    pair_rows: list[dict[str, object]] = [
        row
        for group_key, group_frame in _iter_frame_groups(working, group_columns)
        for row in _summarize_confusion_pairs_for_group(
            group_frame,
            _group_row(group_columns, group_key),
            metadata_by_label,
            metadata_label_columns,
            label_prefix,
        )
    ]

    first_label_key = f"{label_prefix}_a"
    second_label_key = f"{label_prefix}_b"

    def _ordering(row):
        # Negate so the most confused / highest-rate pairs come first.
        rate = float(row["mean_directional_rate"])
        rate_rank = -rate if np.isfinite(rate) else np.inf
        return (
            -int(row["total_confusions"]),
            rate_rank,
            _label_sort_key(row[first_label_key]),
            _label_sort_key(row[second_label_key]),
        )

    pair_rows.sort(key=_ordering)
    return pd.DataFrame(pair_rows).reset_index(drop=True)

expected_calibration_error(probabilities, labels, *, n_bins=10)

Compute top-label expected calibration error.

Parameters:

Name Type Description Default
probabilities ndarray

Array of shape (n_samples, n_classes) with predicted class probabilities.

required
labels ndarray

Integer class labels of shape (n_samples,).

required
n_bins int

Number of equally spaced confidence bins.

10
Source code in src/neureptrace/metrics/__init__.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
def expected_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    *,
    n_bins: int = 10,
) -> float:
    """Compute the top-label expected calibration error (ECE).

    Parameters
    ----------
    probabilities:
        Array of shape ``(n_samples, n_classes)`` with predicted class probabilities.
    labels:
        Integer class labels of shape ``(n_samples,)``.
    n_bins:
        Number of equally spaced confidence bins on ``[0, 1]``.

    Returns
    -------
    float
        Sum over non-empty bins of the fraction of samples in the bin times
        the absolute gap between the bin's accuracy and mean confidence.

    Raises
    ------
    ValueError
        If ``n_bins`` is smaller than one.
    """
    probabilities, labels = validate_probability_inputs(probabilities, labels)
    assert labels is not None
    if n_bins < 1:
        raise ValueError("n_bins must be positive")

    confidences = probabilities.max(axis=1)
    is_correct = probabilities.argmax(axis=1) == labels

    edges = np.linspace(0.0, 1.0, n_bins + 1)
    total = 0.0
    for lower, upper in zip(edges[:-1], edges[1:]):
        # Bins are half-open except the last, which also includes 1.0.
        under_upper = (confidences <= upper) if upper == 1.0 else (confidences < upper)
        members = (confidences >= lower) & under_upper
        if not members.any():
            continue
        occupancy = members.mean()
        bin_accuracy = is_correct[members].mean()
        bin_confidence = confidences[members].mean()
        total += occupancy * abs(bin_accuracy - bin_confidence)
    return float(total)

negative_log_likelihood(probabilities, labels, *, eps=1e-15)

Compute mean categorical negative log-likelihood from probabilities.

Source code in src/neureptrace/metrics/__init__.py
189
190
191
192
193
194
195
196
197
198
def negative_log_likelihood(probabilities: np.ndarray, labels: np.ndarray, *, eps: float = 1e-15) -> float:
    """Return the mean categorical negative log-likelihood.

    The probability assigned to each sample's true label is clipped to
    ``[eps, 1.0]`` before taking the logarithm, so a zero probability
    yields a large finite loss rather than infinity.

    Raises
    ------
    ValueError
        If ``eps`` is not a positive finite number.
    """
    probabilities, labels = validate_probability_inputs(probabilities, labels)
    assert labels is not None
    eps = float(eps)
    if not (np.isfinite(eps) and eps > 0.0):
        raise ValueError("eps must be a positive finite value")

    sample_rows = np.arange(labels.shape[0])
    assigned = probabilities[sample_rows, labels]
    log_likelihood = np.log(np.clip(assigned, eps, 1.0))
    return float(-np.mean(log_likelihood))

per_class_accuracy(frame, true_column='true_label', predicted_column='predicted_label', participant_column=None, group_columns=())

Summarize one-vs-rest recall/accuracy for each true class.

Source code in src/neureptrace/metrics/confusion.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def per_class_accuracy(
    frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    participant_column: str | None = None,
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Summarize one-vs-rest recall/accuracy for each true class.

    One output row is produced per combination of the grouping columns and
    the true label. It reports ``n_trials``, ``n_correct`` and ``accuracy``
    (the fraction of trials whose prediction equals the true label). When
    ``participant_column`` is given, ``n_participants`` counts the distinct
    non-missing participants in that group.
    """
    group_columns = _normalize_columns(group_columns)
    track_participants = participant_column is not None

    needed_columns = [true_column, predicted_column, *group_columns]
    if track_participants:
        needed_columns.append(participant_column)
    _require_columns(frame, needed_columns)

    selected_columns = [*group_columns, true_column, predicted_column]
    if track_participants:
        selected_columns.append(participant_column)
    canonical_names = {true_column: "true_label", predicted_column: "predicted_label"}
    working = frame[selected_columns].rename(columns=canonical_names)
    working["_correct"] = working["true_label"] == working["predicted_label"]

    keys = [*group_columns, "true_label"]
    rows: list[dict[str, object]] = []
    for group_key, group in working.groupby(keys, dropna=False, sort=True):
        hits = group["_correct"]
        row = _group_row(keys, group_key)
        row["n_trials"] = int(len(group))
        row["n_correct"] = int(hits.sum())
        row["accuracy"] = float(hits.mean())
        if track_participants:
            row["n_participants"] = int(group[participant_column].nunique(dropna=True))
        rows.append(row)

    return pd.DataFrame(rows).reset_index(drop=True)

rank_class_scores(scores, classes, y_true, *, top_k=(2, 3), row_top_k=3, class_column='class')

Rank true labels in a per-class score matrix and compute top-k metrics.

Missing true labels are counted as top-k failures but are excluded from the finite mean/median rank. If no class-score columns are available, top-k and rank summaries are undefined and returned as NaN.

Source code in src/neureptrace/metrics/ranking.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def rank_class_scores(
    scores: Sequence[Sequence[float]] | np.ndarray | None,
    classes: Sequence | np.ndarray | None,
    y_true: Sequence | np.ndarray,
    *,
    top_k: Sequence[int] = (2, 3),
    row_top_k: int = 3,
    class_column: str = "class",
) -> dict[str, object]:
    """Rank true labels in a per-class score matrix and compute top-k metrics.

    Missing true labels are counted as top-k failures but are excluded from the
    finite mean/median rank. If no class-score columns are available, top-k and
    rank summaries are undefined and returned as ``NaN``.

    Parameters
    ----------
    scores:
        Score matrix with one row per sample and one column per class; higher
        scores rank first. ``None`` short-circuits to the empty result.
    classes:
        Class label for each score column; labels must be unique. ``None``
        short-circuits to the empty result.
    y_true:
        True label per sample; flattened to one dimension.
    top_k:
        Cutoffs for which top-k accuracy is reported; each must be at least 1.
    row_top_k:
        Number of best-ranked classes recorded in each per-sample row; must be
        non-negative.
    class_column:
        Name used in the per-sample keys ``rank<i>_<class_column>``; must be
        non-empty.

    Returns
    -------
    dict
        On the main path, a mapping with ``top_k_accuracy`` (``k`` to hit
        fraction), ``true_label_ranks`` (float array, 1-based, ``NaN`` where
        the true label is not among ``classes``), ``mean_true_label_rank``,
        ``median_true_label_rank`` and ``rows`` (one dict per sample). When
        scores/classes are missing or there are zero class columns, the result
        is whatever ``_empty_class_rank_result`` returns.

    Raises
    ------
    ValueError
        If an option is out of range, ``scores`` is not two-dimensional, its
        shape disagrees with ``y_true`` or ``classes``, it contains non-finite
        values, or ``classes`` contains a duplicate label.
    """

    # Normalize the cheap arguments first so option errors surface even when
    # scores/classes are missing.
    y_true = np.asarray(y_true).ravel()
    top_k = tuple(int(k) for k in top_k)
    row_top_k = int(row_top_k)
    if any(k < 1 for k in top_k):
        raise ValueError("top_k values must be positive.")
    if row_top_k < 0:
        raise ValueError("row_top_k must be non-negative.")
    if not class_column:
        raise ValueError("class_column must be non-empty.")

    if scores is None or classes is None:
        return _empty_class_rank_result(y_true, top_k)

    score_matrix = np.asarray(scores, dtype=float)
    class_order = np.asarray(classes).ravel()
    if score_matrix.ndim != 2:
        raise ValueError("scores must be a two-dimensional matrix.")
    if score_matrix.shape[0] != y_true.shape[0]:
        raise ValueError("scores and y_true must contain the same samples.")
    if score_matrix.shape[1] != class_order.size:
        raise ValueError("scores columns must match classes.")
    if not np.all(np.isfinite(score_matrix)):
        raise ValueError("scores must contain only finite values.")
    duplicate_class = _find_duplicate_class_label(class_order)
    if duplicate_class is not None:
        raise ValueError(f"classes must be unique; duplicate label {duplicate_class!r} found.")
    if score_matrix.shape[1] == 0:
        # A matrix with no class columns cannot rank anything.
        return _empty_class_rank_result(y_true, top_k)

    # Descending order per row; the stable sort keeps tied scores in the
    # original column (class) order.
    order = np.argsort(-score_matrix, axis=1, kind="mergesort")
    top_hits = {k: [] for k in top_k}
    ranks: list[float] = []
    rows: list[dict[str, object]] = []
    for sample_index, truth in enumerate(y_true):
        ranked = class_order[order[sample_index]]
        for k in top_k:
            top_hits[k].append(bool(truth in ranked[:k]))
        # 1-based position of the true label, or NaN when it is not a known class.
        match = np.flatnonzero(ranked == truth)
        rank = float(match[0] + 1) if match.size else np.nan
        ranks.append(rank)
        row: dict[str, object] = {"true_label_rank": rank, "true_label_score": np.nan}
        true_index = np.flatnonzero(class_order == truth)
        if true_index.size:
            row["true_label_score"] = float(score_matrix[sample_index, true_index[0]])
        # Record the best `row_top_k` classes and their scores for this sample.
        for position, class_index in enumerate(order[sample_index, :row_top_k], start=1):
            row[f"rank{position}_{class_column}"] = _as_python_scalar(class_order[class_index])
            row[f"rank{position}_score"] = float(score_matrix[sample_index, class_index])
        rows.append(row)

    true_label_ranks = np.asarray(ranks, dtype=float)
    return {
        "top_k_accuracy": {k: float(np.mean(top_hits[k])) for k in top_k},
        "true_label_ranks": true_label_ranks,
        "mean_true_label_rank": _finite_nanmean(true_label_ranks),
        "median_true_label_rank": _finite_nanmedian(true_label_ranks),
        "rows": rows,
    }

reliability_bins(probabilities, labels, *, n_bins=10)

Summarize top-label reliability bins for calibration plots.

Source code in src/neureptrace/metrics/__init__.py
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
def reliability_bins(
    probabilities: np.ndarray,
    labels: np.ndarray,
    *,
    n_bins: int = 10,
) -> list[dict[str, float | int]]:
    """Describe top-label reliability per confidence bin for calibration plots.

    The confidence range ``[0, 1]`` is split into ``n_bins`` equal-width
    bins. One row is returned per bin, including empty bins, with its index,
    edges, sample count, accuracy, mean confidence and
    ``gap = accuracy - confidence``. Empty bins report ``NaN`` for the last
    three fields.

    Raises
    ------
    ValueError
        If ``n_bins`` is smaller than one.
    """
    probabilities, labels = validate_probability_inputs(probabilities, labels)
    assert labels is not None
    if n_bins < 1:
        raise ValueError("n_bins must be positive")

    confidences = probabilities.max(axis=1)
    is_correct = probabilities.argmax(axis=1) == labels

    edges = np.linspace(0.0, 1.0, n_bins + 1)
    summaries: list[dict[str, float | int]] = []
    for bin_index, (lower, upper) in enumerate(zip(edges[:-1], edges[1:])):
        # Bins are half-open except the last, which also includes 1.0.
        under_upper = (confidences <= upper) if upper == 1.0 else (confidences < upper)
        members = (confidences >= lower) & under_upper
        n_samples = int(members.sum())
        if n_samples == 0:
            accuracy = confidence = gap = float("nan")
        else:
            accuracy = float(is_correct[members].mean())
            confidence = float(confidences[members].mean())
            gap = accuracy - confidence
        summaries.append(
            {
                "bin": bin_index,
                "bin_left": float(lower),
                "bin_right": float(upper),
                "n_samples": n_samples,
                "accuracy": accuracy,
                "confidence": confidence,
                "gap": gap,
            }
        )
    return summaries

summarize_window_metric(frame, metric_column, window, time_column='time', group_columns=())

Summarize one metric inside an inclusive time window.

Source code in src/neureptrace/metrics/prepost.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def summarize_window_metric(
    frame: pd.DataFrame,
    metric_column: str,
    window: Window,
    time_column: str = "time",
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Summarize one metric over the rows inside an inclusive time window.

    Rows whose ``time_column`` lies in ``[window_start, window_stop]`` are
    kept and summarized per group. Metric values are coerced to numeric
    (unparseable entries become missing); ``n_rows`` counts the non-missing
    values, and the mean, standard deviation and standard error are
    reported alongside the window bounds.

    Raises
    ------
    ValueError
        If no row falls inside the window.
    """
    group_columns = _normalize_columns(group_columns)
    _require_columns(frame, [time_column, metric_column, *group_columns])
    window_start, window_stop = _validate_window(window)

    times = frame[time_column]
    inside_window = (times >= window_start) & (times <= window_stop)
    window_frame = frame.loc[inside_window]
    if window_frame.empty:
        raise ValueError(f"No rows fall inside window [{window_start}, {window_stop}].")

    summaries: list[dict[str, object]] = []
    for group_key, group in _iter_groups(window_frame, group_columns):
        summary = _group_row(group_columns, group_key)
        values = pd.to_numeric(group[metric_column], errors="coerce")
        summary["window_start"] = window_start
        summary["window_stop"] = window_stop
        summary["n_rows"] = int(values.notna().sum())
        summary[f"{metric_column}_mean"] = _float_or_nan(values.mean())
        summary[f"{metric_column}_std"] = _float_or_nan(values.std())
        summary[f"{metric_column}_sem"] = _float_or_nan(values.sem())
        summaries.append(summary)

    return _sorted_frame(summaries, group_columns)

top_k_accuracy(probabilities, labels, *, k=1)

Compute top-k classification accuracy from probability rows.

Source code in src/neureptrace/metrics/__init__.py
201
202
203
204
205
206
207
208
209
210
211
212
def top_k_accuracy(probabilities: np.ndarray, labels: np.ndarray, *, k: int = 1) -> float:
    """Return the fraction of samples whose label is among the ``k`` top-scored classes.

    When ``k`` is at least the number of classes every label is trivially
    covered, so ``1.0`` is returned without ranking.

    Raises
    ------
    ValueError
        If ``k`` is smaller than one.
    """
    probabilities, labels = validate_probability_inputs(probabilities, labels)
    assert labels is not None
    k = int(k)
    if k < 1:
        raise ValueError("k must be positive")

    n_classes = probabilities.shape[1]
    if k >= n_classes:
        return 1.0

    # Partition so the k largest probabilities occupy the last k columns.
    partitioned = np.argpartition(probabilities, kth=n_classes - k, axis=1)
    candidates = partitioned[:, n_classes - k :]
    hits = (candidates == labels.reshape(-1, 1)).any(axis=1)
    return float(hits.mean())

validate_probability_inputs(probabilities, labels=None, *, require_normalized=True, normalization_atol=1e-06)

Validate and coerce probability-matrix inputs used by scoring metrics.

Parameters:

Name Type Description Default
probabilities ndarray

Array-like object with shape (n_samples, n_classes).

required
labels ndarray | None

Optional integer class labels of shape (n_samples,).

None
require_normalized bool

If true, each probability row must sum to one within normalization_atol.

True
normalization_atol float

Absolute tolerance for row-sum checks.

1e-06
Source code in src/neureptrace/metrics/__init__.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def validate_probability_inputs(
    probabilities: np.ndarray,
    labels: np.ndarray | None = None,
    *,
    require_normalized: bool = True,
    normalization_atol: float = 1e-6,
) -> tuple[np.ndarray, np.ndarray | None]:
    """Validate and coerce probability-matrix inputs used by scoring metrics.

    Parameters
    ----------
    probabilities:
        Array-like object with shape ``(n_samples, n_classes)``.
    labels:
        Optional integer class labels of shape ``(n_samples,)``.
    require_normalized:
        If true, each probability row must sum to one within
        ``normalization_atol``.
    normalization_atol:
        Absolute tolerance for row-sum checks.
    """
    probabilities = np.asarray(probabilities, dtype=float)
    if probabilities.ndim != 2:
        raise ValueError("probabilities must have shape (n_samples, n_classes)")
    if probabilities.shape[0] == 0 or probabilities.shape[1] == 0:
        raise ValueError("probabilities must contain at least one sample and one class")
    if not np.all(np.isfinite(probabilities)):
        raise ValueError("probabilities must contain only finite values")
    if np.any(probabilities < -normalization_atol):
        raise ValueError("probabilities must be non-negative")

    row_sums = probabilities.sum(axis=1)
    if require_normalized and not np.allclose(row_sums, 1.0, atol=normalization_atol, rtol=0.0):
        raise ValueError("probability rows must sum to one")

    if labels is None:
        return probabilities, None

    labels = np.asarray(labels)
    if labels.ndim != 1:
        raise ValueError("labels must have shape (n_samples,)")
    if probabilities.shape[0] != labels.shape[0]:
        raise ValueError("probabilities and labels must contain the same samples")
    if not np.issubdtype(labels.dtype, np.integer):
        if not np.all(np.equal(labels, np.asarray(labels, dtype=int))):
            raise ValueError("labels must contain integer class indices")
        labels = labels.astype(int)
    if np.any(labels < 0) or np.any(labels >= probabilities.shape[1]):
        raise ValueError("labels must be valid column indices for probabilities")
    return probabilities, labels.astype(int, copy=False)

validate_sample_weight(sample_weight, n_samples)

Return validated non-negative per-sample weights.

Parameters:

Name Type Description Default
sample_weight Iterable[float] | ndarray

One-dimensional non-negative weights.

required
n_samples int

Expected number of samples.

required
Source code in src/neureptrace/metrics/weighted.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def validate_sample_weight(sample_weight: Iterable[float] | np.ndarray, n_samples: int) -> np.ndarray:
    """Return validated non-negative per-sample weights.

    Parameters
    ----------
    sample_weight:
        One-dimensional non-negative weights.
    n_samples:
        Expected number of samples.
    """
    weights = np.asarray(sample_weight, dtype=float)
    if weights.ndim != 1:
        raise ValueError("sample_weight must have shape (n_samples,)")
    if weights.shape[0] != n_samples:
        raise ValueError("sample_weight and probabilities must contain the same samples")
    if not np.all(np.isfinite(weights)):
        raise ValueError("sample_weight must contain only finite values")
    if np.any(weights < 0.0):
        raise ValueError("sample_weight must be non-negative")
    if float(np.sum(weights)) <= 0.0:
        raise ValueError("sample_weight must have positive total weight")
    return weights

weighted_brier_score_multiclass(probabilities, labels, sample_weight)

Compute a weighted multiclass Brier score using one-hot targets.

Source code in src/neureptrace/metrics/weighted.py
72
73
74
75
76
77
78
79
80
81
82
83
84
def weighted_brier_score_multiclass(
    probabilities: np.ndarray,
    labels: np.ndarray,
    sample_weight: Iterable[float] | np.ndarray,
) -> float:
    """Return the sample-weighted multiclass Brier score against one-hot targets.

    Each sample's loss is the sum over classes of the squared difference
    between its predicted probabilities and the one-hot encoding of its
    label. The losses are combined as a weighted average using
    ``sample_weight``, which is checked by ``validate_sample_weight``.
    """
    probabilities, labels = _validate_probability_inputs(probabilities, labels)
    weights = validate_sample_weight(sample_weight, probabilities.shape[0])

    sample_rows = np.arange(labels.shape[0])
    one_hot = np.zeros_like(probabilities, dtype=float)
    one_hot[sample_rows, labels] = 1.0

    per_sample_loss = np.square(probabilities - one_hot).sum(axis=1)
    return float(np.average(per_sample_loss, weights=weights))

weighted_expected_calibration_error(probabilities, labels, sample_weight, *, n_bins=10)

Compute weighted top-label expected calibration error.

Source code in src/neureptrace/metrics/weighted.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
def weighted_expected_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    sample_weight: Iterable[float] | np.ndarray,
    *,
    n_bins: int = 10,
) -> float:
    """Compute the sample-weighted top-label expected calibration error.

    Confidences are split into ``n_bins`` equal-width bins on ``[0, 1]``.
    Each bin contributes its share of the total weight times the absolute
    gap between its weighted accuracy and weighted mean confidence. Bins
    with no samples, or whose samples carry zero total weight, contribute
    nothing.
    """
    probabilities, labels = _validate_probability_inputs(probabilities, labels)
    weights = validate_sample_weight(sample_weight, probabilities.shape[0])
    n_bins = _validate_n_bins(n_bins)

    confidences = probabilities.max(axis=1)
    is_correct = (probabilities.argmax(axis=1) == labels).astype(float)
    total_weight = float(np.sum(weights))

    edges = np.linspace(0.0, 1.0, n_bins + 1)
    total = 0.0
    for lower, upper in zip(edges[:-1], edges[1:]):
        # Bins are half-open except the last, which also includes 1.0.
        under_upper = (confidences <= upper) if upper == 1.0 else (confidences < upper)
        members = (confidences >= lower) & under_upper
        if not members.any():
            continue
        member_weights = weights[members]
        bin_mass = float(member_weights.sum())
        if bin_mass <= 0.0:
            # np.average cannot normalize an all-zero weight vector.
            continue
        bin_accuracy = float(np.average(is_correct[members], weights=member_weights))
        bin_confidence = float(np.average(confidences[members], weights=member_weights))
        total += (bin_mass / total_weight) * abs(bin_accuracy - bin_confidence)
    return float(total)

weighted_negative_log_likelihood(probabilities, labels, sample_weight, *, eps=1e-15)

Compute weighted mean categorical negative log-likelihood.

Source code in src/neureptrace/metrics/weighted.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def weighted_negative_log_likelihood(
    probabilities: np.ndarray,
    labels: np.ndarray,
    sample_weight: Iterable[float] | np.ndarray,
    *,
    eps: float = 1e-15,
) -> float:
    """Compute weighted mean categorical negative log-likelihood.

    The probability assigned to each row's label is clipped to ``[eps, 1.0]``
    before taking the logarithm, and the per-row losses are averaged with
    ``sample_weight`` as weights.

    Raises:
        ValueError: If ``eps`` is not a positive finite number.
    """
    probabilities, labels = _validate_probability_inputs(probabilities, labels)
    weights = validate_sample_weight(sample_weight, probabilities.shape[0])
    eps = float(eps)
    if not (np.isfinite(eps) and eps > 0.0):
        raise ValueError("eps must be a positive finite value")

    row_index = np.arange(labels.shape[0])
    label_probability = probabilities[row_index, labels]
    # Clipping keeps log() finite when the label probability is exactly zero.
    clipped = np.clip(label_probability, eps, 1.0)
    return float(np.average(-np.log(clipped), weights=weights))

weighted_reliability_bins(probabilities, labels, sample_weight, *, n_bins=10)

Summarize weighted top-label reliability bins for calibration plots.

The returned rows keep the unweighted reliability_bins schema and add sample_weight plus sample_weight_fraction so downstream reports can display both raw-bin occupancy and the effective contribution of each bin.

Source code in src/neureptrace/metrics/weighted.py
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
def weighted_reliability_bins(
    probabilities: np.ndarray,
    labels: np.ndarray,
    sample_weight: Iterable[float] | np.ndarray,
    *,
    n_bins: int = 10,
) -> list[dict[str, float | int]]:
    """Summarize weighted top-label reliability bins for calibration plots.

    One row is emitted per equal-width confidence bin over ``[0, 1]``, including
    bins that contain no samples. Bins are half-open on the right, except the
    bin whose right edge is ``1.0``, which also includes that edge. Each row
    reports the raw sample count, the summed sample weight and its fraction of
    the total weight, and the weighted accuracy, weighted mean confidence and
    their difference (``accuracy - confidence``). The last three are NaN when
    the bin is empty or carries no positive weight.
    """
    probabilities, labels = _validate_probability_inputs(probabilities, labels)
    weights = validate_sample_weight(sample_weight, probabilities.shape[0])
    n_bins = _validate_n_bins(n_bins)

    top_confidence = probabilities.max(axis=1)
    is_hit = probabilities.argmax(axis=1) == labels
    weight_total = float(np.sum(weights))
    bin_edges = np.linspace(0.0, 1.0, n_bins + 1)

    summary: list[dict[str, float | int]] = []
    for index, (lower, upper) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
        # Only the bin ending exactly at 1.0 is closed on the right.
        below_upper = (top_confidence <= upper) if upper == 1.0 else (top_confidence < upper)
        members = (top_confidence >= lower) & below_upper
        count = int(np.sum(members))
        weight_in_bin = float(np.sum(weights[members])) if count else 0.0

        if not (count and weight_in_bin > 0.0):
            # No usable weight in this bin: report the statistics as missing.
            accuracy = float("nan")
            confidence = float("nan")
            gap = float("nan")
        else:
            member_weights = weights[members]
            accuracy = float(np.average(is_hit[members].astype(float), weights=member_weights))
            confidence = float(np.average(top_confidence[members], weights=member_weights))
            gap = accuracy - confidence

        summary.append(
            {
                "bin": index,
                "bin_left": float(lower),
                "bin_right": float(upper),
                "n_samples": count,
                "sample_weight": weight_in_bin,
                "sample_weight_fraction": weight_in_bin / weight_total,
                "accuracy": accuracy,
                "confidence": confidence,
                "gap": gap,
            }
        )
    return summary

weighted_top_k_accuracy(probabilities, labels, sample_weight, *, k=1)

Compute weighted top-k classification accuracy.

Source code in src/neureptrace/metrics/weighted.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def weighted_top_k_accuracy(
    probabilities: np.ndarray,
    labels: np.ndarray,
    sample_weight: Iterable[float] | np.ndarray,
    *,
    k: int = 1,
) -> float:
    """Compute weighted top-k classification accuracy.

    A row counts as correct when its label is among the ``k`` columns with the
    highest probability. When ``k`` is at least the number of probability
    columns the result is ``1.0`` without inspecting the rows.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    probabilities, labels = _validate_probability_inputs(probabilities, labels)
    weights = validate_sample_weight(sample_weight, probabilities.shape[0])
    k = int(k)
    if k < 1:
        raise ValueError("k must be positive")

    n_classes = probabilities.shape[1]
    if k >= n_classes:
        return 1.0

    # Partitioning around position n_classes - k leaves the k largest entries
    # of every row in the trailing k columns (in no particular order).
    candidates = np.argpartition(probabilities, kth=n_classes - k, axis=1)[:, -k:]
    hits = (candidates == labels[:, None]).any(axis=1).astype(float)
    return float(np.average(hits, weights=weights))

Pre/Post Windows

neureptrace.metrics.prepost

compare_prepost_windows(frame, metric_column, pre_window, post_window, time_column='time', group_columns=())

Compare a metric between inclusive pre and post time windows.

Source code in src/neureptrace/metrics/prepost.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def compare_prepost_windows(
    frame: pd.DataFrame,
    metric_column: str,
    pre_window: Window,
    post_window: Window,
    time_column: str = "time",
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Compare a metric between inclusive pre and post time windows.

    Each window is summarized with ``summarize_window_metric`` and its summary
    columns are renamed with ``pre``/``post`` markers. With group columns the
    two summaries are outer-merged on those columns; without them they are
    placed side by side. A ``<metric_column>_post_minus_pre`` column holds the
    post-window mean minus the pre-window mean.
    """
    group_columns = _normalize_columns(group_columns)

    pre_names = {
        "window_start": "pre_window_start",
        "window_stop": "pre_window_stop",
        "n_rows": "n_pre_rows",
        f"{metric_column}_mean": f"{metric_column}_pre_mean",
        f"{metric_column}_std": f"{metric_column}_pre_std",
        f"{metric_column}_sem": f"{metric_column}_pre_sem",
    }
    post_names = {
        "window_start": "post_window_start",
        "window_stop": "post_window_stop",
        "n_rows": "n_post_rows",
        f"{metric_column}_mean": f"{metric_column}_post_mean",
        f"{metric_column}_std": f"{metric_column}_post_std",
        f"{metric_column}_sem": f"{metric_column}_post_sem",
    }

    before = summarize_window_metric(
        frame, metric_column, pre_window, time_column=time_column, group_columns=group_columns
    )
    after = summarize_window_metric(
        frame, metric_column, post_window, time_column=time_column, group_columns=group_columns
    )
    before = before.rename(columns=pre_names)
    after = after.rename(columns=post_names)

    if not group_columns:
        # Ungrouped summaries have no key to join on, so align them by position.
        combined = pd.concat([before.reset_index(drop=True), after.reset_index(drop=True)], axis=1)
    else:
        combined = before.merge(after, on=group_columns, how="outer")

    post_mean = combined[f"{metric_column}_post_mean"]
    pre_mean = combined[f"{metric_column}_pre_mean"]
    combined[f"{metric_column}_post_minus_pre"] = post_mean - pre_mean
    return _sorted_frame(combined.to_dict("records"), group_columns)

summarize_window_metric(frame, metric_column, window, time_column='time', group_columns=())

Summarize one metric inside an inclusive time window.

Source code in src/neureptrace/metrics/prepost.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def summarize_window_metric(
    frame: pd.DataFrame,
    metric_column: str,
    window: Window,
    time_column: str = "time",
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Summarize one metric inside an inclusive time window.

    Rows whose ``time_column`` value lies between the window start and stop
    (both ends included) are selected. For every group, the metric is coerced
    to numeric (unparseable values become NaN) and summarized as the count of
    non-missing values, the mean, the standard deviation and the standard
    error of the mean.

    Raises:
        ValueError: If no row of ``frame`` falls inside the window.
    """
    group_columns = _normalize_columns(group_columns)
    _require_columns(frame, [time_column, metric_column, *group_columns])
    window_start, window_stop = _validate_window(window)

    times = frame[time_column]
    inside = (times >= window_start) & (times <= window_stop)
    selected = frame.loc[inside]
    if selected.empty:
        raise ValueError(f"No rows fall inside window [{window_start}, {window_stop}].")

    summaries: list[dict[str, object]] = []
    for group_key, group in _iter_groups(selected, group_columns):
        summary = _group_row(group_columns, group_key)
        numeric = pd.to_numeric(group[metric_column], errors="coerce")
        summary["window_start"] = window_start
        summary["window_stop"] = window_stop
        # n_rows counts usable metric values, not the raw rows in the window.
        summary["n_rows"] = int(numeric.notna().sum())
        summary[f"{metric_column}_mean"] = _float_or_nan(numeric.mean())
        summary[f"{metric_column}_std"] = _float_or_nan(numeric.std())
        summary[f"{metric_column}_sem"] = _float_or_nan(numeric.sem())
        summaries.append(summary)

    return _sorted_frame(summaries, group_columns)

Confusion Tables

neureptrace.metrics.confusion

confusion_category_enrichment(frame, *, metadata_frame, true_column='true_label', predicted_column='predicted_label', category_columns=None, group_columns=(), participant_column=None, metadata_label_columns=DEFAULT_METADATA_LABEL_COLUMNS, n_permutations=10000, seed=0)

Test whether off-diagonal errors stay within label metadata categories.

Source code in src/neureptrace/metrics/confusion.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def confusion_category_enrichment(
    frame: pd.DataFrame,
    *,
    metadata_frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    category_columns: Sequence[str] | str | None = None,
    group_columns: Sequence[str] = (),
    participant_column: str | None = None,
    metadata_label_columns: Sequence[str] = DEFAULT_METADATA_LABEL_COLUMNS,
    n_permutations: int | None = 10_000,
    seed: int | None = 0,
) -> pd.DataFrame:
    """Test whether off-diagonal errors stay within label metadata categories.

    The prediction table is processed one group at a time and the per-group
    enrichment rows are stacked into a single frame. An empty frame is returned
    when no label metadata or no category columns are available.
    ``n_permutations`` and ``seed`` are forwarded unchanged to the per-group
    summary.
    """
    group_columns = _normalize_columns(group_columns)
    working = _prediction_frame(
        frame,
        true_column=true_column,
        predicted_column=predicted_column,
        group_columns=group_columns,
        participant_column=participant_column,
    )
    metadata_by_label = _metadata_by_label(metadata_frame, metadata_label_columns)
    category_columns = _normalize_category_columns(metadata_frame, category_columns, metadata_label_columns)
    if not (metadata_by_label and category_columns):
        # Nothing to test against without both a label lookup and categories.
        return pd.DataFrame()

    enrichment_rows: list[dict[str, object]] = [
        row
        for key, subset in _iter_frame_groups(working, group_columns)
        for row in _summarize_category_enrichment_for_group(
            subset,
            _group_row(group_columns, key),
            metadata_by_label,
            category_columns,
            n_permutations=n_permutations,
            seed=seed,
        )
    ]
    return pd.DataFrame(enrichment_rows).reset_index(drop=True)

confusion_category_matrix(frame, *, metadata_frame, true_column='true_label', predicted_column='predicted_label', category_columns=None, group_columns=(), participant_column=None, metadata_label_columns=DEFAULT_METADATA_LABEL_COLUMNS)

Summarize directional category-to-category error counts and lifts.

Source code in src/neureptrace/metrics/confusion.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
def confusion_category_matrix(
    frame: pd.DataFrame,
    *,
    metadata_frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    category_columns: Sequence[str] | str | None = None,
    group_columns: Sequence[str] = (),
    participant_column: str | None = None,
    metadata_label_columns: Sequence[str] = DEFAULT_METADATA_LABEL_COLUMNS,
) -> pd.DataFrame:
    """Summarize directional category-to-category error counts and lifts.

    Per-group rows are collected and ordered by descending
    ``category_confusion_lift`` (rows with a non-finite lift go last), then by
    descending ``count``, then by category column, true category and predicted
    category compared as strings. An empty frame is returned when no label
    metadata or no category columns are available.
    """
    group_columns = _normalize_columns(group_columns)
    working = _prediction_frame(
        frame,
        true_column=true_column,
        predicted_column=predicted_column,
        group_columns=group_columns,
        participant_column=participant_column,
    )
    metadata_by_label = _metadata_by_label(metadata_frame, metadata_label_columns)
    category_columns = _normalize_category_columns(metadata_frame, category_columns, metadata_label_columns)
    if not (metadata_by_label and category_columns):
        return pd.DataFrame()

    matrix_rows: list[dict[str, object]] = [
        row
        for key, subset in _iter_frame_groups(working, group_columns)
        for row in _summarize_category_matrix_for_group(
            subset,
            _group_row(group_columns, key),
            metadata_by_label,
            category_columns,
        )
    ]

    def _ranking(row: dict[str, object]) -> tuple:
        lift = float(row["category_confusion_lift"])
        # Negate so larger lifts sort first; non-finite lifts are pushed to the end.
        lift_rank = -lift if np.isfinite(lift) else np.inf
        return (
            lift_rank,
            -int(row["count"]),
            str(row["category_column"]),
            str(row["true_category"]),
            str(row["predicted_category"]),
        )

    matrix_rows.sort(key=_ranking)
    return pd.DataFrame(matrix_rows).reset_index(drop=True)

confusion_counts(frame, true_column='true_label', predicted_column='predicted_label', group_columns=())

Count true/predicted label pairs in a trial-level prediction table.

Source code in src/neureptrace/metrics/confusion.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
def confusion_counts(
    frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Count true/predicted label pairs in a trial-level prediction table.

    The label columns are renamed to ``true_label`` and ``predicted_label`` and
    rows are counted per combination of the group columns and the two labels.
    Missing values are kept as their own groups, and the result carries the
    occurrence total in a ``count`` column.
    """
    group_columns = _normalize_columns(group_columns)
    _require_columns(frame, [true_column, predicted_column, *group_columns])

    standard_names = {true_column: "true_label", predicted_column: "predicted_label"}
    selected = frame[[*group_columns, true_column, predicted_column]]
    pairs = selected.rename(columns=standard_names)

    grouping = [*group_columns, "true_label", "predicted_label"]
    grouped = pairs.groupby(grouping, dropna=False, sort=True)
    tallies = grouped.size().reset_index(name="count")
    return tallies.reset_index(drop=True)

confusion_pair_summary(frame, true_column='true_label', predicted_column='predicted_label', group_columns=(), participant_column=None, metadata_frame=None, metadata_label_columns=DEFAULT_METADATA_LABEL_COLUMNS, label_prefix='label')

Summarize off-diagonal errors as unordered, bidirectional label pairs.

Expected counts preserve the true-label and predicted-label error marginals. Metadata columns, when supplied, are copied for both labels and get an additional same_<metadata_column> flag when both sides are known.

Source code in src/neureptrace/metrics/confusion.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
def confusion_pair_summary(
    frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    group_columns: Sequence[str] = (),
    participant_column: str | None = None,
    metadata_frame: pd.DataFrame | None = None,
    metadata_label_columns: Sequence[str] = DEFAULT_METADATA_LABEL_COLUMNS,
    label_prefix: str = "label",
) -> pd.DataFrame:
    """Summarize off-diagonal errors as unordered, bidirectional label pairs.

    Expected counts preserve the true-label and predicted-label error marginals.
    Metadata columns, when supplied, are copied for both labels and get an
    additional ``same_<metadata_column>`` flag when both sides are known.

    Rows are ordered by descending ``total_confusions``, then by descending
    ``mean_directional_rate`` (non-finite rates last), then by the two pair
    labels stored under ``<label_prefix>_a`` and ``<label_prefix>_b``.
    """
    group_columns = _normalize_columns(group_columns)
    working = _prediction_frame(
        frame,
        true_column=true_column,
        predicted_column=predicted_column,
        group_columns=group_columns,
        participant_column=participant_column,
    )
    metadata_by_label = _metadata_by_label(metadata_frame, metadata_label_columns)

    pair_rows: list[dict[str, object]] = [
        row
        for key, subset in _iter_frame_groups(working, group_columns)
        for row in _summarize_confusion_pairs_for_group(
            subset,
            _group_row(group_columns, key),
            metadata_by_label,
            metadata_label_columns,
            label_prefix,
        )
    ]

    first_label = f"{label_prefix}_a"
    second_label = f"{label_prefix}_b"

    def _ranking(row: dict[str, object]) -> tuple:
        rate = float(row["mean_directional_rate"])
        # Negate so higher rates sort first; non-finite rates are pushed to the end.
        rate_rank = -rate if np.isfinite(rate) else np.inf
        return (
            -int(row["total_confusions"]),
            rate_rank,
            _label_sort_key(row[first_label]),
            _label_sort_key(row[second_label]),
        )

    pair_rows.sort(key=_ranking)
    return pd.DataFrame(pair_rows).reset_index(drop=True)

per_class_accuracy(frame, true_column='true_label', predicted_column='predicted_label', participant_column=None, group_columns=())

Summarize one-vs-rest recall/accuracy for each true class.

Source code in src/neureptrace/metrics/confusion.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def per_class_accuracy(
    frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    participant_column: str | None = None,
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Summarize one-vs-rest recall/accuracy for each true class.

    For every combination of the group columns and the true label, the result
    reports the number of trials, how many of them were predicted correctly,
    and the fraction correct. When ``participant_column`` is given, the number
    of distinct non-missing participants per class is added as
    ``n_participants``.
    """
    group_columns = _normalize_columns(group_columns)
    participant_columns = [] if participant_column is None else [participant_column]
    _require_columns(frame, [true_column, predicted_column, *group_columns, *participant_columns])

    selected = frame[[*group_columns, true_column, predicted_column, *participant_columns]]
    working = selected.rename(columns={true_column: "true_label", predicted_column: "predicted_label"})
    working["_correct"] = working["true_label"] == working["predicted_label"]

    keys = [*group_columns, "true_label"]
    summaries: list[dict[str, object]] = []
    for group_key, group in working.groupby(keys, dropna=False, sort=True):
        hits = group["_correct"]
        summary = _group_row(keys, group_key)
        summary["n_trials"] = int(len(group))
        summary["n_correct"] = int(hits.sum())
        summary["accuracy"] = float(hits.mean())
        if participant_column is not None:
            summary["n_participants"] = int(group[participant_column].nunique(dropna=True))
        summaries.append(summary)

    return pd.DataFrame(summaries).reset_index(drop=True)