Metrics

`neureptrace.metrics`

`brier_score_multiclass(probabilities, labels)`

Compute multiclass Brier score using one-hot targets.

Source code in src/neureptrace/metrics/__init__.py

def brier_score_multiclass(probabilities: np.ndarray, labels: np.ndarray) -> float:
    """Compute multiclass Brier score using one-hot targets.

    Parameters
    ----------
    probabilities:
        Probability matrix; validated and coerced by
        ``validate_probability_inputs``.
    labels:
        Class-index labels, one per row of ``probabilities``.

    Returns
    -------
    float
        Mean over samples of the summed squared difference between each
        probability row and the one-hot encoding of its label.

    Raises
    ------
    ValueError
        If ``labels`` is missing, or if input validation rejects the inputs.
    """
    probabilities, labels = validate_probability_inputs(probabilities, labels)
    if labels is None:
        # Explicit check rather than ``assert`` so the guard survives ``python -O``.
        raise ValueError("labels must be provided")

    # Build the one-hot target matrix: a single 1.0 per row at the label column.
    targets = np.zeros_like(probabilities, dtype=float)
    targets[np.arange(labels.shape[0]), labels] = 1.0
    return float(np.mean(np.sum((probabilities - targets) ** 2, axis=1)))

`compare_prepost_windows(frame, metric_column, pre_window, post_window, time_column='time', group_columns=())`

Compare a metric between inclusive pre and post time windows.

Source code in src/neureptrace/metrics/prepost.py

def compare_prepost_windows(
    frame: pd.DataFrame,
    metric_column: str,
    pre_window: Window,
    post_window: Window,
    time_column: str = "time",
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Compare a metric between inclusive pre and post time windows.

    Each window is summarized with ``summarize_window_metric``; the summary
    columns are relabelled with a ``pre``/``post`` marker, the two summaries
    are combined (outer merge on the group columns, or side by side when there
    are none), and a ``<metric>_post_minus_pre`` difference of means is added.
    """

    def _relabel(summary: pd.DataFrame, phase: str) -> pd.DataFrame:
        # Rename the generic summary columns so both phases can share one row.
        mapping = {
            "window_start": f"{phase}_window_start",
            "window_stop": f"{phase}_window_stop",
            "n_rows": f"n_{phase}_rows",
        }
        for statistic in ("mean", "std", "sem"):
            mapping[f"{metric_column}_{statistic}"] = f"{metric_column}_{phase}_{statistic}"
        return summary.rename(columns=mapping)

    group_columns = _normalize_columns(group_columns)
    _validate_group_output_columns(
        group_columns,
        _comparison_output_columns(metric_column),
        operation="comparison",
    )

    pre_summary = summarize_window_metric(
        frame,
        metric_column,
        pre_window,
        time_column=time_column,
        group_columns=group_columns,
    )
    post_summary = summarize_window_metric(
        frame,
        metric_column,
        post_window,
        time_column=time_column,
        group_columns=group_columns,
    )
    pre_summary = _relabel(pre_summary, "pre")
    post_summary = _relabel(post_summary, "post")

    if not group_columns:
        # Ungrouped summaries are placed side by side.
        pieces = [pre_summary.reset_index(drop=True), post_summary.reset_index(drop=True)]
        merged = pd.concat(pieces, axis=1)
    else:
        # Outer merge keeps groups that appear in only one of the two windows.
        merged = pre_summary.merge(post_summary, on=group_columns, how="outer")

    post_mean = merged[f"{metric_column}_post_mean"]
    pre_mean = merged[f"{metric_column}_pre_mean"]
    merged[f"{metric_column}_post_minus_pre"] = post_mean - pre_mean
    return _sorted_frame(merged.to_dict("records"), group_columns)

`confusion_category_enrichment(frame, *, metadata_frame, true_column='true_label', predicted_column='predicted_label', category_columns=None, group_columns=(), participant_column=None, metadata_label_columns=DEFAULT_METADATA_LABEL_COLUMNS, n_permutations=10000, seed=0)`

Test whether off-diagonal errors stay within label metadata categories.

Source code in src/neureptrace/metrics/confusion.py

def confusion_category_enrichment(
    frame: pd.DataFrame,
    *,
    metadata_frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    category_columns: Sequence[str] | str | None = None,
    group_columns: Sequence[str] = (),
    participant_column: str | None = None,
    metadata_label_columns: Sequence[str] = DEFAULT_METADATA_LABEL_COLUMNS,
    n_permutations: int | None = 10_000,
    seed: int | None = 0,
) -> pd.DataFrame:
    """Test whether off-diagonal errors stay within label metadata categories.

    The prediction table is processed one group at a time (as defined by
    ``group_columns``) and the per-group enrichment rows are stacked into one
    frame. An empty frame is returned when no label metadata or no category
    columns are available.
    """
    group_columns = _normalize_columns(group_columns)
    predictions = _prediction_frame(
        frame,
        true_column=true_column,
        predicted_column=predicted_column,
        group_columns=group_columns,
        participant_column=participant_column,
    )
    label_metadata = _metadata_by_label(metadata_frame, metadata_label_columns)
    category_columns = _normalize_category_columns(metadata_frame, category_columns, metadata_label_columns)
    if not (label_metadata and category_columns):
        # Nothing to test against: no usable metadata or no categories.
        return pd.DataFrame()

    enrichment_rows: list[dict[str, object]] = []
    for key, subset in _iter_frame_groups(predictions, group_columns):
        group_prefix = _group_row(group_columns, key)
        group_rows = _summarize_category_enrichment_for_group(
            subset,
            group_prefix,
            label_metadata,
            category_columns,
            n_permutations=n_permutations,
            seed=seed,
        )
        enrichment_rows.extend(group_rows)

    result = pd.DataFrame(enrichment_rows)
    return result.reset_index(drop=True)

`confusion_category_matrix(frame, *, metadata_frame, true_column='true_label', predicted_column='predicted_label', category_columns=None, group_columns=(), participant_column=None, metadata_label_columns=DEFAULT_METADATA_LABEL_COLUMNS)`

Summarize directional category-to-category error counts and lifts.

Source code in src/neureptrace/metrics/confusion.py

def confusion_category_matrix(
    frame: pd.DataFrame,
    *,
    metadata_frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    category_columns: Sequence[str] | str | None = None,
    group_columns: Sequence[str] = (),
    participant_column: str | None = None,
    metadata_label_columns: Sequence[str] = DEFAULT_METADATA_LABEL_COLUMNS,
) -> pd.DataFrame:
    """Summarize directional category-to-category error counts and lifts.

    Rows are ordered by descending finite ``category_confusion_lift``
    (non-finite lifts last), then descending ``count``, then by category
    column, true category and predicted category as strings. An empty frame is
    returned when no label metadata or no category columns are available.
    """

    def _ordering(entry: dict[str, object]) -> tuple:
        lift = float(entry["category_confusion_lift"])
        # Negate so larger lifts sort first; non-finite lifts are pushed to the end.
        lift_rank = -lift if np.isfinite(lift) else np.inf
        return (
            lift_rank,
            -int(entry["count"]),
            str(entry["category_column"]),
            str(entry["true_category"]),
            str(entry["predicted_category"]),
        )

    group_columns = _normalize_columns(group_columns)
    predictions = _prediction_frame(
        frame,
        true_column=true_column,
        predicted_column=predicted_column,
        group_columns=group_columns,
        participant_column=participant_column,
    )
    label_metadata = _metadata_by_label(metadata_frame, metadata_label_columns)
    category_columns = _normalize_category_columns(metadata_frame, category_columns, metadata_label_columns)
    if not (label_metadata and category_columns):
        return pd.DataFrame()

    matrix_rows: list[dict[str, object]] = []
    for key, subset in _iter_frame_groups(predictions, group_columns):
        group_prefix = _group_row(group_columns, key)
        matrix_rows.extend(
            _summarize_category_matrix_for_group(subset, group_prefix, label_metadata, category_columns)
        )

    matrix_rows.sort(key=_ordering)
    return pd.DataFrame(matrix_rows).reset_index(drop=True)

`confusion_counts(frame, true_column='true_label', predicted_column='predicted_label', group_columns=())`

Count true/predicted label pairs in a trial-level prediction table.

Source code in src/neureptrace/metrics/confusion.py

def confusion_counts(
    frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Count true/predicted label pairs in a trial-level prediction table.

    The label columns are renamed to ``true_label`` / ``predicted_label`` and
    rows are counted per (group columns, true label, predicted label)
    combination; missing keys are kept as their own group. The result has one
    row per observed combination with a ``count`` column.
    """
    group_columns = _normalize_columns(group_columns)
    _require_columns(frame, [true_column, predicted_column, *group_columns])

    selected = frame[[*group_columns, true_column, predicted_column]]
    renamed = selected.rename(columns={true_column: "true_label", predicted_column: "predicted_label"})

    grouping_keys = [*group_columns, "true_label", "predicted_label"]
    pair_sizes = renamed.groupby(grouping_keys, dropna=False, sort=True).size()
    tallied = pair_sizes.reset_index(name="count")
    return tallied.reset_index(drop=True)

`confusion_pair_summary(frame, true_column='true_label', predicted_column='predicted_label', group_columns=(), participant_column=None, metadata_frame=None, metadata_label_columns=DEFAULT_METADATA_LABEL_COLUMNS, label_prefix='label')`

Summarize off-diagonal errors as unordered, bidirectional label pairs.

Expected counts preserve the true-label and predicted-label error marginals. Metadata columns, when supplied, are copied for both labels and get an additional same_<metadata_column> flag when both sides are known.

Source code in src/neureptrace/metrics/confusion.py

def confusion_pair_summary(
    frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    group_columns: Sequence[str] = (),
    participant_column: str | None = None,
    metadata_frame: pd.DataFrame | None = None,
    metadata_label_columns: Sequence[str] = DEFAULT_METADATA_LABEL_COLUMNS,
    label_prefix: str = "label",
) -> pd.DataFrame:
    """Summarize off-diagonal errors as unordered, bidirectional label pairs.

    Expected counts preserve the true-label and predicted-label error
    marginals. When metadata is supplied, its columns are copied for both
    labels of a pair, plus a ``same_<metadata_column>`` flag when both sides
    are known.

    Rows are ordered by descending ``total_confusions``, then descending
    finite ``mean_directional_rate`` (non-finite rates last), then by the two
    pair labels.
    """
    first_label = f"{label_prefix}_a"
    second_label = f"{label_prefix}_b"

    def _ordering(entry: dict[str, object]) -> tuple:
        total_rank = -int(entry["total_confusions"])
        rate = float(entry["mean_directional_rate"])
        # Negate so larger rates sort first; non-finite rates go to the end.
        rate_rank = -rate if np.isfinite(rate) else np.inf
        return (
            total_rank,
            rate_rank,
            _label_sort_key(entry[first_label]),
            _label_sort_key(entry[second_label]),
        )

    group_columns = _normalize_columns(group_columns)
    predictions = _prediction_frame(
        frame,
        true_column=true_column,
        predicted_column=predicted_column,
        group_columns=group_columns,
        participant_column=participant_column,
    )
    label_metadata = _metadata_by_label(metadata_frame, metadata_label_columns)

    pair_rows: list[dict[str, object]] = []
    for key, subset in _iter_frame_groups(predictions, group_columns):
        group_prefix = _group_row(group_columns, key)
        pair_rows.extend(
            _summarize_confusion_pairs_for_group(
                subset,
                group_prefix,
                label_metadata,
                metadata_label_columns,
                label_prefix,
            )
        )

    pair_rows.sort(key=_ordering)
    return pd.DataFrame(pair_rows).reset_index(drop=True)

`expected_calibration_error(probabilities, labels, *, n_bins=10)`

Compute top-label expected calibration error.

Source code in src/neureptrace/metrics/__init__.py

def expected_calibration_error(probabilities: np.ndarray, labels: np.ndarray, *, n_bins: int = 10) -> float:
    """Compute top-label expected calibration error.

    Each sample's confidence is the largest probability in its row. The
    confidences are split into ``n_bins`` equal-width bins on ``[0, 1]``, and
    the result is the sum over non-empty bins of the bin's sample fraction
    times ``|accuracy - mean confidence|``.

    Parameters
    ----------
    probabilities:
        Probability matrix; validated and coerced by
        ``validate_probability_inputs``.
    labels:
        Class-index labels, one per row of ``probabilities``.
    n_bins:
        Number of equal-width confidence bins.

    Returns
    -------
    float
        The expected calibration error.

    Raises
    ------
    ValueError
        If ``labels`` is missing, or if input validation rejects the inputs.
    """
    probabilities, labels = validate_probability_inputs(probabilities, labels)
    if labels is None:
        # Explicit check rather than ``assert`` so the guard survives ``python -O``.
        raise ValueError("labels must be provided")
    n_bins = _validate_positive_integer(n_bins, "n_bins")

    predictions = probabilities.argmax(axis=1)
    confidences = probabilities.max(axis=1)
    correct = predictions == labels

    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for left, right in zip(edges[:-1], edges[1:]):
        # Bins are half-open [left, right) except the last, which also
        # includes confidence == 1.0.
        if right == 1.0:
            in_bin = (confidences >= left) & (confidences <= right)
        else:
            in_bin = (confidences >= left) & (confidences < right)
        if not np.any(in_bin):
            continue
        bin_weight = np.mean(in_bin)  # fraction of all samples in this bin
        bin_accuracy = np.mean(correct[in_bin])
        bin_confidence = np.mean(confidences[in_bin])
        ece += bin_weight * abs(bin_accuracy - bin_confidence)
    return float(ece)

`negative_log_likelihood(probabilities, labels, *, eps=1e-15)`

Compute mean categorical negative log-likelihood from probabilities.

Source code in src/neureptrace/metrics/__init__.py

def negative_log_likelihood(probabilities: np.ndarray, labels: np.ndarray, *, eps: float = 1e-15) -> float:
    """Compute mean categorical negative log-likelihood from probabilities.

    Parameters
    ----------
    probabilities:
        Probability matrix; validated and coerced by
        ``validate_probability_inputs``.
    labels:
        Class-index labels, one per row of ``probabilities``.
    eps:
        Lower clipping bound applied to the true-class probabilities before
        taking the logarithm, so a zero probability yields a finite loss.

    Returns
    -------
    float
        Mean of ``-log(p_true)`` over samples, with ``p_true`` clipped to
        ``[eps, 1]``.

    Raises
    ------
    ValueError
        If ``labels`` is missing, or if input validation rejects the inputs.
    """
    probabilities, labels = validate_probability_inputs(probabilities, labels)
    if labels is None:
        # Explicit check rather than ``assert`` so the guard survives ``python -O``.
        raise ValueError("labels must be provided")
    eps = _validate_probability_floor(eps, "eps")

    # Probability assigned to the true class of each sample.
    true_probabilities = probabilities[np.arange(labels.shape[0]), labels]
    return float(-np.mean(np.log(np.clip(true_probabilities, eps, 1.0))))

`per_class_accuracy(frame, true_column='true_label', predicted_column='predicted_label', participant_column=None, group_columns=())`

Summarize one-vs-rest recall/accuracy for each true class.

Source code in src/neureptrace/metrics/confusion.py

def per_class_accuracy(
    frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    participant_column: str | None = None,
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Summarize one-vs-rest recall/accuracy for each true class.

    For every (group columns, true label) combination the result reports the
    number of trials, the number of correct predictions and their ratio. When
    ``participant_column`` is given, the number of distinct non-missing
    participants is reported as well.
    """
    group_columns = _normalize_columns(group_columns)
    track_participants = participant_column is not None
    participant_extra = [participant_column] if track_participants else []

    _require_columns(frame, [true_column, predicted_column, *group_columns, *participant_extra])

    selected = frame[[*group_columns, true_column, predicted_column, *participant_extra]]
    working = selected.rename(columns={true_column: "true_label", predicted_column: "predicted_label"})
    working["_correct"] = working["true_label"] == working["predicted_label"]

    class_keys = [*group_columns, "true_label"]
    summaries: list[dict[str, object]] = []
    for key, trials in working.groupby(class_keys, dropna=False, sort=True):
        summary = _group_row(class_keys, key)
        hits = trials["_correct"]
        summary["n_trials"] = int(len(trials))
        summary["n_correct"] = int(hits.sum())
        summary["accuracy"] = float(hits.mean())
        if track_participants:
            summary["n_participants"] = int(trials[participant_column].nunique(dropna=True))
        summaries.append(summary)

    return pd.DataFrame(summaries).reset_index(drop=True)

`rank_class_scores(scores, classes, y_true, *, top_k=(2, 3), row_top_k=3, class_column='class')`

Rank true labels in a per-class score matrix and compute top-k metrics.

Missing true labels are counted as top-k failures but are excluded from the finite mean/median rank. If no class-score columns are available, top-k and rank summaries are undefined and returned as NaN.

Source code in src/neureptrace/metrics/ranking.py

def rank_class_scores(
    scores: Sequence[Sequence[float]] | np.ndarray | None,
    classes: Sequence | np.ndarray | None,
    y_true: Sequence | np.ndarray,
    *,
    top_k: Sequence[int] = (2, 3),
    row_top_k: int = 3,
    class_column: str = "class",
) -> dict[str, object]:
    """Rank true labels in a per-class score matrix and compute top-k metrics.

    Missing true labels are counted as top-k failures but are excluded from the
    finite mean/median rank. If no class-score columns are available, top-k and
    rank summaries are undefined and returned as ``NaN``.

    Parameters
    ----------
    scores:
        Two-dimensional score matrix with one row per sample and one column
        per entry of ``classes``. Higher scores rank first.
    classes:
        Class label for each score column; labels must be unique.
    y_true:
        True label for each sample.
    top_k:
        Cut-offs (each at least 1) for which top-k accuracy is reported.
    row_top_k:
        Number of best-ranked classes (at least 0) recorded in each per-sample
        row.
    class_column:
        Name fragment used in the per-sample ``rank<i>_<class_column>`` keys.

    Returns
    -------
    dict
        ``top_k_accuracy`` (mapping from ``k`` to accuracy),
        ``true_label_ranks`` (float array of 1-based ranks, ``NaN`` when the
        true label is not among ``classes``), ``mean_true_label_rank``,
        ``median_true_label_rank`` and ``rows`` (one dict per sample).

    Raises
    ------
    ValueError
        If ``y_true`` or ``classes`` has an incompatible shape, if ``scores``
        contains boolean, complex or non-finite values or is not a
        two-dimensional matrix, if its shape does not match ``y_true`` and
        ``classes``, or if ``classes`` contains duplicates.
    """

    # NOTE(review): presumably these copy one-pass iterables so they can be
    # read more than once below -- confirm against the helper.
    classes = _materialize_reusable_label_input(classes)
    y_true = _materialize_reusable_label_input(y_true)

    if classes is not None and _has_incompatible_array_label_shape(y_true, classes):
        raise ValueError("y_true must be one-dimensional.")
    y_true = _label_vector(y_true, name="y_true")
    top_k = tuple(_validate_integer(k, name="top_k", minimum=1) for k in top_k)
    row_top_k = _validate_integer(row_top_k, name="row_top_k", minimum=0)
    class_column = _validate_class_column_name(class_column)

    # Without scores or class labels there is nothing to rank.
    if scores is None or classes is None:
        return _empty_class_rank_result(y_true, top_k)

    scores = _materialize_reusable_score_input(scores)
    # Reject boolean and complex inputs before the float conversion below.
    if _scores_contain_boolean(scores):
        raise ValueError("scores must contain numeric score values, not boolean flags.")
    if _scores_contain_complex(scores):
        raise ValueError("scores must contain real-valued scores, not complex values.")
    try:
        score_matrix = np.asarray(scores, dtype=float)
    except (TypeError, ValueError) as exc:
        raise ValueError("scores must be a two-dimensional matrix.") from exc
    if score_matrix.ndim != 2:
        raise ValueError("scores must be a two-dimensional matrix.")
    if _has_incompatible_class_matrix(classes, expected_n_classes=score_matrix.shape[1]):
        raise ValueError("classes must be one-dimensional.")
    class_order = _label_vector(classes, name="classes")
    if score_matrix.shape[0] != y_true.shape[0]:
        raise ValueError("scores and y_true must contain the same samples.")
    if score_matrix.shape[1] != class_order.size:
        raise ValueError("scores columns must match classes.")
    if not np.all(np.isfinite(score_matrix)):
        raise ValueError("scores must contain only finite values.")
    duplicate_class = _find_duplicate_class_label(class_order)
    if duplicate_class is not None:
        raise ValueError(f"classes must be unique; duplicate label {duplicate_class!r} found.")
    # A matrix with zero class columns is valid input but cannot be ranked.
    if score_matrix.shape[1] == 0:
        return _empty_class_rank_result(y_true, top_k)

    # Sort each row by descending score; the stable mergesort keeps tied
    # scores in their original class-column order.
    order = np.argsort(-score_matrix, axis=1, kind="mergesort")
    top_hits = {k: [] for k in top_k}
    ranks: list[float] = []
    rows: list[dict[str, object]] = []
    for sample_index, truth in enumerate(y_true):
        ranked = class_order[order[sample_index]]
        # Zero-based position(s) of the true label in the ranked class list;
        # empty when the true label is not one of the classes.
        match = _matching_class_positions(ranked, truth)
        for k in top_k:
            top_hits[k].append(bool(match.size and match[0] < k))
        # Ranks are reported 1-based; a missing true label gets NaN.
        rank = float(match[0] + 1) if match.size else np.nan
        ranks.append(rank)
        row: dict[str, object] = {"true_label_rank": rank, "true_label_score": np.nan}
        true_index = _matching_class_positions(class_order, truth)
        if true_index.size:
            row["true_label_score"] = float(score_matrix[sample_index, true_index[0]])
        # Record the best-ranked classes and their scores for this sample.
        for position, class_index in enumerate(order[sample_index, :row_top_k], start=1):
            row[f"rank{position}_{class_column}"] = _as_python_scalar(class_order[class_index])
            row[f"rank{position}_score"] = float(score_matrix[sample_index, class_index])
        rows.append(row)

    true_label_ranks = np.asarray(ranks, dtype=float)
    return {
        "top_k_accuracy": {k: float(np.mean(top_hits[k])) for k in top_k},
        "true_label_ranks": true_label_ranks,
        "mean_true_label_rank": _finite_nanmean(true_label_ranks),
        "median_true_label_rank": _finite_nanmedian(true_label_ranks),
        "rows": rows,
    }

`reliability_bins(probabilities, labels, *, n_bins=10)`

Summarize top-label reliability bins for calibration plots.

Source code in src/neureptrace/metrics/__init__.py

def reliability_bins(probabilities: np.ndarray, labels: np.ndarray, *, n_bins: int = 10) -> list[dict[str, float | int]]:
    """Summarize top-label reliability bins for calibration plots.

    Each sample's confidence is the largest probability in its row. The
    confidences are split into ``n_bins`` equal-width bins on ``[0, 1]`` and
    one row is produced per bin, including empty bins.

    Parameters
    ----------
    probabilities:
        Probability matrix; validated and coerced by
        ``validate_probability_inputs``.
    labels:
        Class-index labels, one per row of ``probabilities``.
    n_bins:
        Number of equal-width confidence bins.

    Returns
    -------
    list of dict
        One dict per bin with keys ``bin``, ``bin_left``, ``bin_right``,
        ``n_samples``, ``accuracy``, ``confidence`` and ``gap``
        (``accuracy - confidence``). The last three are ``NaN`` for empty
        bins.

    Raises
    ------
    ValueError
        If ``labels`` is missing, or if input validation rejects the inputs.
    """
    probabilities, labels = validate_probability_inputs(probabilities, labels)
    if labels is None:
        # Explicit check rather than ``assert`` so the guard survives ``python -O``.
        raise ValueError("labels must be provided")
    n_bins = _validate_positive_integer(n_bins, "n_bins")

    predictions = probabilities.argmax(axis=1)
    confidences = probabilities.max(axis=1)
    correct = predictions == labels

    rows: list[dict[str, float | int]] = []
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    for bin_index, (left, right) in enumerate(zip(edges[:-1], edges[1:])):
        # Bins are half-open [left, right) except the last, which also
        # includes confidence == 1.0.
        if right == 1.0:
            in_bin = (confidences >= left) & (confidences <= right)
        else:
            in_bin = (confidences >= left) & (confidences < right)
        n_samples = int(np.sum(in_bin))
        if n_samples:
            accuracy = float(np.mean(correct[in_bin]))
            confidence = float(np.mean(confidences[in_bin]))
            gap = accuracy - confidence
        else:
            # Empty bins are still reported, with undefined statistics.
            accuracy = confidence = gap = float("nan")
        rows.append(
            {
                "bin": bin_index,
                "bin_left": float(left),
                "bin_right": float(right),
                "n_samples": n_samples,
                "accuracy": accuracy,
                "confidence": confidence,
                "gap": gap,
            }
        )
    return rows

`summarize_window_metric(frame, metric_column, window, time_column='time', group_columns=())`

Summarize one metric inside an inclusive time window.

Source code in src/neureptrace/metrics/prepost.py

def summarize_window_metric(
    frame: pd.DataFrame,
    metric_column: str,
    window: Window,
    time_column: str = "time",
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Summarize one metric inside an inclusive time window.

    Rows whose time value lies in ``[window_start, window_stop]`` are kept and
    summarized per group: the count of non-missing metric values plus their
    mean, standard deviation and standard error. Raises ``ValueError`` when no
    row falls inside the window.
    """
    group_columns = _normalize_columns(group_columns)
    _validate_group_output_columns(
        group_columns,
        _summary_output_columns(metric_column),
        operation="summary",
    )
    _require_columns(frame, [time_column, metric_column, *group_columns])
    window_start, window_stop = _validate_window(window)
    time_values = _finite_numeric_series(frame[time_column], name=time_column)

    # Both window bounds are inclusive.
    inside_window = (time_values >= window_start) & (time_values <= window_stop)
    window_frame = frame.loc[inside_window]
    if window_frame.empty:
        raise ValueError(f"No rows fall inside window [{window_start}, {window_stop}].")

    summaries: list[dict[str, object]] = []
    for key, subset in _iter_groups(window_frame, group_columns):
        summary = _group_row(group_columns, key)
        values = _finite_numeric_or_missing_series(subset[metric_column], name=metric_column)
        summary["window_start"] = window_start
        summary["window_stop"] = window_stop
        # Only non-missing metric values count towards n_rows.
        summary["n_rows"] = int(values.notna().sum())
        summary[f"{metric_column}_mean"] = _float_or_nan(values.mean())
        summary[f"{metric_column}_std"] = _float_or_nan(values.std())
        summary[f"{metric_column}_sem"] = _float_or_nan(values.sem())
        summaries.append(summary)

    return _sorted_frame(summaries, group_columns)

`top_k_accuracy(probabilities, labels, *, k=1)`

Compute top-k classification accuracy from probability rows.

Probability ties are resolved deterministically by class-index order. This keeps the selected top-k set size equal to k and prevents uniform or exactly tied probability rows from being counted as correct for every class.

Source code in src/neureptrace/metrics/__init__.py

def top_k_accuracy(probabilities: np.ndarray, labels: np.ndarray, *, k: int = 1) -> float:
    """Compute top-k classification accuracy from probability rows.

    Probability ties are resolved deterministically by class-index order. This
    keeps the selected top-k set size equal to ``k`` and prevents uniform or
    exactly tied probability rows from being counted as correct for every class.

    Parameters
    ----------
    probabilities:
        Probability matrix; validated and coerced by
        ``validate_probability_inputs``.
    labels:
        Class-index labels, one per row of ``probabilities``.
    k:
        Number of highest-probability classes that count as a hit.

    Returns
    -------
    float
        Fraction of samples whose label is among the ``k`` highest-probability
        classes; ``1.0`` when ``k`` is at least the number of classes.

    Raises
    ------
    ValueError
        If ``labels`` is missing, or if input validation rejects the inputs.
    """
    probabilities, labels = validate_probability_inputs(probabilities, labels)
    if labels is None:
        # Explicit check rather than ``assert`` so the guard survives ``python -O``.
        raise ValueError("labels must be provided")
    k = _validate_positive_integer(k, "k")
    if k >= probabilities.shape[1]:
        # Every class is selected, so every label is a hit.
        return 1.0

    # Stable sort on negated probabilities: descending order, ties broken by
    # ascending class index.
    top_k = np.argsort(-probabilities, axis=1, kind="mergesort")[:, :k]
    return float(np.mean(np.any(top_k == labels[:, None], axis=1)))

`validate_probability_inputs(probabilities, labels=None, *, require_normalized=True, normalization_atol=1e-06)`

Validate and coerce probability-matrix inputs used by scoring metrics.

Source code in src/neureptrace/metrics/__init__.py

def validate_probability_inputs(
    probabilities: np.ndarray,
    labels: np.ndarray | None = None,
    *,
    require_normalized: bool = True,
    normalization_atol: float = 1e-6,
) -> tuple[np.ndarray, np.ndarray | None]:
    """Validate and coerce probability-matrix inputs used by scoring metrics.

    Parameters
    ----------
    probabilities:
        Matrix-like of shape ``(n_samples, n_classes)`` with finite,
        non-negative values.
    labels:
        Optional class-index labels of shape ``(n_samples,)``; a single-column
        matrix is flattened.
    require_normalized:
        When true, every row must sum to one within ``normalization_atol``;
        accepted rows are then rescaled to sum to exactly one.
    normalization_atol:
        Absolute tolerance for the row-sum check. Also the tolerance for
        slightly negative entries, which are clipped to zero.

    Returns
    -------
    tuple
        The float probability matrix and the validated labels (``None`` when
        no labels were given).

    Raises
    ------
    ValueError
        If any input has the wrong type, shape, range or normalization.
    """
    normalization_atol = _validate_non_negative_finite_float(normalization_atol, "normalization_atol")
    probabilities = _materialize_one_pass_iterables(probabilities)
    if _probabilities_contain_boolean(probabilities):
        raise ValueError("probabilities must contain numeric probability values, not boolean flags")
    try:
        probabilities = np.asarray(probabilities, dtype=float)
    except (TypeError, ValueError) as exc:
        # Non-numeric or ragged input: report it as a validation error instead
        # of leaking the raw numpy conversion error.
        raise ValueError("probabilities must be a numeric matrix with shape (n_samples, n_classes)") from exc
    if probabilities.ndim != 2:
        raise ValueError("probabilities must have shape (n_samples, n_classes)")
    if probabilities.shape[0] == 0 or probabilities.shape[1] == 0:
        raise ValueError("probabilities must contain at least one sample and one class")
    if not np.all(np.isfinite(probabilities)):
        raise ValueError("probabilities must contain only finite values")
    if np.any(probabilities < -normalization_atol):
        raise ValueError("probabilities must be non-negative")
    if np.any(probabilities < 0.0):
        # Negatives within tolerance are treated as rounding noise.
        probabilities = np.maximum(probabilities, 0.0)

    if require_normalized:
        row_sums = probabilities.sum(axis=1)
        if not np.allclose(row_sums, 1.0, atol=normalization_atol, rtol=0.0):
            raise ValueError("probability rows must sum to one")
        # Remove the residual within-tolerance deviation from one.
        probabilities = probabilities / row_sums[:, None]

    if labels is None:
        return probabilities, None

    labels = np.asarray(_materialize_one_pass_iterables(labels))
    if labels.ndim == 2 and labels.shape[1] == 1:
        # Accept a column vector as a flat label vector.
        labels = labels.reshape(-1)
    if labels.ndim != 1:
        raise ValueError("labels must have shape (n_samples,)")
    if probabilities.shape[0] != labels.shape[0]:
        raise ValueError("probabilities and labels must contain the same samples")
    labels = _coerce_label_indices(labels)
    if np.any(labels < 0) or np.any(labels >= probabilities.shape[1]):
        raise ValueError("labels must be valid column indices for probabilities")
    return probabilities, labels

`validate_sample_weight(sample_weight, n_samples)`

Return validated non-negative per-sample weights.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `sample_weight` | `Iterable[float] \| ndarray` | One-dimensional non-negative weights. | required |
| `n_samples` | `int` | Expected number of samples. | required |

Source code in src/neureptrace/metrics/weighted.py

def validate_sample_weight(sample_weight: Iterable[float] | np.ndarray, n_samples: int) -> np.ndarray:
    """Check per-sample weights and return them as a float vector.

    Parameters
    ----------
    sample_weight:
        One-dimensional non-negative weights; a single-column matrix is
        flattened.
    n_samples:
        Expected number of samples.

    Returns
    -------
    numpy.ndarray
        Finite, non-negative float weights of length ``n_samples`` with a
        positive total.

    Raises
    ------
    ValueError
        If the weights are boolean, non-numeric, wrongly shaped, non-finite,
        negative, or sum to zero.
    """
    shape_message = "sample_weight must have shape (n_samples,)"

    try:
        candidate = _sample_weight_array(sample_weight)
    except (TypeError, ValueError) as exc:
        raise ValueError(shape_message) from exc

    # Booleans would convert to 0.0/1.0 silently, so reject them before casting.
    if _weights_contain_boolean(candidate):
        raise ValueError("sample_weight must contain numeric weights, not boolean values")

    try:
        values = candidate.astype(float, copy=False)
    except (TypeError, ValueError) as exc:
        raise ValueError("sample_weight must contain numeric weights") from exc

    is_column_vector = values.ndim == 2 and values.shape[1] == 1
    if is_column_vector:
        values = values.reshape(-1)
    if values.ndim != 1:
        raise ValueError(shape_message)
    if values.shape[0] != n_samples:
        raise ValueError("sample_weight and probabilities must contain the same samples")

    if not np.all(np.isfinite(values)):
        raise ValueError("sample_weight must contain only finite values")
    if np.any(values < 0.0):
        raise ValueError("sample_weight must be non-negative")
    total = float(np.sum(values))
    if total <= 0.0:
        raise ValueError("sample_weight must have positive total weight")
    return values

`weighted_brier_score_multiclass(probabilities, labels, sample_weight)`

Compute a weighted multiclass Brier score using one-hot targets.

Source code in src/neureptrace/metrics/weighted.py

def weighted_brier_score_multiclass(
    probabilities: np.ndarray,
    labels: np.ndarray,
    sample_weight: Iterable[float] | np.ndarray,
) -> float:
    """Compute a weighted multiclass Brier score using one-hot targets.

    Each sample's loss is the summed squared difference between its
    probability row and the one-hot encoding of its label; the result is the
    ``sample_weight``-weighted average of those losses.
    """
    probabilities, labels = _validate_probability_inputs(probabilities, labels)
    weights = validate_sample_weight(sample_weight, probabilities.shape[0])

    # One-hot encode the labels: a single 1.0 per row at the label column.
    one_hot = np.zeros_like(probabilities, dtype=float)
    sample_rows = np.arange(labels.shape[0])
    one_hot[sample_rows, labels] = 1.0

    squared_error = (probabilities - one_hot) ** 2
    per_sample_loss = np.sum(squared_error, axis=1)
    weighted_mean = np.average(per_sample_loss, weights=weights)
    return float(weighted_mean)

`weighted_expected_calibration_error(probabilities, labels, sample_weight, *, n_bins=10)`

Compute weighted top-label expected calibration error.

Source code in src/neureptrace/metrics/weighted.py

def weighted_expected_calibration_error(
    probabilities: np.ndarray,
    labels: np.ndarray,
    sample_weight: Iterable[float] | np.ndarray,
    *,
    n_bins: int = 10,
) -> float:
    """Compute weighted top-label expected calibration error.

    Top-label confidences are split into ``n_bins`` equal-width bins on
    ``[0, 1]``. Each bin with positive weight contributes its share of the
    total weight times the absolute gap between its weighted accuracy and
    weighted mean confidence.
    """
    probabilities, labels = _validate_probability_inputs(probabilities, labels)
    weights = validate_sample_weight(sample_weight, probabilities.shape[0])
    n_bins = _validate_n_bins(n_bins)

    top_class = probabilities.argmax(axis=1)
    top_confidence = probabilities.max(axis=1)
    hits = top_class == labels
    weight_total = float(np.sum(weights))

    bin_edges = np.linspace(0.0, 1.0, n_bins + 1)
    calibration_error = 0.0
    for lower, upper in zip(bin_edges[:-1], bin_edges[1:]):
        # Bins are half-open [lower, upper) except the last, which also
        # includes confidence == 1.0.
        if upper == 1.0:
            below_upper = top_confidence <= upper
        else:
            below_upper = top_confidence < upper
        members = (top_confidence >= lower) & below_upper
        if not np.any(members):
            continue

        member_weights = weights[members]
        member_weight_total = float(np.sum(member_weights))
        if member_weight_total <= 0.0:
            # Samples with zero weight only: the bin contributes nothing.
            continue

        weighted_accuracy = float(np.average(hits[members].astype(float), weights=member_weights))
        weighted_confidence = float(np.average(top_confidence[members], weights=member_weights))
        share = member_weight_total / weight_total
        calibration_error += share * abs(weighted_accuracy - weighted_confidence)
    return float(calibration_error)

`weighted_negative_log_likelihood(probabilities, labels, sample_weight, *, eps=1e-15)`

Compute weighted mean categorical negative log-likelihood.

Source code in src/neureptrace/metrics/weighted.py

def weighted_negative_log_likelihood(
    probabilities: np.ndarray,
    labels: np.ndarray,
    sample_weight: Iterable[float] | np.ndarray,
    *,
    eps: float = 1e-15,
) -> float:
    """Compute weighted mean categorical negative log-likelihood.

    The probability of each sample's true class is clipped to ``[eps, 1]``
    before the logarithm is taken, and the per-sample losses are averaged with
    ``sample_weight``. ``eps`` must be finite and strictly between 0 and 1.
    """
    probabilities, labels = _validate_probability_inputs(probabilities, labels)
    weights = validate_sample_weight(sample_weight, probabilities.shape[0])

    eps_message = "eps must be finite and in the open interval (0, 1)"
    eps = _coerce_numeric_scalar(eps, eps_message)
    eps_is_valid = np.isfinite(eps) and 0.0 < eps < 1.0
    if not eps_is_valid:
        raise ValueError(eps_message)

    # Probability assigned to the true class of each sample.
    sample_rows = np.arange(labels.shape[0])
    true_class_probability = probabilities[sample_rows, labels]
    clipped = np.clip(true_class_probability, eps, 1.0)
    per_sample_loss = -np.log(clipped)
    return float(np.average(per_sample_loss, weights=weights))

`weighted_reliability_bins(probabilities, labels, sample_weight, *, n_bins=10)`

Summarize weighted top-label reliability bins for calibration plots.

The returned rows keep the unweighted reliability_bins schema and add sample_weight plus sample_weight_fraction so downstream reports can display both raw-bin occupancy and the effective contribution of each bin.

Source code in src/neureptrace/metrics/weighted.py

def weighted_reliability_bins(
    probabilities: np.ndarray,
    labels: np.ndarray,
    sample_weight: Iterable[float] | np.ndarray,
    *,
    n_bins: int = 10,
) -> list[dict[str, float | int]]:
    """Build one weighted top-label calibration row per confidence bin.

    The confidence of a sample is its largest class probability, and the
    sample counts as correct when the arg-max class equals its label. The
    interval [0, 1] is cut into ``n_bins`` equal-width bins; every bin is
    half-open on the right except the one ending at 1.0, which is closed.

    Args:
        probabilities: Per-sample class probabilities (classes along axis 1).
        labels: Class index of each sample.
        sample_weight: One weight per sample.
        n_bins: Number of equal-width confidence bins.

    Returns:
        A list with one dict per bin, in bin order, holding ``bin``,
        ``bin_left``, ``bin_right``, ``n_samples`` (raw count),
        ``sample_weight`` (summed weight in the bin),
        ``sample_weight_fraction`` (share of the total weight), and the
        weighted ``accuracy``, ``confidence`` and ``gap``
        (accuracy - confidence). The last three are NaN for a bin that is
        empty or whose summed weight is not positive.
    """
    probabilities, labels = _validate_probability_inputs(probabilities, labels)
    weights = validate_sample_weight(sample_weight, probabilities.shape[0])
    n_bins = _validate_n_bins(n_bins)

    confidences = probabilities.max(axis=1)
    # Cast once up front; np.average needs numeric values, not booleans.
    hit_flags = (probabilities.argmax(axis=1) == labels).astype(float)
    total_weight = float(np.sum(weights))
    edges = np.linspace(0.0, 1.0, n_bins + 1)

    rows: list[dict[str, float | int]] = []
    # zip stops at the shorter sequence, so this pairs each edge with its successor.
    for bin_index, (left, right) in enumerate(zip(edges, edges[1:])):
        # Only the bin that ends at 1.0 includes its right edge.
        under_right = confidences <= right if right == 1.0 else confidences < right
        members = (confidences >= left) & under_right
        n_samples = int(np.sum(members))

        # Defaults describe an empty (or zero-weight) bin.
        bin_weight_sum = 0.0
        accuracy = float("nan")
        confidence = float("nan")
        gap = float("nan")
        if n_samples:
            member_weights = weights[members]
            bin_weight_sum = float(np.sum(member_weights))
            if bin_weight_sum > 0.0:
                accuracy = float(np.average(hit_flags[members], weights=member_weights))
                confidence = float(np.average(confidences[members], weights=member_weights))
                gap = accuracy - confidence

        rows.append(
            {
                "bin": bin_index,
                "bin_left": float(left),
                "bin_right": float(right),
                "n_samples": n_samples,
                "sample_weight": bin_weight_sum,
                "sample_weight_fraction": bin_weight_sum / total_weight,
                "accuracy": accuracy,
                "confidence": confidence,
                "gap": gap,
            }
        )
    return rows

`weighted_top_k_accuracy(probabilities, labels, sample_weight, *, k=1)`

Compute weighted top-k classification accuracy.

Probability ties are resolved deterministically by class-index order. This keeps the selected top-k set size equal to k and prevents uniform or exactly tied probability rows from being counted as correct for every class.

Source code in src/neureptrace/metrics/weighted.py

def weighted_top_k_accuracy(
    probabilities: np.ndarray,
    labels: np.ndarray,
    sample_weight: Iterable[float] | np.ndarray,
    *,
    k: int = 1,
) -> float:
    """Return the weighted share of samples whose label is among the top ``k``.

    Classes are ranked per row by descending probability with a stable sort,
    so equal probabilities keep class-index order. Exactly ``k`` classes are
    therefore selected per row, and a tied or uniform row is not credited for
    every class.

    Args:
        probabilities: Per-sample class probabilities (classes along axis 1).
        labels: Class index of each sample.
        sample_weight: One weight per sample.
        k: Number of top-ranked classes that count as a hit.

    Returns:
        The weighted accuracy as a ``float``; ``1.0`` when ``k`` is at least
        the number of classes.
    """
    probabilities, labels = _validate_probability_inputs(probabilities, labels)
    weights = validate_sample_weight(sample_weight, probabilities.shape[0])
    k = _validate_k(k)

    n_classes = probabilities.shape[1]
    if k >= n_classes:
        # Every class is selected, so every label is trivially a hit.
        return 1.0

    # Negating turns the ascending stable sort into a descending ranking
    # that still breaks ties by the lower class index.
    ranking = np.argsort(-probabilities, axis=1, kind="mergesort")
    selected = ranking[:, :k]
    label_column = labels[:, None]
    hits = np.any(selected == label_column, axis=1).astype(float)
    return float(np.average(hits, weights=weights))

Pre/Post Windows

`neureptrace.metrics.prepost`

`compare_prepost_windows(frame, metric_column, pre_window, post_window, time_column='time', group_columns=())`

Compare a metric between inclusive pre and post time windows.

Source code in src/neureptrace/metrics/prepost.py

def compare_prepost_windows(
    frame: pd.DataFrame,
    metric_column: str,
    pre_window: Window,
    post_window: Window,
    time_column: str = "time",
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Summarize a metric in a pre and a post window and report the change.

    Each window is summarized with ``summarize_window_metric``. The summary
    columns are then renamed with ``pre``/``post`` markers, the two tables are
    combined, and ``<metric_column>_post_minus_pre`` is added as the post mean
    minus the pre mean.

    Args:
        frame: Input table holding the time, metric and grouping columns.
        metric_column: Name of the column being summarized.
        pre_window: Window passed through for the "pre" summary.
        post_window: Window passed through for the "post" summary.
        time_column: Name of the column holding time values.
        group_columns: Columns to summarize by. When non-empty the two
            summaries are outer-joined on these columns; otherwise they are
            placed side by side.

    Returns:
        The combined comparison table produced by ``_sorted_frame``.
    """
    group_columns = _normalize_columns(group_columns)
    expected_outputs = _comparison_output_columns(metric_column)
    _validate_group_output_columns(group_columns, expected_outputs, operation="comparison")

    # Summarize both windows before renaming anything.
    pre_table = summarize_window_metric(
        frame,
        metric_column,
        pre_window,
        time_column=time_column,
        group_columns=group_columns,
    )
    post_table = summarize_window_metric(
        frame,
        metric_column,
        post_window,
        time_column=time_column,
        group_columns=group_columns,
    )

    pre_mean_column = f"{metric_column}_pre_mean"
    post_mean_column = f"{metric_column}_post_mean"
    pre_names = {
        "window_start": "pre_window_start",
        "window_stop": "pre_window_stop",
        "n_rows": "n_pre_rows",
        f"{metric_column}_mean": pre_mean_column,
        f"{metric_column}_std": f"{metric_column}_pre_std",
        f"{metric_column}_sem": f"{metric_column}_pre_sem",
    }
    post_names = {
        "window_start": "post_window_start",
        "window_stop": "post_window_stop",
        "n_rows": "n_post_rows",
        f"{metric_column}_mean": post_mean_column,
        f"{metric_column}_std": f"{metric_column}_post_std",
        f"{metric_column}_sem": f"{metric_column}_post_sem",
    }
    pre_table = pre_table.rename(columns=pre_names)
    post_table = post_table.rename(columns=post_names)

    if not group_columns:
        # No join key: align the two summaries positionally, side by side.
        side_by_side = [pre_table.reset_index(drop=True), post_table.reset_index(drop=True)]
        merged = pd.concat(side_by_side, axis=1)
    else:
        # Outer join keeps groups that appear in only one of the windows.
        merged = pd.merge(pre_table, post_table, on=group_columns, how="outer")

    merged[f"{metric_column}_post_minus_pre"] = merged[post_mean_column] - merged[pre_mean_column]
    return _sorted_frame(merged.to_dict("records"), group_columns)

`summarize_window_metric(frame, metric_column, window, time_column='time', group_columns=())`

Summarize one metric inside an inclusive time window.

Source code in src/neureptrace/metrics/prepost.py

def summarize_window_metric(
    frame: pd.DataFrame,
    metric_column: str,
    window: Window,
    time_column: str = "time",
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Summarize a metric over the rows whose time lies inside a window.

    Both window bounds are inclusive. For every group found in the selected
    rows the result records the window bounds, the number of non-missing
    metric values (``n_rows``), and the metric's ``mean``, ``std`` and ``sem``
    as ``<metric_column>_mean`` / ``_std`` / ``_sem``.

    Args:
        frame: Input table holding the time, metric and grouping columns.
        metric_column: Name of the column being summarized.
        window: Window specification, resolved to ``(start, stop)`` by
            ``_validate_window``.
        time_column: Name of the column holding time values.
        group_columns: Columns to summarize by.

    Returns:
        The summary table produced by ``_sorted_frame``.

    Raises:
        ValueError: If no row falls inside the window. The validation helpers
            may raise as well.
    """
    group_columns = _normalize_columns(group_columns)
    expected_outputs = _summary_output_columns(metric_column)
    _validate_group_output_columns(group_columns, expected_outputs, operation="summary")
    _require_columns(frame, [time_column, metric_column, *group_columns])
    window_start, window_stop = _validate_window(window)
    time_values = _finite_numeric_series(frame[time_column], name=time_column)

    # Inclusive on both ends.
    inside_window = (time_values >= window_start) & (time_values <= window_stop)
    window_frame = frame.loc[inside_window]
    if window_frame.empty:
        raise ValueError(f"No rows fall inside window [{window_start}, {window_stop}].")

    mean_column = f"{metric_column}_mean"
    std_column = f"{metric_column}_std"
    sem_column = f"{metric_column}_sem"

    rows: list[dict[str, object]] = []
    for group_key, group in _iter_groups(window_frame, group_columns):
        row = _group_row(group_columns, group_key)
        values = _finite_numeric_or_missing_series(group[metric_column], name=metric_column)
        row["window_start"] = window_start
        row["window_stop"] = window_stop
        # Counts only the non-missing metric values, not every row of the group.
        row["n_rows"] = int(values.notna().sum())
        row[mean_column] = _float_or_nan(values.mean())
        row[std_column] = _float_or_nan(values.std())
        row[sem_column] = _float_or_nan(values.sem())
        rows.append(row)

    return _sorted_frame(rows, group_columns)

Confusion Tables

`neureptrace.metrics.confusion`

`confusion_category_enrichment(frame, *, metadata_frame, true_column='true_label', predicted_column='predicted_label', category_columns=None, group_columns=(), participant_column=None, metadata_label_columns=DEFAULT_METADATA_LABEL_COLUMNS, n_permutations=10000, seed=0)`

Test whether off-diagonal errors stay within label metadata categories.

Source code in src/neureptrace/metrics/confusion.py

def confusion_category_enrichment(
    frame: pd.DataFrame,
    *,
    metadata_frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    category_columns: Sequence[str] | str | None = None,
    group_columns: Sequence[str] = (),
    participant_column: str | None = None,
    metadata_label_columns: Sequence[str] = DEFAULT_METADATA_LABEL_COLUMNS,
    n_permutations: int | None = 10_000,
    seed: int | None = 0,
) -> pd.DataFrame:
    """Test whether misclassifications stay within label metadata categories.

    The prediction table is prepared once, split by ``group_columns``, and
    each group is handed to ``_summarize_category_enrichment_for_group``
    together with the label metadata lookup and the category columns.

    Args:
        frame: Trial-level prediction table.
        metadata_frame: Table describing the labels.
        true_column: Column holding the true label.
        predicted_column: Column holding the predicted label.
        category_columns: Metadata column(s) to test; resolved by
            ``_normalize_category_columns``.
        group_columns: Columns to split the analysis by.
        participant_column: Optional participant identifier column.
        metadata_label_columns: Metadata columns that identify a label.
        n_permutations: Forwarded unchanged to the per-group summary.
        seed: Forwarded unchanged to the per-group summary.

    Returns:
        One table with the rows from every group, or an empty ``DataFrame``
        when there is no label metadata or no category column to test.
    """
    group_columns = _normalize_columns(group_columns)
    working = _prediction_frame(
        frame,
        true_column=true_column,
        predicted_column=predicted_column,
        group_columns=group_columns,
        participant_column=participant_column,
    )
    metadata_by_label = _metadata_by_label(metadata_frame, metadata_label_columns)
    category_columns = _normalize_category_columns(metadata_frame, category_columns, metadata_label_columns)

    # Without metadata or category columns there is nothing to test.
    if not (metadata_by_label and category_columns):
        return pd.DataFrame()

    rows: list[dict[str, object]] = [
        summary_row
        for group_key, group_frame in _iter_frame_groups(working, group_columns)
        for summary_row in _summarize_category_enrichment_for_group(
            group_frame,
            _group_row(group_columns, group_key),
            metadata_by_label,
            category_columns,
            n_permutations=n_permutations,
            seed=seed,
        )
    ]
    return pd.DataFrame(rows).reset_index(drop=True)

`confusion_category_matrix(frame, *, metadata_frame, true_column='true_label', predicted_column='predicted_label', category_columns=None, group_columns=(), participant_column=None, metadata_label_columns=DEFAULT_METADATA_LABEL_COLUMNS)`

Summarize directional category-to-category error counts and lifts.

Source code in src/neureptrace/metrics/confusion.py

def confusion_category_matrix(
    frame: pd.DataFrame,
    *,
    metadata_frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    category_columns: Sequence[str] | str | None = None,
    group_columns: Sequence[str] = (),
    participant_column: str | None = None,
    metadata_label_columns: Sequence[str] = DEFAULT_METADATA_LABEL_COLUMNS,
) -> pd.DataFrame:
    """Tabulate directional category-to-category error counts and lifts.

    Rows from all groups are collected and ordered by descending
    ``category_confusion_lift`` (non-finite lifts last), then descending
    ``count``, then by category column, true category and predicted category
    compared as strings.

    Args:
        frame: Trial-level prediction table.
        metadata_frame: Table describing the labels.
        true_column: Column holding the true label.
        predicted_column: Column holding the predicted label.
        category_columns: Metadata column(s) to tabulate; resolved by
            ``_normalize_category_columns``.
        group_columns: Columns to split the analysis by.
        participant_column: Optional participant identifier column.
        metadata_label_columns: Metadata columns that identify a label.

    Returns:
        The sorted table, or an empty ``DataFrame`` when there is no label
        metadata or no category column to tabulate.
    """
    group_columns = _normalize_columns(group_columns)
    working = _prediction_frame(
        frame,
        true_column=true_column,
        predicted_column=predicted_column,
        group_columns=group_columns,
        participant_column=participant_column,
    )
    metadata_by_label = _metadata_by_label(metadata_frame, metadata_label_columns)
    category_columns = _normalize_category_columns(metadata_frame, category_columns, metadata_label_columns)
    if not (metadata_by_label and category_columns):
        return pd.DataFrame()

    rows: list[dict[str, object]] = [
        matrix_row
        for group_key, group_frame in _iter_frame_groups(working, group_columns)
        for matrix_row in _summarize_category_matrix_for_group(
            group_frame,
            _group_row(group_columns, group_key),
            metadata_by_label,
            category_columns,
        )
    ]

    def _ordering(row: dict[str, object]) -> tuple:
        lift = float(row["category_confusion_lift"])
        # Negate for descending order; +inf pushes non-finite lifts to the end.
        lift_rank = -lift if np.isfinite(lift) else np.inf
        return (
            lift_rank,
            -int(row["count"]),
            str(row["category_column"]),
            str(row["true_category"]),
            str(row["predicted_category"]),
        )

    # list.sort is stable, exactly like sorted().
    rows.sort(key=_ordering)
    return pd.DataFrame(rows).reset_index(drop=True)

`confusion_counts(frame, true_column='true_label', predicted_column='predicted_label', group_columns=())`

Count true/predicted label pairs in a trial-level prediction table.

Source code in src/neureptrace/metrics/confusion.py

def confusion_counts(
    frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Count how often each (true, predicted) label pair occurs.

    Args:
        frame: Trial-level prediction table.
        true_column: Column holding the true label.
        predicted_column: Column holding the predicted label.
        group_columns: Extra columns to count within.

    Returns:
        A table with the group columns, ``true_label``, ``predicted_label``
        and ``count``, sorted by the grouping keys. Rows with missing key
        values are kept as their own groups (``dropna=False``).
    """
    group_columns = _normalize_columns(group_columns)
    _require_columns(frame, [true_column, predicted_column, *group_columns])

    # Use canonical label column names regardless of the caller's naming.
    canonical_names = {true_column: "true_label", predicted_column: "predicted_label"}
    selected = frame[[*group_columns, true_column, predicted_column]]
    working = selected.rename(columns=canonical_names)

    keys = [*group_columns, "true_label", "predicted_label"]
    pair_sizes = working.groupby(keys, dropna=False, sort=True).size()
    counts = pair_sizes.reset_index(name="count")
    return counts.reset_index(drop=True)

`confusion_pair_summary(frame, true_column='true_label', predicted_column='predicted_label', group_columns=(), participant_column=None, metadata_frame=None, metadata_label_columns=DEFAULT_METADATA_LABEL_COLUMNS, label_prefix='label')`

Summarize off-diagonal errors as unordered, bidirectional label pairs.

Expected counts preserve the true-label and predicted-label error marginals. Metadata columns, when supplied, are copied for both labels and get an additional same_<metadata_column> flag when both sides are known.

Source code in src/neureptrace/metrics/confusion.py

def confusion_pair_summary(
    frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    group_columns: Sequence[str] = (),
    participant_column: str | None = None,
    metadata_frame: pd.DataFrame | None = None,
    metadata_label_columns: Sequence[str] = DEFAULT_METADATA_LABEL_COLUMNS,
    label_prefix: str = "label",
) -> pd.DataFrame:
    """Summarize off-diagonal errors as unordered, bidirectional label pairs.

    Expected counts preserve the true-label and predicted-label error
    marginals. Metadata columns, when supplied, are copied for both labels and
    get an additional ``same_<metadata_column>`` flag when both sides are
    known.

    Rows are ordered by descending ``total_confusions``, then descending
    ``mean_directional_rate`` (non-finite rates last), then by the two label
    columns ``<label_prefix>_a`` and ``<label_prefix>_b``.

    Args:
        frame: Trial-level prediction table.
        true_column: Column holding the true label.
        predicted_column: Column holding the predicted label.
        group_columns: Columns to split the analysis by.
        participant_column: Optional participant identifier column.
        metadata_frame: Optional table describing the labels.
        metadata_label_columns: Metadata columns that identify a label.
        label_prefix: Prefix of the two label columns in the output.

    Returns:
        The sorted pair summary with a fresh integer index.
    """
    group_columns = _normalize_columns(group_columns)
    working = _prediction_frame(
        frame,
        true_column=true_column,
        predicted_column=predicted_column,
        group_columns=group_columns,
        participant_column=participant_column,
    )
    metadata_by_label = _metadata_by_label(metadata_frame, metadata_label_columns)

    rows: list[dict[str, object]] = [
        pair_row
        for group_key, group_frame in _iter_frame_groups(working, group_columns)
        for pair_row in _summarize_confusion_pairs_for_group(
            group_frame,
            _group_row(group_columns, group_key),
            metadata_by_label,
            metadata_label_columns,
            label_prefix,
        )
    ]

    label_a_column = f"{label_prefix}_a"
    label_b_column = f"{label_prefix}_b"

    def _ordering(row: dict[str, object]) -> tuple:
        rate = float(row["mean_directional_rate"])
        # Negate for descending order; +inf pushes non-finite rates to the end.
        rate_rank = -rate if np.isfinite(rate) else np.inf
        return (
            -int(row["total_confusions"]),
            rate_rank,
            _label_sort_key(row[label_a_column]),
            _label_sort_key(row[label_b_column]),
        )

    # list.sort is stable, exactly like sorted().
    rows.sort(key=_ordering)
    return pd.DataFrame(rows).reset_index(drop=True)

`per_class_accuracy(frame, true_column='true_label', predicted_column='predicted_label', participant_column=None, group_columns=())`

Summarize one-vs-rest recall/accuracy for each true class.

Source code in src/neureptrace/metrics/confusion.py

def per_class_accuracy(
    frame: pd.DataFrame,
    true_column: str = "true_label",
    predicted_column: str = "predicted_label",
    participant_column: str | None = None,
    group_columns: Sequence[str] = (),
) -> pd.DataFrame:
    """Report, for every true class, how often it was predicted correctly.

    Args:
        frame: Trial-level prediction table.
        true_column: Column holding the true label.
        predicted_column: Column holding the predicted label.
        participant_column: Optional participant identifier column; when
            given, each row also reports ``n_participants``, the number of
            distinct non-missing participants.
        group_columns: Extra columns to compute the accuracy within.

    Returns:
        One row per (group, true label) combination with ``n_trials``,
        ``n_correct`` and ``accuracy`` (the fraction of correct trials).
    """
    group_columns = _normalize_columns(group_columns)
    # The participant column is required and carried along only when requested.
    participant_columns = [] if participant_column is None else [participant_column]
    _require_columns(frame, [true_column, predicted_column, *group_columns, *participant_columns])

    canonical_names = {true_column: "true_label", predicted_column: "predicted_label"}
    selected = frame[[*group_columns, true_column, predicted_column, *participant_columns]]
    working = selected.rename(columns=canonical_names)
    working["_correct"] = working["true_label"] == working["predicted_label"]

    keys = [*group_columns, "true_label"]
    rows: list[dict[str, object]] = []
    for group_key, group in working.groupby(keys, dropna=False, sort=True):
        hits = group["_correct"]
        row = _group_row(keys, group_key)
        row["n_trials"] = int(len(group))
        row["n_correct"] = int(hits.sum())
        row["accuracy"] = float(hits.mean())
        if participant_column is not None:
            row["n_participants"] = int(group[participant_column].nunique(dropna=True))
        rows.append(row)

    return pd.DataFrame(rows).reset_index(drop=True)