Semantic Stages

`neureptrace.semantic_stages`

`analyze_semantic_stages(state_trace_csvs, *, posterior_threshold=0.6, match_threshold=0.6, min_duration=0.04, out_time=None, out_stages=None, out_report=None)`

Analyze whether category-conditioned state traces form stable temporal stages.

Source code in src/neureptrace/semantic_stages.py

def analyze_semantic_stages(
    state_trace_csvs: list[Path],
    *,
    posterior_threshold: float = 0.6,
    match_threshold: float = 0.6,
    min_duration: float = 0.04,
    out_time: Path | None = None,
    out_stages: Path | None = None,
    out_report: Path | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, str | None]:
    """Run the semantic-stage pipeline on state-trace CSVs and optionally save the results.

    The traces are loaded with ``read_state_traces``, summarized over time with
    ``summarize_category_timecourse`` and scanned for stages with
    ``detect_stable_stages``.

    Args:
        state_trace_csvs: Paths of the state-trace CSV files to load.
        posterior_threshold: Stage control forwarded to stage detection and the report.
        match_threshold: Stage control forwarded to stage detection and the report.
        min_duration: Stage control forwarded to stage detection and the report.
        out_time: If given, the time summary is written there as CSV.
        out_stages: If given, the detected stages are written there as CSV.
        out_report: If given, a Markdown report is built and written there (UTF-8).

    Returns:
        ``(time_summary, stages, report)``. ``report`` is ``None`` unless
        ``out_report`` was supplied, because the report is only built on demand.
    """
    # The three controls pass through the module's normalizer once and are then
    # forwarded unchanged to every downstream step.
    posterior_threshold, match_threshold, min_duration = _normalize_stage_controls(
        posterior_threshold=posterior_threshold,
        match_threshold=match_threshold,
        min_duration=min_duration,
    )
    controls = {
        "posterior_threshold": posterior_threshold,
        "match_threshold": match_threshold,
        "min_duration": min_duration,
    }

    traces = read_state_traces(state_trace_csvs)
    # Only the summary table is needed here; the state-name list is discarded.
    time_summary = summarize_category_timecourse(traces)[0]
    stages = detect_stable_stages(time_summary, **controls)

    # Persist the tabular outputs, creating parent directories as needed.
    for destination, table in ((out_time, time_summary), (out_stages, stages)):
        if destination is None:
            continue
        destination.parent.mkdir(parents=True, exist_ok=True)
        table.to_csv(destination, index=False)

    if out_report is None:
        return time_summary, stages, None

    report = build_stage_report(time_summary, stages, **controls)
    out_report.parent.mkdir(parents=True, exist_ok=True)
    out_report.write_text(report, encoding="utf-8")
    return time_summary, stages, report

`build_stage_report(time_summary, stages, *, posterior_threshold, match_threshold, min_duration)`

Build a compact Markdown report for the semantic-stage question.

Source code in src/neureptrace/semantic_stages.py

def build_stage_report(
    time_summary: pd.DataFrame,
    stages: pd.DataFrame,
    *,
    posterior_threshold: float,
    match_threshold: float,
    min_duration: float,
) -> str:
    """Build a compact Markdown report for the semantic-stage question.

    The report contains the configured thresholds, a table of the detected
    stable stages (or a note that none were found) and, when ``time_summary``
    is non-empty, a "Category Peaks" table holding the row with the highest
    ``posterior_true_class_mean`` for every group/class combination.

    Args:
        time_summary: Per-class time summary. The peak table reads the columns
            ``true_class``, ``time``, ``posterior_true_class_mean`` and
            ``viterbi_match_rate``. ``decoder`` and ``emission_mode`` are
            optional and fall back to ``"decoder"`` / ``"calibrated"``.
        stages: Detected stages, one row per stage. ``n_subjects_min``,
            ``decoder`` and ``emission_mode`` are optional (defaults ``0``,
            ``"decoder"`` and ``"calibrated"``).
        posterior_threshold: Posterior threshold shown in the report.
        match_threshold: Viterbi match threshold shown in the report.
        min_duration: Minimum stage duration (seconds) shown in the report.

    Returns:
        The Markdown document as a single newline-joined string.
    """
    posterior_threshold, match_threshold, min_duration = _normalize_stage_controls(
        posterior_threshold=posterior_threshold,
        match_threshold=match_threshold,
        min_duration=min_duration,
    )
    lines = [
        "# NeuRepTrace Semantic Stage Report",
        "",
        "Question: do semantic categories unfold in stable temporal stages?",
        "",
        "A stable stage is a contiguous time range where the posterior assigned to",
        "the trial's semantic class and the Viterbi match rate both exceed the",
        "configured thresholds.",
        "",
        "## Thresholds",
        "",
        f"- Posterior threshold: {posterior_threshold:.3f}",
        f"- Viterbi match threshold: {match_threshold:.3f}",
        f"- Minimum duration: {min_duration:.3f} s",
        "",
        "## Stable Stages",
        "",
    ]

    if stages.empty:
        lines.append("No stable semantic stages were detected at these thresholds.")
    else:
        lines.extend(
            [
                "| Decoder | Emission mode | Semantic class | Start (s) | Stop (s) | Duration (s) | Mean posterior | Mean match | Peak time (s) | Subject min |",
                "| --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
            ]
        )
        for row in stages.itertuples(index=False):
            # Optional columns use the same defaults the stage detector applies.
            n_subjects_min = getattr(row, "n_subjects_min", 0)
            decoder = _markdown_table_cell(getattr(row, "decoder", "decoder"))
            emission_mode = _markdown_table_cell(getattr(row, "emission_mode", "calibrated"))
            semantic_class = _markdown_table_cell(row.semantic_class)
            lines.append(
                f"| {decoder} | {emission_mode} | {semantic_class} | {row.start_time:.3f} | {row.stop_time:.3f} | "
                f"{row.duration:.3f} | {row.mean_posterior_true_class:.3f} | {row.mean_viterbi_match_rate:.3f} | {row.peak_time:.3f} | {n_subjects_min} |"
            )

    if not time_summary.empty:
        peak_group_columns = [*_stage_group_columns(time_summary), "true_class"]
        # idxmax returns index labels. With a fresh positional index each label
        # selects exactly one row, even if the caller's index has duplicates.
        positional = time_summary.reset_index(drop=True)
        peak_labels = positional.groupby(peak_group_columns)["posterior_true_class_mean"].idxmax()
        peaks = positional.loc[peak_labels].sort_values(peak_group_columns).reset_index(drop=True)
        lines.extend(
            [
                "",
                "## Category Peaks",
                "",
                "| Decoder | Emission mode | Semantic class | Peak time (s) | Peak posterior | Match rate |",
                "| --- | --- | --- | ---: | ---: | ---: |",
            ]
        )
        for row in peaks.itertuples(index=False):
            # decoder/emission_mode are not required by the stage detector, so a
            # summary without them must not break the report.
            decoder = _markdown_table_cell(getattr(row, "decoder", "decoder"))
            emission_mode = _markdown_table_cell(getattr(row, "emission_mode", "calibrated"))
            semantic_class = _markdown_table_cell(row.true_class)
            lines.append(f"| {decoder} | {emission_mode} | {semantic_class} | {row.time:.3f} | {row.posterior_true_class_mean:.3f} | {row.viterbi_match_rate:.3f} |")

    lines.extend(
        [
            "",
            "Interpretation should be paired with the temporal-model controls. A",
            "stage is strongest when it survives these descriptive thresholds and",
            "the parent temporal model beats shuffled-time, shuffled-label, and",
            "baseline-window controls.",
            "",
        ]
    )
    return "\n".join(lines)

`detect_stable_stages(time_summary, *, posterior_threshold=0.6, match_threshold=0.6, min_duration=0.04)`

Detect contiguous semantic stages from a category-conditioned time summary.

Source code in src/neureptrace/semantic_stages.py

def detect_stable_stages(
    time_summary: pd.DataFrame,
    *,
    posterior_threshold: float = 0.6,
    match_threshold: float = 0.6,
    min_duration: float = 0.04,
) -> pd.DataFrame:
    """Detect contiguous semantic stages from a category-conditioned time summary.

    Within every group (the module's stage group columns plus ``true_class``),
    rows are ordered by ``time`` and a stage is a run of consecutive rows whose
    ``posterior_true_class_mean`` and ``viterbi_match_rate`` are both greater
    than or equal to their thresholds. Runs whose span (last time minus first
    time) is shorter than ``min_duration`` are discarded.

    Args:
        time_summary: Must contain ``true_class``, ``time``,
            ``posterior_true_class_mean``, ``viterbi_match_rate`` and
            ``n_sequences``. ``n_subjects`` is optional. The input frame is not
            modified; a copy is used.
        posterior_threshold: Minimum mean posterior of the true class.
        match_threshold: Minimum Viterbi match rate.
        min_duration: Minimum stage span, in the units of ``time``.

    Returns:
        One row per stage. The column set is fixed, so an empty result still
        carries its header, for example when written to CSV.

    Raises:
        ValueError: If a required column is missing.
    """
    posterior_threshold, match_threshold, min_duration = _normalize_stage_controls(
        posterior_threshold=posterior_threshold,
        match_threshold=match_threshold,
        min_duration=min_duration,
    )
    required_columns = {"true_class", "time", "posterior_true_class_mean", "viterbi_match_rate", "n_sequences"}
    missing = sorted(required_columns - set(time_summary.columns))
    if missing:
        raise ValueError(f"time_summary is missing required columns: {missing}")
    time_summary = time_summary.copy()
    for column in ("time", "posterior_true_class_mean", "viterbi_match_rate"):
        _coerce_finite_numeric_column(time_summary, column)

    # Declared up front so that "no stages found" yields a frame with the same
    # columns as a populated result.
    output_columns = [
        "decoder",
        "emission_mode",
        "semantic_class",
        "start_time",
        "stop_time",
        "duration",
        "n_timepoints",
        "mean_posterior_true_class",
        "mean_viterbi_match_rate",
        "peak_time",
        "peak_posterior_true_class",
        "n_subjects_min",
        "n_sequences_min",
    ]
    # Durations are differences of floats, e.g. 0.30 - 0.26 evaluates to
    # 0.03999999999999998. The slack stops a stage that nominally lasts exactly
    # ``min_duration`` from being dropped by rounding error.
    duration_tolerance = 1e-9

    rows = []
    group_columns = [*_stage_group_columns(time_summary), "true_class"]
    for keys, group in time_summary.sort_values("time").groupby(group_columns, sort=True):
        key_values = keys if isinstance(keys, tuple) else (keys,)
        group_values = dict(zip(group_columns, key_values, strict=True))
        posterior = group["posterior_true_class_mean"].to_numpy(dtype=float)
        match_rate = group["viterbi_match_rate"].to_numpy(dtype=float)
        times = group["time"].to_numpy(dtype=float)
        stable = (posterior >= posterior_threshold) & (match_rate >= match_threshold)
        # Segment bounds are used as inclusive positions into the sorted group.
        for start_index, stop_index in _contiguous_segments(stable):
            start_time = float(times[start_index])
            stop_time = float(times[stop_index])
            duration = stop_time - start_time
            if duration < min_duration - duration_tolerance:
                continue
            segment = group.iloc[start_index : stop_index + 1]
            # Locate the peak by position instead of by index label, so a
            # non-unique index on the input cannot select several rows.
            segment_posterior = posterior[start_index : stop_index + 1]
            peak_offset = int(segment_posterior.argmax())
            rows.append(
                {
                    "decoder": group_values.get("decoder", "decoder"),
                    "emission_mode": group_values.get("emission_mode", "calibrated"),
                    "semantic_class": group_values["true_class"],
                    "start_time": start_time,
                    "stop_time": stop_time,
                    "duration": duration,
                    "n_timepoints": len(segment),
                    "mean_posterior_true_class": float(segment["posterior_true_class_mean"].mean()),
                    "mean_viterbi_match_rate": float(segment["viterbi_match_rate"].mean()),
                    "peak_time": float(times[start_index + peak_offset]),
                    "peak_posterior_true_class": float(segment_posterior[peak_offset]),
                    "n_subjects_min": int(segment["n_subjects"].min()) if "n_subjects" in segment.columns else 0,
                    "n_sequences_min": int(segment["n_sequences"].min()),
                }
            )
    return pd.DataFrame(rows, columns=output_columns)

`posterior_columns(frame)`

Return posterior state columns in state-index order.

Source code in src/neureptrace/semantic_stages.py

def posterior_columns(frame: pd.DataFrame) -> list[str]:
    """Return posterior state columns in state-index order.

    A posterior column is any column whose label is a string starting with
    ``posterior_state_``. Columns with a numeric suffix come first, ordered by
    integer value; columns with any other suffix follow in alphabetical order.

    Args:
        frame: Table whose column labels are inspected. Non-string labels are
            ignored.

    Returns:
        The matching column names, sorted as described above.

    Raises:
        ValueError: If ``frame`` has no posterior state column.
    """
    prefix = "posterior_state_"
    columns = [column for column in frame.columns if isinstance(column, str) and column.startswith(prefix)]
    if not columns:
        raise ValueError("State trace CSVs must contain columns named 'posterior_state_*'.")

    def sort_key(column: str) -> tuple[int, int, str]:
        suffix = column.removeprefix(prefix)
        # isdecimal() only accepts characters that int() can parse, whereas
        # isdigit() also accepts e.g. superscript digits that make int() raise.
        if suffix.isdecimal():
            return (0, int(suffix), suffix)
        # Named states always sort after numbered ones, however large the index.
        return (1, 0, suffix)

    return sorted(columns, key=sort_key)

`read_state_traces(csv_paths)`

Read state traces emitted by neureptrace.temporal_model.

Source code in src/neureptrace/semantic_stages.py

def read_state_traces(csv_paths: list[Path]) -> pd.DataFrame:
    """Load state-trace CSVs written by ``neureptrace.temporal_model`` into one table.

    Every file must contain ``time`` and ``viterbi_class`` columns plus the
    posterior state columns. Missing bookkeeping columns are filled in:

    * ``subject`` defaults to the file stem,
    * ``decoder`` defaults to ``"decoder"``,
    * ``emission_mode`` defaults to ``"calibrated"``,
    * ``sequence_id`` defaults to ``sample_index`` when present, otherwise to
      the row number within the file,
    * ``source_file`` / ``source_path`` default to the file name / path; when
      present, only their missing entries are filled.

    All of these except ``sequence_id`` are converted to strings.

    Args:
        csv_paths: Paths of the CSV files to read; must not be empty.

    Returns:
        The row-wise concatenation of all files with a fresh integer index.

    Raises:
        ValueError: If ``csv_paths`` is empty or a file lacks a required column.
    """
    if not csv_paths:
        raise ValueError("At least one state trace CSV path is required.")

    loaded = []
    for trace_path in csv_paths:
        table = pd.read_csv(trace_path)

        absent = [name for name in ("time", "viterbi_class") if name not in table.columns]
        if absent:
            raise ValueError(f"{trace_path} is missing required columns: {absent}")
        _coerce_finite_numeric_column(table, "time", source=trace_path)
        _validate_posterior_frame(table, posterior_columns(table), source=trace_path)

        # Identity columns: add a default only where the file has none.
        if "subject" not in table.columns:
            table["subject"] = trace_path.stem
        for name, fallback in (("decoder", "decoder"), ("emission_mode", "calibrated")):
            if name not in table.columns:
                table[name] = fallback
        if "sequence_id" not in table.columns:
            has_sample_index = "sample_index" in table.columns
            table["sequence_id"] = table["sample_index"] if has_sample_index else np.arange(len(table))
        for name in ("subject", "decoder", "emission_mode"):
            table[name] = table[name].astype(str)

        # Provenance columns: keep existing values and patch only the gaps.
        provenance = (("source_file", trace_path.name), ("source_path", str(trace_path)))
        for name, fallback in provenance:
            table[name] = table[name].fillna(fallback) if name in table.columns else fallback
        for name, _ in provenance:
            table[name] = table[name].astype(str)

        loaded.append(table)
    return pd.concat(loaded, ignore_index=True)

`summarize_category_timecourse(state_traces)`

Summarize category-conditioned state stability over time.

Category metrics are subject-balanced: trials/sequences are averaged within each subject first, then subject means are averaged at each class/time point.

Source code in src/neureptrace/semantic_stages.py

def summarize_category_timecourse(state_traces: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]:
    """Summarize how stable the category-conditioned states are over time.

    The metrics are subject-balanced: trials/sequences are first averaged
    within each subject, and the subject means are then averaged at every
    class/time point.

    When ``state_traces`` has no ``true_class`` column, the dominant-state
    summary from ``summarize_dominant_timecourse`` is returned instead.

    Returns:
        ``(summary, state_names)``: the summary table and the list of state names.
    """
    posterior_cols = posterior_columns(state_traces)
    names = _state_names(state_traces, posterior_cols)

    if "true_class" in state_traces.columns:
        labelled = _add_true_class_alignment(state_traces, posterior_cols, names)
        sort_keys = [*_stage_group_columns(labelled), "true_class", "time"]
        balanced = _subject_balanced_category_summary(labelled, sort_keys)
        return balanced.sort_values(sort_keys).reset_index(drop=True), names

    # No ground-truth labels: fall back to the unlabeled summary.
    return summarize_dominant_timecourse(state_traces), names

`summarize_dominant_timecourse(state_traces)`

Summarize dominant latent-state stability when true category labels are absent.

Source code in src/neureptrace/semantic_stages.py

def summarize_dominant_timecourse(state_traces: pd.DataFrame) -> pd.DataFrame:
    """Summarize the stability of the dominant latent state when no true labels exist.

    For every stage group and time point, posteriors are averaged within each
    subject and then across subjects. The state with the largest resulting mean
    is reported as ``true_class``, so the output has the same column names as
    the labeled summary. One row is produced per group/time combination and the
    result is sorted by the stage group columns, ``true_class`` and ``time``.
    """
    posterior_cols = posterior_columns(state_traces)
    names = _state_names(state_traces, posterior_cols)

    traces = state_traces.copy()
    traces["sequence_key"] = _sequence_keys(traces)
    context_columns = _stage_group_columns(traces)
    key_names = [*context_columns, "time"]

    records = []
    for key, block in traces.groupby(key_names, sort=True):
        key_tuple = key if isinstance(key, tuple) else (key,)
        context = dict(zip(key_names, key_tuple, strict=True))

        # Subject-balanced posterior mean per state.
        per_subject = block.groupby("subject", sort=True)[posterior_cols].mean()
        state_means = per_subject.mean().to_numpy(dtype=float)

        # Ascending argsort: the last entry is the dominant state, the one
        # before it the runner-up.
        ranking = np.argsort(state_means)
        top = int(ranking[-1])
        top_name = names[top]
        if len(ranking) > 1:
            runner_up_mean = float(state_means[ranking[-2]])
            margin = float(state_means[top] - runner_up_mean)
        else:
            margin = float("nan")

        # Per-subject share of rows whose Viterbi class equals the dominant state.
        flagged = block.assign(viterbi_matches_dominant=block["viterbi_class"].astype(str) == top_name)
        match_by_subject = flagged.groupby("subject", sort=True)["viterbi_matches_dominant"].mean()

        if "viterbi_posterior" in block.columns:
            subject_viterbi_posterior = block.groupby("subject", sort=True)["viterbi_posterior"].mean()
            viterbi_posterior_mean = float(subject_viterbi_posterior.mean())
        else:
            viterbi_posterior_mean = float(state_means[top])

        records.append(
            {
                "decoder": context.get("decoder", "decoder"),
                "emission_mode": context.get("emission_mode", "calibrated"),
                "true_class": top_name,
                "time": float(context["time"]),
                "n_observations": len(block),
                "n_subjects": block["subject"].nunique(),
                "n_sequences": block["sequence_key"].nunique(),
                "posterior_true_class_mean": float(state_means[top]),
                "posterior_true_class_sem": _time_sem(per_subject.iloc[:, top]),
                "viterbi_match_rate": float(match_by_subject.mean()),
                "viterbi_posterior_mean": viterbi_posterior_mean,
                "posterior_margin": margin,
            }
        )

    summary = pd.DataFrame(records)
    if not summary.empty:
        ordering = [*_stage_group_columns(summary), "true_class", "time"]
        summary = summary.sort_values(ordering).reset_index(drop=True)
    return summary