Skip to content

Semantic Stages

neureptrace.semantic_stages

analyze_semantic_stages(state_trace_csvs, *, posterior_threshold=0.6, match_threshold=0.6, min_duration=0.04, out_time=None, out_stages=None, out_report=None)

Analyze whether category-conditioned state traces form stable temporal stages.

Source code in src/neureptrace/semantic_stages.py
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
def analyze_semantic_stages(
    state_trace_csvs: list[Path],
    *,
    posterior_threshold: float = 0.6,
    match_threshold: float = 0.6,
    min_duration: float = 0.04,
    out_time: Path | None = None,
    out_stages: Path | None = None,
    out_report: Path | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, str | None]:
    """Run the semantic-stage analysis end to end on state-trace CSV files.

    The traces are read, summarized per category over time, and scanned for
    stable stages. Each ``out_*`` path that is given receives the matching
    artifact; its parent directory is created when missing.

    Args:
        state_trace_csvs: CSV files to read with ``read_state_traces``.
        posterior_threshold: Forwarded to stage detection and the report.
        match_threshold: Forwarded to stage detection and the report.
        min_duration: Forwarded to stage detection and the report.
        out_time: Optional CSV destination for the time summary.
        out_stages: Optional CSV destination for the detected stages.
        out_report: Optional destination for the Markdown report.

    Returns:
        ``(time_summary, stages, report)``. ``report`` is ``None`` unless
        ``out_report`` was given, because the report is only built on demand.
    """
    thresholds = {
        "posterior_threshold": posterior_threshold,
        "match_threshold": match_threshold,
        "min_duration": min_duration,
    }

    traces = read_state_traces(state_trace_csvs)
    # Only the summary table is needed here; the state names are discarded.
    time_summary = summarize_category_timecourse(traces)[0]
    stages = detect_stable_stages(time_summary, **thresholds)

    for table, destination in ((time_summary, out_time), (stages, out_stages)):
        if destination is None:
            continue
        destination.parent.mkdir(parents=True, exist_ok=True)
        table.to_csv(destination, index=False)

    if out_report is None:
        return time_summary, stages, None

    report = build_stage_report(time_summary, stages, **thresholds)
    out_report.parent.mkdir(parents=True, exist_ok=True)
    out_report.write_text(report, encoding="utf-8")
    return time_summary, stages, report

build_stage_report(time_summary, stages, *, posterior_threshold, match_threshold, min_duration)

Build a compact Markdown report for the semantic-stage question.

Source code in src/neureptrace/semantic_stages.py
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
def build_stage_report(
    time_summary: pd.DataFrame,
    stages: pd.DataFrame,
    *,
    posterior_threshold: float,
    match_threshold: float,
    min_duration: float,
) -> str:
    """Render the semantic-stage findings as a short Markdown document.

    The document contains, in order: a preamble explaining what a stable stage
    is, the thresholds used, one table row per entry of ``stages`` (or a
    single sentence when ``stages`` is empty), a per-category peak table when
    ``time_summary`` is non-empty, and a closing interpretation note.

    Args:
        time_summary: Per-category time summary; the row with the highest
            ``posterior_true_class_mean`` of each group becomes a peak row.
        stages: Detected stages, one row per stage.
        posterior_threshold: Posterior threshold echoed in the report.
        match_threshold: Viterbi match threshold echoed in the report.
        min_duration: Minimum stage duration in seconds echoed in the report.

    Returns:
        The report text, lines joined with ``"\\n"`` and ending in a newline.
    """
    report_lines = [
        "# NeuRepTrace Semantic Stage Report",
        "",
        "Question: do semantic categories unfold in stable temporal stages?",
        "",
        "A stable stage is a contiguous time range where the posterior assigned to",
        "the trial's semantic class and the Viterbi match rate both exceed the",
        "configured thresholds.",
        "",
        "## Thresholds",
        "",
        f"- Posterior threshold: {posterior_threshold:.3f}",
        f"- Viterbi match threshold: {match_threshold:.3f}",
        f"- Minimum duration: {min_duration:.3f} s",
        "",
        "## Stable Stages",
        "",
    ]

    if stages.empty:
        report_lines.append("No stable semantic stages were detected at these thresholds.")
    else:
        report_lines.append(
            "| Decoder | Emission mode | Semantic class | Start (s) | Stop (s) | Duration (s) | Mean posterior | Mean match | Peak time (s) | Subject min |"
        )
        report_lines.append("| --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |")
        for stage in stages.itertuples(index=False):
            # Stage tables without a subject-count column report zero subjects.
            subject_floor = getattr(stage, "n_subjects_min", 0)
            stage_line = (
                f"| {stage.decoder} | {stage.emission_mode} | {stage.semantic_class} "
                f"| {stage.start_time:.3f} | {stage.stop_time:.3f} | "
                f"{stage.duration:.3f} | {stage.mean_posterior_true_class:.3f} "
                f"| {stage.mean_viterbi_match_rate:.3f} | {stage.peak_time:.3f} | {subject_floor} |"
            )
            report_lines.append(stage_line)

    if not time_summary.empty:
        peak_keys = [*_stage_group_columns(time_summary), "true_class"]
        # Index label of the highest mean posterior within each group.
        peak_labels = time_summary.groupby(peak_keys)["posterior_true_class_mean"].idxmax()
        peak_rows = time_summary.loc[peak_labels].sort_values(peak_keys).reset_index(drop=True)

        report_lines += [
            "",
            "## Category Peaks",
            "",
            "| Decoder | Emission mode | Semantic class | Peak time (s) | Peak posterior | Match rate |",
            "| --- | --- | --- | ---: | ---: | ---: |",
        ]
        report_lines.extend(
            f"| {peak.decoder} | {peak.emission_mode} | {peak.true_class} | {peak.time:.3f} | {peak.posterior_true_class_mean:.3f} | {peak.viterbi_match_rate:.3f} |"
            for peak in peak_rows.itertuples(index=False)
        )

    report_lines += [
        "",
        "Interpretation should be paired with the temporal-model controls. A",
        "stage is strongest when it survives these descriptive thresholds and",
        "the parent temporal model beats shuffled-time, shuffled-label, and",
        "baseline-window controls.",
        "",
    ]
    return "\n".join(report_lines)

detect_stable_stages(time_summary, *, posterior_threshold=0.6, match_threshold=0.6, min_duration=0.04)

Detect contiguous semantic stages from a category-conditioned time summary.

Source code in src/neureptrace/semantic_stages.py
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
def detect_stable_stages(
    time_summary: pd.DataFrame,
    *,
    posterior_threshold: float = 0.6,
    match_threshold: float = 0.6,
    min_duration: float = 0.04,
) -> pd.DataFrame:
    """Detect contiguous semantic stages from a category-conditioned time summary.

    Within each decoder / emission-mode / class group, a time point is stable
    when ``posterior_true_class_mean >= posterior_threshold`` and
    ``viterbi_match_rate >= match_threshold``. Each run of stable time points
    whose span (last time minus first time) reaches ``min_duration`` becomes
    one output row.

    Args:
        time_summary: Table with at least ``time``, ``true_class``,
            ``posterior_true_class_mean``, ``viterbi_match_rate`` and
            ``n_sequences`` columns; ``n_subjects`` is optional.
        posterior_threshold: Minimum mean posterior of the true class.
        match_threshold: Minimum Viterbi match rate.
        min_duration: Minimum stage span, in the units of ``time``.

    Returns:
        One row per detected stage. The column set is always the same, also
        when the input is empty or no stage qualifies, so the result can be
        written to CSV and read back without special-casing.
    """
    stage_columns = [
        "decoder",
        "emission_mode",
        "semantic_class",
        "start_time",
        "stop_time",
        "duration",
        "n_timepoints",
        "mean_posterior_true_class",
        "mean_viterbi_match_rate",
        "peak_time",
        "peak_posterior_true_class",
        "n_subjects_min",
        "n_sequences_min",
    ]
    if time_summary.empty:
        # An empty summary may carry no columns at all, in which case sorting
        # or grouping on "time" would raise instead of reporting "no stages".
        return pd.DataFrame(columns=stage_columns)

    # Differences of decimal time stamps are inexact (0.3 - 0.26 < 0.04), so a
    # stage lasting exactly ``min_duration`` must not be lost to rounding.
    duration_tolerance = 1e-9
    has_subject_counts = "n_subjects" in time_summary.columns

    rows = []
    group_columns = [*_stage_group_columns(time_summary), "true_class"]
    for keys, group in time_summary.sort_values("time").groupby(group_columns, sort=True):
        # pandas yields a scalar key for a single grouping column on some versions.
        key_values = keys if isinstance(keys, tuple) else (keys,)
        group_values = dict(zip(group_columns, key_values, strict=True))
        stable = (
            (group["posterior_true_class_mean"].to_numpy(dtype=float) >= posterior_threshold)
            & (group["viterbi_match_rate"].to_numpy(dtype=float) >= match_threshold)
        )
        times = group["time"].to_numpy(dtype=float)
        # Segment bounds are used as inclusive positions into the sorted group.
        for start_index, stop_index in _contiguous_segments(stable):
            start_time = float(times[start_index])
            stop_time = float(times[stop_index])
            duration = stop_time - start_time
            if duration < min_duration - duration_tolerance:
                continue
            segment = group.iloc[start_index : stop_index + 1]
            # Positional lookup of the first maximum: unlike ``.loc[idxmax()]``
            # it still selects a single row when index labels are duplicated.
            peak_position = int(segment["posterior_true_class_mean"].to_numpy(dtype=float).argmax())
            peak_row = segment.iloc[peak_position]
            rows.append(
                {
                    "decoder": group_values.get("decoder", "decoder"),
                    "emission_mode": group_values.get("emission_mode", "calibrated"),
                    "semantic_class": group_values["true_class"],
                    "start_time": start_time,
                    "stop_time": stop_time,
                    "duration": duration,
                    "n_timepoints": len(segment),
                    "mean_posterior_true_class": float(segment["posterior_true_class_mean"].mean()),
                    "mean_viterbi_match_rate": float(segment["viterbi_match_rate"].mean()),
                    "peak_time": float(peak_row["time"]),
                    "peak_posterior_true_class": float(peak_row["posterior_true_class_mean"]),
                    "n_subjects_min": int(segment["n_subjects"].min()) if has_subject_counts else 0,
                    "n_sequences_min": int(segment["n_sequences"].min()),
                }
            )
    return pd.DataFrame(rows, columns=stage_columns)

posterior_columns(frame)

Return posterior state columns in state-index order.

Source code in src/neureptrace/semantic_stages.py
26
27
28
29
30
31
32
33
34
35
36
def posterior_columns(frame: pd.DataFrame) -> list[str]:
    """Return posterior state columns in state-index order.

    Only string column labels starting with ``posterior_state_`` are
    considered; other labels (including non-string ones) are ignored. Labels
    with a decimal suffix come first, ordered by integer value; every other
    suffix follows in lexicographic order.

    Args:
        frame: Table whose column labels are inspected.

    Returns:
        The matching column labels, sorted as described above.

    Raises:
        ValueError: If no column label starts with ``posterior_state_``.
    """
    prefix = "posterior_state_"
    columns = [
        column
        for column in frame.columns
        if isinstance(column, str) and column.startswith(prefix)
    ]
    if not columns:
        raise ValueError("State trace CSVs must contain columns named 'posterior_state_*'.")

    def sort_key(column: str) -> tuple[int, int, str]:
        suffix = column.removeprefix(prefix)
        # ``isdecimal`` rather than ``isdigit``: the latter also accepts
        # characters such as superscript digits, which ``int`` cannot parse.
        if suffix.isdecimal():
            return (0, int(suffix), suffix)
        # A dedicated rank keeps named states after every numbered state,
        # however large the state index is.
        return (1, 0, suffix)

    return sorted(columns, key=sort_key)

read_state_traces(csv_paths)

Read state traces emitted by neureptrace.temporal_model.

Source code in src/neureptrace/semantic_stages.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def read_state_traces(csv_paths: list[Path]) -> pd.DataFrame:
    """Load state-trace CSVs written by ``neureptrace.temporal_model`` into one table.

    Every file must provide ``time``, ``viterbi_class`` and at least one
    ``posterior_state_*`` column. Columns a file lacks are filled in:
    ``subject`` from the file stem, ``decoder`` as ``"decoder"``,
    ``emission_mode`` as ``"calibrated"``, ``sequence_id`` from
    ``sample_index`` (or a running row number), and ``source_file`` /
    ``source_path`` from the path. Identifier and provenance columns are
    converted to strings.

    Args:
        csv_paths: Paths of the CSV files to read.

    Returns:
        All files concatenated in the given order with a fresh integer index.

    Raises:
        ValueError: If ``csv_paths`` is empty or a file lacks a required column.
    """
    if not csv_paths:
        raise ValueError("At least one state trace CSV path is required.")

    required_columns = ("time", "viterbi_class")
    loaded = []
    for path in csv_paths:
        table = pd.read_csv(path)

        absent = [name for name in required_columns if name not in table.columns]
        if absent:
            raise ValueError(f"{path} is missing required columns: {absent}")
        # Called for validation only: raises when no posterior column exists.
        posterior_columns(table)

        if "subject" not in table.columns:
            table["subject"] = path.stem
        for name, default in (("decoder", "decoder"), ("emission_mode", "calibrated")):
            if name not in table.columns:
                table[name] = default
        if "sequence_id" not in table.columns:
            table["sequence_id"] = (
                table["sample_index"] if "sample_index" in table.columns else np.arange(len(table))
            )
        for name in ("subject", "decoder", "emission_mode"):
            table[name] = table[name].astype(str)

        # Provenance: keep values already in the file, fill the gaps from the path.
        for name, fallback in (("source_file", path.name), ("source_path", str(path))):
            if name in table.columns:
                table[name] = table[name].fillna(fallback)
            else:
                table[name] = fallback
        for name in ("source_file", "source_path"):
            table[name] = table[name].astype(str)

        loaded.append(table)

    return pd.concat(loaded, ignore_index=True)

summarize_category_timecourse(state_traces)

Summarize category-conditioned state stability over time.

Category metrics are subject-balanced: trials/sequences are averaged within each subject first, then subject means are averaged at each class/time point.

Source code in src/neureptrace/semantic_stages.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
def summarize_category_timecourse(state_traces: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]:
    """Summarize category-conditioned state stability over time.

    Category metrics are subject-balanced: trials/sequences are averaged within
    each subject first, then subject means are averaged at each class/time point.
    When ``state_traces`` has no ``true_class`` column, the dominant-state
    summary from ``summarize_dominant_timecourse`` is returned instead.

    Returns:
        ``(summary, state_names)``: the summary table and the state names
        derived from the posterior columns.
    """
    posterior_cols = posterior_columns(state_traces)
    state_names = _state_names(state_traces, posterior_cols)

    has_labels = "true_class" in state_traces.columns
    if not has_labels:
        # No ground-truth labels: fall back to the dominant latent state.
        dominant_summary = summarize_dominant_timecourse(state_traces)
        return dominant_summary, state_names

    labelled = _add_true_class_alignment(state_traces, posterior_cols, state_names)
    sort_keys = [*_stage_group_columns(labelled), "true_class", "time"]
    balanced = _subject_balanced_category_summary(labelled, sort_keys)
    ordered = balanced.sort_values(sort_keys).reset_index(drop=True)
    return ordered, state_names

summarize_dominant_timecourse(state_traces)

Summarize dominant latent-state stability when true category labels are absent.

Source code in src/neureptrace/semantic_stages.py
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
def summarize_dominant_timecourse(state_traces: pd.DataFrame) -> pd.DataFrame:
    """Summarize the dominant latent state per time point when true labels are absent.

    For each stage group and time point, posteriors are averaged within each
    subject and then across subjects; the state with the highest mean is the
    dominant state and is reported in the ``true_class`` column. The Viterbi
    match rate is the subject-balanced share of rows whose ``viterbi_class``
    equals the dominant state's name.

    Returns:
        One row per group and time point, sorted by the stage group columns,
        ``true_class`` and ``time``. An empty frame is returned unchanged when
        there are no groups.
    """
    posterior_cols = posterior_columns(state_traces)
    state_names = _state_names(state_traces, posterior_cols)

    traces = state_traces.copy()
    traces["sequence_key"] = _sequence_keys(traces)

    stage_keys = _stage_group_columns(traces)
    grouping = [*stage_keys, "time"]
    has_viterbi_posterior = "viterbi_posterior" in traces.columns

    records = []
    for key, block in traces.groupby(grouping, sort=True):
        # pandas yields a scalar key for a single grouping column on some versions.
        key_tuple = key if isinstance(key, tuple) else (key,)
        labels = dict(zip(grouping, key_tuple, strict=True))

        # Subject-balanced posterior means: within subject first, then across.
        per_subject = block.groupby("subject", sort=True)[posterior_cols].mean()
        state_means = per_subject.mean().to_numpy(dtype=float)
        ranking = np.argsort(state_means)
        top = int(ranking[-1])
        has_runner_up = len(ranking) > 1
        second_best = float(state_means[ranking[-2]]) if has_runner_up else float("nan")
        top_name = state_names[top]

        is_match = block["viterbi_class"].astype(str) == top_name
        match_by_subject = (
            block.assign(viterbi_matches_dominant=is_match)
            .groupby("subject", sort=True)["viterbi_matches_dominant"]
            .mean()
        )

        if has_viterbi_posterior:
            by_subject = block.groupby("subject", sort=True)["viterbi_posterior"].mean()
            viterbi_posterior_mean = float(by_subject.mean())
        else:
            # Without a recorded Viterbi posterior, reuse the dominant mean.
            viterbi_posterior_mean = float(state_means[top])

        if has_runner_up:
            margin = float(state_means[top] - second_best)
        else:
            margin = float("nan")

        records.append(
            {
                "decoder": labels.get("decoder", "decoder"),
                "emission_mode": labels.get("emission_mode", "calibrated"),
                "true_class": top_name,
                "time": float(labels["time"]),
                "n_observations": len(block),
                "n_subjects": block["subject"].nunique(),
                "n_sequences": block["sequence_key"].nunique(),
                "posterior_true_class_mean": float(state_means[top]),
                "posterior_true_class_sem": _time_sem(per_subject.iloc[:, top]),
                "viterbi_match_rate": float(match_by_subject.mean()),
                "viterbi_posterior_mean": viterbi_posterior_mean,
                "posterior_margin": margin,
            }
        )

    summary = pd.DataFrame(records)
    if summary.empty:
        return summary
    ordering = [*_stage_group_columns(summary), "true_class", "time"]
    return summary.sort_values(ordering).reset_index(drop=True)