Skip to content

Metadata

neureptrace.metadata

add_binary_label(metadata, *, source_column, positive_pattern, label_column, negative_pattern=None, positive_label='positive', negative_label='negative', case_sensitive=False)

Add a binary label column by matching text patterns in an existing column.

When negative_pattern is omitted, every non-null source value that does not match positive_pattern receives the negative label. When negative_pattern is provided, unmatched rows receive missing labels and rows that match both patterns keep the positive label.

Source code in src/neureptrace/metadata.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def add_binary_label(
    metadata: pd.DataFrame,
    *,
    source_column: str,
    positive_pattern: str,
    label_column: str,
    negative_pattern: str | None = None,
    positive_label: str = "positive",
    negative_label: str = "negative",
    case_sensitive: bool = False,
) -> pd.DataFrame:
    """Add a binary label column by matching text patterns in an existing column.

    When ``negative_pattern`` is omitted, every non-null source value that does
    not match ``positive_pattern`` receives the negative label. When
    ``negative_pattern`` is provided, unmatched rows receive missing labels and
    rows that match both patterns keep the positive label.
    """
    if source_column not in metadata.columns:
        raise ValueError(f"Source column '{source_column}' not found in metadata.")
    if label_column in metadata.columns:
        raise ValueError(f"Label column '{label_column}' already exists.")

    flags = 0 if case_sensitive else re.IGNORECASE
    source = metadata[source_column].astype("string")
    positive = source.str.contains(positive_pattern, flags=flags, regex=True, na=False)
    if negative_pattern is None:
        negative = source.notna() & ~positive
    else:
        negative = source.str.contains(negative_pattern, flags=flags, regex=True, na=False) & ~positive

    labeled = metadata.copy()
    labeled[label_column] = pd.NA
    labeled.loc[positive, label_column] = positive_label
    labeled.loc[negative, label_column] = negative_label
    return labeled

prepare_binary_metadata(events_csv, out_path, *, source_column, positive_pattern, label_column, negative_pattern=None, positive_label='positive', negative_label='negative', case_sensitive=False)

Load metadata, add a binary label, and write the result as CSV.

Source code in src/neureptrace/metadata.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def prepare_binary_metadata(
    events_csv: Path | str,
    out_path: Path | str,
    *,
    source_column: str,
    positive_pattern: str,
    label_column: str,
    negative_pattern: str | None = None,
    positive_label: str = "positive",
    negative_label: str = "negative",
    case_sensitive: bool = False,
) -> pd.DataFrame:
    """Load metadata, add a binary label, and write the result as CSV.

    Args:
        events_csv: CSV file to load with ``pd.read_csv``.
        out_path: Destination CSV file, given as a ``Path`` or a plain string.
            Missing parent directories are created.
        source_column: Existing column whose text is matched.
        positive_pattern: Regular expression selecting positive rows.
        label_column: Name of the new label column.
        negative_pattern: Optional regular expression selecting negative rows.
        positive_label: Value written for positive rows.
        negative_label: Value written for negative rows.
        case_sensitive: Whether pattern matching is case sensitive.

    Returns:
        The labeled DataFrame that was written to ``out_path`` (the index is
        not written to the file).

    Raises:
        ValueError: Propagated from ``add_binary_label`` when ``source_column``
            is missing or ``label_column`` already exists. Nothing is written
            in that case because labeling happens before any output is created.
    """
    metadata = pd.read_csv(events_csv)
    labeled = add_binary_label(
        metadata,
        source_column=source_column,
        positive_pattern=positive_pattern,
        negative_pattern=negative_pattern,
        label_column=label_column,
        positive_label=positive_label,
        negative_label=negative_label,
        case_sensitive=case_sensitive,
    )
    # pd.read_csv already accepts string paths; normalise out_path so it does
    # too instead of failing on ``.parent`` with an AttributeError.
    destination = Path(out_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    labeled.to_csv(destination, index=False)
    return labeled