# Source code for syng_bts.data_utils

"""Data loading and validation utilities for SyNG-BTS.

This module provides helpers for loading bundled/user data, resolving flexible
input forms (DataFrame, path, bundled name), and validating feature matrices.
"""

from pathlib import Path

import pandas as pd

# Try importlib.resources (Python 3.9+) with fallback to importlib_resources
try:
    from importlib.resources import as_file, files
except ImportError:
    from importlib_resources import as_file, files


def load_bundled_data(
    subdir: str,
    filename: str,
    groups_filename: str | None = None,
) -> tuple[pd.DataFrame, pd.Series | None]:
    """
    Read a bundled feature table and, optionally, its group labels.

    Parameters
    ----------
    subdir : str
        Location of the files below the package's bundled data directory
        (e.g., "examples", "transfer").
    filename : str
        Name of the Parquet file holding the feature matrix.
    groups_filename : str or None
        Name of an optional sidecar Parquet file that stores labels in a
        column called ``groups``.  ``None`` skips the sidecar entirely.

    Returns
    -------
    tuple[pd.DataFrame, pd.Series | None]
        ``(features_df, groups)``.  ``groups`` is the sidecar's ``groups``
        column, or ``None`` when no sidecar was requested, the sidecar file
        could not be found, or it has no ``groups`` column.
    """
    features_df = _read_bundled_parquet(subdir, filename)

    # Nothing more to do when the dataset has no sidecar registered.
    if groups_filename is None:
        return features_df, None

    # The sidecar is best-effort: a missing file or a missing "groups"
    # column degrades to "no groups" instead of failing the whole load.
    try:
        sidecar = _read_bundled_parquet(subdir, groups_filename)
        return features_df, sidecar["groups"]
    except (FileNotFoundError, KeyError):
        return features_df, None


def _read_bundled_parquet(subdir: str, filename: str) -> pd.DataFrame:
    """Load one Parquet file shipped inside the ``syng_bts.data`` package.

    The file is first located through ``importlib.resources``.  If that
    lookup raises ``TypeError``, ``AttributeError`` or ``FileNotFoundError``,
    the file is looked up on disk relative to the installed ``syng_bts``
    package instead.

    Raises
    ------
    FileNotFoundError
        If the file is found by neither lookup.
    """
    try:
        # Walk down one path component at a time: subdir may be nested
        # ("case/brca_subtype") and is always "/"-separated.
        resource = files("syng_bts.data")
        for segment in (*subdir.split("/"), filename):
            resource = resource.joinpath(segment)

        with as_file(resource) as path:
            return pd.read_parquet(path, engine="pyarrow")
    except (TypeError, AttributeError, FileNotFoundError) as e:
        import syng_bts

        # Fallback: resolve against the package's location on disk.
        file_path = Path(syng_bts.__file__).parent.joinpath("data", subdir, filename)
        if not file_path.exists():
            raise FileNotFoundError(
                f"Could not find bundled data file: {subdir}/{filename}"
            ) from e
        return pd.read_parquet(file_path, engine="pyarrow")

# Map of known bundled datasets to their package locations and subdirectories.
# Format: "dataset_name": ("subdir_path", "features.parquet", "groups.parquet" | None)
BUNDLED_DATASETS: dict[str, tuple[str, str, str | None]] = {
    # Example datasets
    "SKCMPositive_4": ("examples", "SKCMPositive_4.parquet", None),
    # Transfer learning datasets
    "BRCA": ("transfer", "BRCA.parquet", None),
    "PRAD": ("transfer", "PRAD.parquet", None),
    # BRCA subtype case study
    "BRCASubtypeSel": (
        "case/brca_subtype",
        "BRCASubtypeSel.parquet",
        "BRCASubtypeSel_groups.parquet",
    ),
    "BRCASubtypeSel_test": (
        "case/brca_subtype",
        "BRCASubtypeSel_test.parquet",
        "BRCASubtypeSel_test_groups.parquet",
    ),
    "BRCASubtypeSel_train": (
        "case/brca_subtype",
        "BRCASubtypeSel_train.parquet",
        "BRCASubtypeSel_train_groups.parquet",
    ),
    "BRCASubtypeSel_train_epoch285_CVAE1-20_generated": (
        "case/brca_subtype",
        "BRCASubtypeSel_train_epoch285_CVAE1-20_generated.parquet",
        "BRCASubtypeSel_train_epoch285_CVAE1-20_generated_groups.parquet",
    ),
    # LIHC subtype case study
    "LIHCSubtypeFamInd": (
        "case/lihc_subtype",
        "LIHCSubtypeFamInd.parquet",
        "LIHCSubtypeFamInd_groups.parquet",
    ),
    "LIHCSubtypeFamInd_DESeq": (
        "case/lihc_subtype",
        "LIHCSubtypeFamInd_DESeq.parquet",
        "LIHCSubtypeFamInd_DESeq_groups.parquet",
    ),
    "LIHCSubtypeFamInd_test74": (
        "case/lihc_subtype",
        "LIHCSubtypeFamInd_test74.parquet",
        "LIHCSubtypeFamInd_test74_groups.parquet",
    ),
    "LIHCSubtypeFamInd_test74_DESeq": (
        "case/lihc_subtype",
        "LIHCSubtypeFamInd_test74_DESeq.parquet",
        "LIHCSubtypeFamInd_test74_DESeq_groups.parquet",
    ),
    "LIHCSubtypeFamInd_train294": (
        "case/lihc_subtype",
        "LIHCSubtypeFamInd_train294.parquet",
        "LIHCSubtypeFamInd_train294_groups.parquet",
    ),
    "LIHCSubtypeFamInd_train294_DESeq": (
        "case/lihc_subtype",
        "LIHCSubtypeFamInd_train294_DESeq.parquet",
        "LIHCSubtypeFamInd_train294_DESeq_groups.parquet",
    ),
}



# [docs]
def list_bundled_datasets() -> list:
    """
    Return the names of every dataset registered in ``BUNDLED_DATASETS``.

    Returns
    -------
    list
        Dataset names, in registry order.  Each name can be passed to
        :func:`resolve_data` to load the corresponding data.
    """
    # Unpacking a dict yields its keys in insertion order.
    return [*BUNDLED_DATASETS]



def _read_user_file(path: Path) -> pd.DataFrame:
    """Read a user-provided CSV or Parquet file.

    Raises
    ------
    ValueError
        If the file extension is not ``.csv`` or ``.parquet``.
    """
    ext = path.suffix.lower()
    if ext == ".parquet":
        return pd.read_parquet(path, engine="pyarrow")
    if ext == ".csv":
        return pd.read_csv(path, header=0)
    raise ValueError(
        f"Unsupported file type '{ext}' for '{path.name}'. "
        "Only .csv and .parquet are supported."
    )


# ---------------------------------------------------------------------------
# Strict data-contract validator
# ---------------------------------------------------------------------------

_METADATA_COLUMNS = {"groups", "samples"}


def _validate_feature_data(df: pd.DataFrame) -> None:
    """Validate that a DataFrame conforms to the feature-only contract.

    Experiment entry points should call this **after** resolving the data
    (i.e. after ``resolve_data()``).  The validator checks:

    1. All columns are numeric.
    2. No metadata-like columns (``groups``, ``samples``) are present.
    3. The index contains unique values (used as sample identifiers).

    Parameters
    ----------
    df : pd.DataFrame
       The feature DataFrame to validate.

    Raises
    ------
    ValueError
        If any of the above constraints are violated.
    """
    # 1. Reject metadata-like column names
    bad_cols = _METADATA_COLUMNS & {c.lower() for c in df.columns}
    if bad_cols:
        offending = [c for c in df.columns if c.lower() in bad_cols]
        raise ValueError(
            f"DataFrame contains metadata column(s) {offending!r} which must "
            "not be included in the feature matrix. Pass group labels via the "
            "'groups' parameter instead and ensure sample IDs are in the "
            "DataFrame index, not a column."
        )

    # 2. All columns must be numeric
    non_numeric = df.select_dtypes(exclude="number").columns.tolist()
    if non_numeric:
        raise ValueError(
            f"DataFrame contains non-numeric column(s): {non_numeric!r}. "
            "Only numeric feature columns are allowed. Remove or convert "
            "non-numeric columns before passing to experiment functions."
        )

    # 3. Index must be unique
    if not df.index.is_unique:
        n_dup = df.index.duplicated().sum()
        raise ValueError(
            f"DataFrame index contains {n_dup} duplicate value(s). "
            "Each row must have a unique identifier (sample ID) as its index."
        )



# [docs]
def resolve_data(
    data: "pd.DataFrame | str | Path",
) -> "tuple[pd.DataFrame, pd.Series | None]":
    """
    Turn a DataFrame, file path, or bundled dataset name into loaded data.

    Resolution order for ``str`` / ``Path`` inputs:

    1. An input containing a path separator (``/`` or ``\\``) is always
       treated as a file path and must exist.
    2. Otherwise a trailing ``.csv`` / ``.parquet`` is stripped and the
       remainder is looked up in the bundled-dataset registry.
    3. Otherwise, if the input has an extension and exists relative to the
       current working directory, it is loaded as a user file.

    Parameters
    ----------
    data : pd.DataFrame, str, or Path
        One of:

        - A ``pd.DataFrame`` — handed back unchanged, with groups ``None``.
        - A ``str`` or ``Path`` naming an existing CSV or Parquet file
          (the extension, ``.csv`` or ``.parquet``, is required).
        - The bare name of a bundled dataset, e.g. ``"SKCMPositive_4"``.

    Returns
    -------
    tuple[pd.DataFrame, pd.Series | None]
        ``(features_df, groups_or_none)``.  Groups can only be non-``None``
        for bundled datasets that ship a groups sidecar; DataFrames and
        user files always yield ``None``.

    Raises
    ------
    ValueError
        If *data* matches neither a bundled dataset nor an existing local
        file; the message lists the available bundled datasets.  Also
        raised for an existing file with an unsupported extension.
    FileNotFoundError
        If *data* contains a path separator but no such file exists.
    TypeError
        If *data* is not a DataFrame, str, or Path.

    Examples
    --------
    >>> from syng_bts.data_utils import resolve_data
    >>> df, groups = resolve_data("SKCMPositive_4")          # bundled
    >>> df, groups = resolve_data("./my_data/custom.csv")    # file path
    >>> df, groups = resolve_data(existing_dataframe)         # pass-through
    """
    # DataFrames need no loading and never carry groups.
    if isinstance(data, pd.DataFrame):
        return data, None

    if not isinstance(data, (str, Path)):
        raise TypeError(
            f"'data' must be a pd.DataFrame, str, or Path, got {type(data).__name__}"
        )

    raw = data if isinstance(data, str) else str(data)
    candidate = Path(raw)

    # A path separator means "this is a file path": load it or fail loudly,
    # never fall through to the bundled-name lookup.
    if any(sep in raw for sep in ("/", "\\")):
        if not candidate.exists():
            raise FileNotFoundError(f"Data file not found: {candidate}")
        return _read_user_file(candidate), None

    # Bundled lookup, tolerating a .csv/.parquet extension on the name.
    name = raw
    lowered = raw.lower()
    for ext in (".csv", ".parquet"):
        if lowered.endswith(ext):
            name = raw[: -len(ext)]
            break

    if name in BUNDLED_DATASETS:
        subdir, filename, groups_filename = BUNDLED_DATASETS[name]
        return load_bundled_data(subdir, filename, groups_filename)

    # Not bundled: accept a bare filename such as "myfile.csv" in the cwd.
    if candidate.suffix and candidate.exists():
        return _read_user_file(candidate), None

    available = ", ".join(sorted(BUNDLED_DATASETS.keys()))
    raise ValueError(
        f"Unknown dataset name '{name}'. Available bundled datasets: {available}"
    )



def _derive_dataname(
    data: "pd.DataFrame | str | Path",
    name: "str | None" = None,
) -> str:
    """
    Derive a short human-readable name for a dataset.

    The name is used in output filenames and metadata.  An explicit *name*
    always takes priority.

    Parameters
    ----------
    data : pd.DataFrame, str, or Path
        The original ``data`` argument the user passed.
    name : str or None
        Explicit override.  When provided, returned as-is.

    Returns
    -------
    str
        A short identifier for the dataset.
    """
    if name is not None:
        return name

    if isinstance(data, (str, Path)):
        p = Path(data)
        # Strip .csv extension if present for bundled-name lookup
        stem = p.stem if p.suffix else str(p)
        # If user passed a bare name like "SKCMPositive_4", stem == name
        return stem

    # DataFrame: try df.attrs["name"], fall back to "data"
    if isinstance(data, pd.DataFrame):
        return data.attrs.get("name", "data")

    return "data"