Sc memmap

`create_metadata(file_path, shared_dict)`

Extract a series of metadata values from AnnData required to process all files into memmaps.

Note: it assumes `var.feature_id` contains the gene symbols for each dataset, in the same order as the columns of `data.X`.

Parameters:

Name	Type	Description	Default
`file_path`	`PosixPath`	Path to `AnnData` stored as *.h5ad.	required
`shared_dict`	`Dict[str, Dict[str, object]]`	Dictionary to store the extracted metadata.	required

Returns:

Name	Type	Description
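`None`	`None`	The metadata is stored in `shared_dict` under `str(file_path)`; nothing is stored if the loaded `data` object is None. A `ValueError` is raised if the file cannot be read.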

Source code in bionemo/geneformer/scripts/sc_memmap.py

def create_metadata(file_path: Path, shared_dict: Dict[str, Dict[str, object]]) -> None:
    """Collect the per-file metadata needed to later pack an `AnnData` file into memmaps.

    The file is loaded with `scanpy.read_h5ad` and a summary record is stored in
    `shared_dict` under the key `str(file_path)`. The record holds:

    * ``"shape"``: `data.shape` of the loaded object,
    * ``"feature_ids"``: the values of `data.var.feature_id` as a list,
    * ``"num_el"``: the number of non-zero entries of the expression matrix
      (`data.raw.X` when `data.raw` is set, `data.X` otherwise),
    * ``"file_path"``: `str(file_path)`.

    Note: it assumes `var.feature_id` contains the gene symbols for each dataset, in the
    same order as the columns of the expression matrix.

    Args:
        file_path (PosixPath):
            Path to `AnnData` stored as *.h5ad.
        shared_dict (Dict[str, Dict[str, object]]):
            Dictionary that receives the extracted metadata (mutated in place).

    Returns:
        None:
            The result is written into `shared_dict`. Nothing is written when the
            loaded object is None.

    Raises:
        ValueError: If the file cannot be read; the original error is chained as the cause.
    """
    try:
        adata = scanpy.read_h5ad(file_path)
    except Exception as e:
        raise ValueError(f"Could not read {file_path}") from e

    # Nothing was loaded, so there is nothing to record.
    if adata is None:
        return

    dims = adata.shape
    gene_ids = list(adata.var.feature_id)

    # Prefer the raw matrix when the file carries one.
    # NOTE(review): `dims` and `gene_ids` come from `adata`, not `adata.raw`; presumably
    # both share the same variables - confirm for files where `raw` has a different shape.
    matrix = adata.X if adata.raw is None else adata.raw.X

    key = str(file_path)
    # - metadata associated with each file
    shared_dict[key] = {
        "shape": dims,
        "feature_ids": gene_ids,
        # Total number of non-zero elements in the sparse matrix.
        "num_el": matrix.count_nonzero(),
        "file_path": key,
    }

`find_ann_data_files(data_path)`

Find all AnnData files with the extension '.h5ad' in the given data path and its subdirectories.

Parameters:

Name	Type	Description	Default
`data_path`	`Path`	The path to the directory containing the AnnData files.	required

Returns:

Type	Description
`List[Path]`	A sorted list of file paths to the AnnData files.

Source code in bionemo/geneformer/scripts/sc_memmap.py

def find_ann_data_files(data_path: Path) -> List[Path]:
    """Find all AnnData files with the extension '.h5ad' in the given data path and its subdirectories.

    Args:
        data_path (Path | str): The directory to search recursively. A plain string is
            accepted as well and is converted to a `Path` first.

    Returns:
        List[Path]: The sorted paths of every entry matching ``*.h5ad`` under `data_path`;
            an empty list when nothing matches.
    """
    # Coerce so that string inputs (as the documentation has always advertised) do not
    # fail on the missing `.rglob` attribute.
    return sorted(Path(data_path).rglob("*.h5ad"))

`write_data(file_path, obs_cols, metadata, gene_data, gene_data_indices, gene_data_ptr, strict=False)`

Writes AnnData into memmap.

Parameters:

Name	Type	Description	Default
`file_path`	`PosixPath`	The path to the file.	required
`obs_cols`	`List[str]`	A list of columns to extract from each AnnData `obs` dataframe.	required
`metadata`	`Dict[str, Dict[str, object]]`	A dictionary containing metadata information on number of elements, shape, and feature names.	required
`gene_data`	`ndarray`	The array to store gene data.	required
`gene_data_indices`	`ndarray`	The array to store gene data indices.	required
`gene_data_ptr`	`ndarray`	The array to store gene data pointers.	required
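`strict`	`bool`	If True, extract exactly the columns specified in `obs_cols`. If False, extract only those columns of `obs_cols` that are present in the file's `obs` dataframe.	`False`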

Returns:

Type	Description
`List[DataFrame]`	The features extracted from the data. An empty list is returned if the file is not present in `metadata` or cannot be read.

Source code in bionemo/geneformer/scripts/sc_memmap.py

def write_data(
    file_path: Path,
    obs_cols: list,
    metadata: Dict[str, Dict[str, object]],
    gene_data: np.ndarray,
    gene_data_indices: np.ndarray,
    gene_data_ptr: np.ndarray,
    strict: bool = False,
) -> List[pd.DataFrame]:
    """Writes `AnnData` into memmap.

    The sparse expression matrix of the file is copied into the three shared output
    arrays at the offsets recorded for this file in `metadata`, the arrays are flushed,
    and the requested `obs` columns are returned.

    Args:
        file_path (PosixPath): The path to the file.
        obs_cols (List[str]): A list of columns to extract from each AnnData `obs` dataframe.
        metadata (Dict[str, Dict[str, object]]): A dictionary keyed by `str(file_path)`.
            The entry for this file must provide ``"num_el"``, ``"running_el"``,
            ``"shape"`` and ``"cur_count"``.
        gene_data (np.ndarray): The array to store gene data. `.flush()` is called on it,
            so it is expected to be a memmap.
        gene_data_indices (np.ndarray): The array to store gene data indices (also flushed).
        gene_data_ptr (np.ndarray): The array to store gene data pointers (also flushed).
        strict (bool): If True, select exactly `obs_cols` from `obs`. If False, select
            only the columns of `obs_cols` that exist in `obs`, in `obs_cols` order.

    Returns:
        The `obs` dataframe restricted to the selected columns. An empty list is returned
        instead when the file has no entry in `metadata` or cannot be read.
    """
    # - check if the file name exists in the metadata dictionary
    if str(file_path) not in metadata:
        return []

    # Get the metadata for the file
    meta = metadata[str(file_path)]
    num_el = meta["num_el"]  # number of stored elements to copy for this file
    running_el = meta["running_el"]  # offset of this file inside gene_data / gene_data_indices
    num_obs = meta["shape"][0]  # number of rows of this file
    cur_count = meta["cur_count"]  # offset of this file inside gene_data_ptr

    try:
        # - read the data from the file using scanpy
        data = scanpy.read_h5ad(file_path)
    except Exception:
        # Best effort: an unreadable file is reported and skipped rather than aborting.
        print(f"couldn't read {file_path}")
        return []

    # - get the gene data from the data object
    X = data.X if data.raw is None else data.raw.X  # Use raw.X when raw is present, otherwise X

    # - store the gene data, indices, and pointers in the respective arrays
    gene_data[running_el : running_el + num_el] = X.data  # This is a flattened array with everything in it.
    gene_data_indices[running_el : running_el + num_el] = X.indices.astype(
        int
    )  # these are flattened column indices eg [0, 1, 2, 0, 1, 3] for a 2x4 sparse matrix
    # Row pointers are relative to this file, so shift them by the file's element offset.
    gene_data_ptr[cur_count : cur_count + num_obs + 1] = X.indptr.astype(int) + int(
        running_el
    )  # These are mappings between row indices and ranges. eg [0, 3, 6] for a 2x4 sparse matrix

    # - extract the features from the data
    if not strict:
        # Keep the caller's column order (de-duplicated) instead of the arbitrary order of
        # a set intersection, so every file yields its columns in the same sequence.
        available = set(data.obs.columns)
        new_obs_cols = [col for col in dict.fromkeys(obs_cols) if col in available]
        features = data.obs[new_obs_cols]
    else:
        features = data.obs[obs_cols]

    # - flush the data arrays to disk
    # try/finally guarantees the lock is released even if a flush raises.
    GLOBAL_LOCK.acquire()
    try:
        gene_data.flush()
        gene_data_ptr.flush()
        gene_data_indices.flush()
    finally:
        GLOBAL_LOCK.release()

    return features