Skip to content

Sc memmap

create_metadata(file_path, shared_dict)

Extract a series of metadata values from AnnData required to process all files into memmaps.

Note: it assumes var.feature_id contains the gene symbols for each dataset and that they appear in the same order as the data.X columns.

Parameters:

Name Type Description Default
file_path PosixPath

Path to AnnData stored as *.h5ad.

required
shared_dict Dict[str, Dict[str, object]]

Dictionary to store the extracted metadata.

required

Returns:

Name Type Description
None None

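Nothing is returned; the extracted metadata is stored in shared_dict under the key str(file_path). The function returns early without storing anything if the data object is None. A file that cannot be read raises ValueError.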

Source code in bionemo/geneformer/scripts/sc_memmap.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def create_metadata(file_path: Path, shared_dict: Dict[str, Dict[str, object]]) -> None:
    """Collect the per-file metadata needed to convert an `AnnData` file into memmaps.

    The record stored for the file holds the `AnnData` shape, the values of
    `var.feature_id`, the number of non-zero entries of the expression matrix and
    the file path as a string.

    Note: it assumes `var.feature_id` contains the gene symbols for each dataset and
    that they appear in the same order as the `data.X` columns.

    Args:
        file_path (PosixPath):
            Path to `AnnData` stored as *.h5ad.
        shared_dict (Dict[str, Dict[str, object]]):
            Dictionary that receives the metadata, keyed by `str(file_path)`.

    Returns:
        None:
            The result is written into `shared_dict`. Nothing is written if the
            loaded `data` object is None.

    Raises:
        ValueError: If the file cannot be read.
    """
    try:
        adata = scanpy.read_h5ad(file_path)
    except Exception as e:
        raise ValueError(f"Could not read {file_path}") from e

    if adata is None:
        return

    dims = adata.shape
    gene_ids = list(adata.var.feature_id)

    # Prefer the raw matrix when the file carries one.
    # NOTE(review): shape and feature ids are taken from `adata` itself, not from
    # `adata.raw` -- confirm both always describe the same columns.
    matrix = adata.X if adata.raw is None else adata.raw.X

    key = str(file_path)
    # - metadata associated with each file
    shared_dict[key] = {
        "shape": dims,
        "feature_ids": gene_ids,
        # Total number of non-zero elements in the sparse array.
        "num_el": matrix.count_nonzero(),
        "file_path": key,
    }

find_ann_data_files(data_path)

Find all AnnData files with the extension '.h5ad' in the given data path and its subdirectories.

Parameters:

Name Type Description Default
data_path Path

The path to the directory containing the AnnData files.

required

Returns:

Type Description
List[Path]

List[Path]: A sorted list of file paths to the AnnData files.

Source code in bionemo/geneformer/scripts/sc_memmap.py
163
164
165
166
167
168
169
170
171
172
def find_ann_data_files(data_path: Path) -> List[Path]:
    """Recursively collect every AnnData file ('.h5ad' extension) below `data_path`.

    Args:
        data_path (Path): The directory to search; subdirectories are included.

    Returns:
        List[Path]: The paths of the matching files, in sorted order.
    """
    h5ad_files = list(data_path.rglob("*.h5ad"))
    h5ad_files.sort()
    return h5ad_files

write_data(file_path, obs_cols, metadata, gene_data, gene_data_indices, gene_data_ptr, strict=False)

Writes AnnData into memmap.

Parameters:

Name Type Description Default
file_path PosixPath

The path to the file.

required
obs_cols List[str]

A list of columns to extract from each AnnData obs dataframe.

required
metadata Dict[str, Dict[str, object]]

A dictionary containing metadata information on number of elements, shape, and feature names.

required
gene_data ndarray

The array to store gene data.

required
gene_data_indices ndarray

The array to store gene data indices.

required
gene_data_ptr ndarray

The array to store gene data pointers.

required
strict bool

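If True, index obs with exactly the columns in obs_cols (every listed column must be present). If False, extract only those columns of obs_cols that actually exist in the file.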

False

Returns:

Type Description
List[DataFrame]

The obs features extracted from the data, as a single pd.DataFrame; an empty list is returned instead if the file is not in metadata or cannot be read.

Source code in bionemo/geneformer/scripts/sc_memmap.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
def write_data(
    file_path: Path,
    obs_cols: list,
    metadata: Dict[str, Dict[str, object]],
    gene_data: np.ndarray,
    gene_data_indices: np.ndarray,
    gene_data_ptr: np.ndarray,
    strict: bool = False,
) -> List[pd.DataFrame]:
    """Writes `AnnData` into memmap.

    The sparse expression matrix of the file (`raw.X` when present, otherwise `X`)
    is copied into the shared `gene_data`, `gene_data_indices` and `gene_data_ptr`
    arrays at the offsets recorded for this file in `metadata`, and the arrays are
    then flushed while holding `GLOBAL_LOCK`.

    Args:
        file_path (PosixPath): The path to the file.
        obs_cols (List[str]): A list of columns to extract from each AnnData `obs` dataframe.
        metadata (Dict[str, Dict[str, object]]): A dictionary keyed by `str(file_path)`. The
            entry for this file must provide `num_el`, `running_el`, `shape` and `cur_count`.
        gene_data (np.ndarray): The array to store gene data. Must support `.flush()`.
        gene_data_indices (np.ndarray): The array to store gene data indices. Must support `.flush()`.
        gene_data_ptr (np.ndarray): The array to store gene data pointers. Must support `.flush()`.
        strict (bool): If True, index `obs` with exactly `obs_cols` (a missing column raises).
            If False, extract only the columns of `obs_cols` that exist in the file, in
            `obs_cols` order.

    Returns:
        The `obs` features extracted from the data as a single `pd.DataFrame`, or an
        empty list if the file is not present in `metadata` or cannot be read.
    """
    # - check if the file name exists in the metadata dictionary
    file_key = str(file_path)
    if file_key not in metadata:
        return []

    # Get the metadata for the file
    meta = metadata[file_key]
    num_el = meta["num_el"]
    running_el = meta["running_el"]
    num_obs = meta["shape"][0]
    cur_count = meta["cur_count"]

    try:
        # - read the data from the file using scanpy
        data = scanpy.read_h5ad(file_path)
    except Exception:
        # Best effort: report the unreadable file and skip it.
        print(f"couldn't read {file_path}")
        return []

    # - get the gene data from the data object
    X = data.X if data.raw is None else data.raw.X  # Use raw.X when raw is present, otherwise fall back to X

    # - store the gene data, indices, and pointers in the respective arrays
    # NOTE(review): the slice assignments require len(X.data) == num_el; assumes X is a
    # CSR-style sparse matrix exposing data / indices / indptr -- confirm.
    gene_data[running_el : running_el + num_el] = X.data  # This is a flattened array with everything in it.
    gene_data_indices[running_el : running_el + num_el] = X.indices.astype(
        int
    )  # these are flattened column indices eg [0, 1, 2, 0, 1, 3] for a 2x4 sparse matrix
    # Row pointers are shifted by running_el so they index into the shared arrays
    # rather than into this file's own data. eg [0, 3, 6] for a 2x4 sparse matrix
    gene_data_ptr[cur_count : cur_count + num_obs + 1] = X.indptr.astype(int) + int(running_el)

    # - extract the features from the data
    # TODO: in strict mode this raises if obs_cols names a column that obs does not have.
    if not strict:
        # Keep the caller's column order (de-duplicated) instead of the arbitrary
        # order a set intersection would produce.
        available = set(data.obs.columns.tolist())
        new_obs_cols = [col for col in dict.fromkeys(obs_cols) if col in available]
        features = data.obs[new_obs_cols]
    else:
        features = data.obs[obs_cols]

    # - flush the data arrays to disk; the context manager releases the lock even if a flush fails
    with GLOBAL_LOCK:
        gene_data.flush()
        gene_data_ptr.flush()
        gene_data_indices.flush()

    return features