Skip to content

Row feature index

RowFeatureIndex

Maintains a mapping between a row and its features.

This is a ragged dataset, where the number and dimension of features can be different at every row.

Attributes:

Name Type Description
_cumulative_sum_index array

Pointer that delineates which entries correspond to a given row.

_feature_arr List[DataFrame]

list of feature dataframes

_labels List[str]

list of labels

_version

The version of the dataset

Source code in bionemo/scdl/index/row_feature_index.py
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
class RowFeatureIndex:
    """Maintains a mapping between a row and its features.

    This is a ragged dataset, where the number and dimension of features
    can be different at every row.

    Attributes:
        _cumulative_sum_index: Pointer that delineates which entries
            correspond to a given row. For example, if the array is
            [-1, 200, 201], rows 0 to 199 correspond to _feature_arr[0] and
            row 200 corresponds to _feature_arr[1]. The leading -1 is a
            sentinel for "no rows yet" and is treated as 0 in arithmetic.
        _feature_arr: list of feature dataframes
        _labels: list of labels, one per entry of _feature_arr
        _version: The version of the dataset
    """

    def __init__(self) -> None:
        """Instantiates an empty index."""
        self._cumulative_sum_index: np.ndarray = np.array([-1])
        self._feature_arr: List[pd.DataFrame] = []
        self._version = importlib.metadata.version("bionemo.scdl")
        # Labels are optional, so entries may be None.
        self._labels: List[Optional[str]] = []

    def version(self) -> str:
        """Returns a version number.

        (following <major>.<minor>.<point> convention).
        """
        return self._version

    def __len__(self) -> int:
        """Returns the number of feature dataframes held by the index.

        Note that this is the number of appended blocks, not the number of
        rows; use number_of_rows() for the latter.
        """
        return len(self._feature_arr)

    def append_features(self, n_obs: int, features: pd.DataFrame, label: Optional[str] = None) -> None:
        """Updates the index with the given features.

        The dataframe is inserted into the feature array by adding a
        new span to the row lookup index.

        Args:
            n_obs (int): The number of rows (observations) that share these
                features.
            features (pd.DataFrame): Corresponding features.
            label (str): Optional label for the features.
        """
        # The initial sentinel (-1) counts as zero rows.
        csum = max(self._cumulative_sum_index[-1], 0)
        self._cumulative_sum_index = np.append(self._cumulative_sum_index, csum + n_obs)
        self._feature_arr.append(features)
        self._labels.append(label)

    def lookup(self, row: int, select_features: Optional[List[str]] = None) -> Tuple[pd.DataFrame, str]:
        """Find the features at a given row.

        It is assumed that the row is non-negative. _cumulative_sum_index
        contains pointers to which rows correspond to given dataframes. To
        obtain a specific row, we determine where it is located in
        _cumulative_sum_index and then look up that dataframe in _feature_arr.

        Args:
            row (int): The row in the feature index.
            select_features (List[str]): a list of features to select

        Returns:
            pd.DataFrame: dataframe of features in that row
            str: optional label for the row

        Raises:
            IndexError: If the input row is negative, if it is not smaller
                than the number of rows in the index, or if there are no
                entries in the index yet.
        """
        if row < 0:
            raise IndexError(f"Row index {row} is not valid. It must be non-negative.")
        if len(self._cumulative_sum_index) < 2:
            raise IndexError("There are no dataframes to lookup.")

        # Valid rows are 0 .. n_rows - 1, so row == n_rows is already out of range.
        n_rows = self._cumulative_sum_index[-1]
        if row >= n_rows:
            raise IndexError(
                f"Row index {row} is larger than number of rows in FeatureIndex ({n_rows})."
            )
        # Count the boundaries that are <= row (the sentinel always counts);
        # subtracting one gives the position of the block containing the row.
        d_id = int(np.count_nonzero(self._cumulative_sum_index <= row)) - 1

        # Retrieve the features for the identified block.
        features = self._feature_arr[d_id]

        # If specific features are to be selected, filter the features.
        if select_features is not None:
            features = features[select_features]

        return features, self._labels[d_id]

    def number_vars_at_row(self, row: int) -> int:
        """Return number of variables (length of the dataframe) in a given row.

        Args:
            row (int): The row in the feature index.

        Returns:
            The length of the features at the row
        """
        feats, _ = self.lookup(row=row)
        return len(feats)

    def column_dims(self) -> List[int]:
        """Return the length of every feature dataframe in the index.

        Returns:
            A list with one entry per stored dataframe, holding len() of
            that dataframe.
        """
        return [len(feats) for feats in self._feature_arr]

    def number_of_values(self) -> List[int]:
        """Get the total number of values in each block of rows.

        For each block, the number of rows in the block is multiplied by the
        length of the corresponding dataframe.

        Returns:
            A list containing the number of values in every block of rows,
            or [0] if the index is empty.
        """
        if len(self._feature_arr) == 0:
            return [0]
        # Rows per block: difference of consecutive boundaries, with the
        # leading sentinel (-1) treated as 0.
        rows = [
            self._cumulative_sum_index[i] - max(self._cumulative_sum_index[i - 1], 0)
            for i in range(1, len(self._cumulative_sum_index))
        ]

        vals = [n_rows * len(self._feature_arr[i]) for i, n_rows in enumerate(rows)]
        return vals

    def number_of_rows(self) -> int:
        """The number of rows in the index.

        Returns:
            An integer corresponding to the number of rows in the index
        """
        return int(max(self._cumulative_sum_index[-1], 0))

    def concat(self, other_row_index: RowFeatureIndex, fail_on_empty_index: bool = True) -> RowFeatureIndex:
        """Concatenates the other FeatureIndex to this one.

        Returns the new, updated index. Warning: modifies this index in-place.

        Args:
            other_row_index: another RowFeatureIndex
            fail_on_empty_index: A boolean flag that sets whether to raise an
                error if an empty row index is passed in.

        Returns:
            self, the RowFeatureIndex after the concatenations.

        Raises:
            TypeError if other_row_index is not a RowFeatureIndex
            ValueError if an empty RowFeatureIndex is passed and the function is
            set to fail in this case.
        """
        if not isinstance(other_row_index, self.__class__):
            raise TypeError("Error: trying to concatenate something that's not a RowFeatureIndex.")

        if fail_on_empty_index and not len(other_row_index._feature_arr) > 0:
            raise ValueError("Error: Cannot append empty FeatureIndex.")

        # Snapshot the other index so that concatenating an index with itself
        # does not iterate over the entries being appended.
        other_bounds = other_row_index._cumulative_sum_index
        other_labels = other_row_index._labels
        for i, feats in enumerate(list(other_row_index._feature_arr)):
            # append_features expects the number of rows in the block, not the
            # running total, so take the difference of consecutive boundaries
            # (the leading sentinel -1 counts as 0).
            n_obs = other_bounds[i + 1] - max(other_bounds[i], 0)
            self.append_features(n_obs, feats, other_labels[i])

        return self

    def save(self, datapath: str) -> None:
        """Saves the RowFeatureIndex to a given path.

        Each dataframe is written as dataframe_<zero-padded position>.parquet
        so that a lexicographic sort of the file names restores the order.

        Args:
            datapath: path to save the index
        """
        Path(datapath).mkdir(parents=True, exist_ok=True)
        num_digits = len(str(len(self._feature_arr)))

        for dataframe_index, dataframe in enumerate(self._feature_arr):
            dataframe_str_index = f"{dataframe_index:0{num_digits}d}"
            dataframe.to_parquet(f"{datapath}/dataframe_{dataframe_str_index}.parquet", index=False)
        np.save(Path(datapath) / "cumulative_sum_index.npy", self._cumulative_sum_index)
        np.save(Path(datapath) / "labels.npy", self._labels)
        np.save(Path(datapath) / "version.npy", np.array(self._version))

    @staticmethod
    def load(datapath: str) -> RowFeatureIndex:
        """Loads the data from datapath.

        Args:
            datapath: the path to load from
        Returns:
            An instance of RowFeatureIndex
        """
        new_row_feat_index = RowFeatureIndex()
        parquet_data_paths = sorted(Path(datapath).rglob("*.parquet"))
        new_row_feat_index._feature_arr = [pd.read_parquet(parquet_path) for parquet_path in parquet_data_paths]
        new_row_feat_index._cumulative_sum_index = np.load(Path(datapath) / "cumulative_sum_index.npy")
        # NOTE: allow_pickle=True is needed when labels contain None (object
        # array), but unpickling can execute arbitrary code; only load indexes
        # from trusted locations.
        # Convert back to a list so append_features/concat keep working on a
        # loaded index (an ndarray has no .append).
        new_row_feat_index._labels = np.load(Path(datapath) / "labels.npy", allow_pickle=True).tolist()
        new_row_feat_index._version = np.load(Path(datapath) / "version.npy").item()
        return new_row_feat_index

__init__()

Instantiates the index.

Source code in bionemo/scdl/index/row_feature_index.py
45
46
47
48
49
50
def __init__(self) -> None:
    """Instantiates the index with no feature dataframes."""
    # Boundary array; starts with a single -1 entry and grows by one entry
    # per appended block of rows.
    self._cumulative_sum_index: np.ndarray = np.array([-1])
    # One feature dataframe per appended block.
    self._feature_arr: List[pd.DataFrame] = []
    # Version string of the installed "bionemo.scdl" distribution.
    self._version = importlib.metadata.version("bionemo.scdl")
    # One label per appended block.
    self._labels: List[str] = []

__len__()

The length is the number of feature dataframes held by the RowFeatureIndex (one per appended block of rows), not the number of rows.

Source code in bionemo/scdl/index/row_feature_index.py
59
60
61
def __len__(self) -> int:
    """Returns the number of feature dataframes stored in the index.

    This counts the entries of _feature_arr, not the number of rows.
    """
    return len(self._feature_arr)

append_features(n_obs, features, label=None)

Updates the index with the given features.

The dataframe is inserted into the feature array by adding a new span to the row lookup index.

Parameters:

Name Type Description Default
n_obs int

The number of times that these features occur in the class.

required
features DataFrame

Corresponding features.

required
label str

Label for the features.

None
Source code in bionemo/scdl/index/row_feature_index.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def append_features(self, n_obs: int, features: pd.DataFrame, label: Optional[str] = None) -> None:
    """Registers a new block of rows that all share one feature dataframe.

    A new boundary is appended to the cumulative row index and the
    dataframe and its label are stored alongside it.

    Args:
        n_obs (int): Number of rows covered by this block.
        features (pd.DataFrame): Features shared by those rows.
        label (str): Optional label stored with the features.
    """
    previous_total = self._cumulative_sum_index[-1]
    # A negative last entry (the initial -1) means no rows were added yet.
    start = previous_total if previous_total >= 0 else 0
    new_boundary = start + n_obs
    self._cumulative_sum_index = np.append(self._cumulative_sum_index, new_boundary)

    self._feature_arr.append(features)
    self._labels.append(label)

column_dims()

Return the number of columns in all rows.

Returns:

Type Description
List[int]

A list containing the lengths of the features in every row

Source code in bionemo/scdl/index/row_feature_index.py
137
138
139
140
141
142
143
144
145
146
147
def column_dims(self) -> List[int]:
    """Return the length of each stored feature dataframe.

    Returns:
        A list with one entry per dataframe in the index, holding the
        length of that dataframe.
    """
    return list(map(len, self._feature_arr))

concat(other_row_index, fail_on_empty_index=True)

Concatenates the other FeatureIndex to this one.

Returns the new, updated index. Warning: modifies this index in-place.

Parameters:

Name Type Description Default
other_row_index RowFeatureIndex

another RowFeatureIndex

required
fail_on_empty_index bool

A boolean flag that sets whether to raise an error if an empty row index is passed in.

True

Returns:

Type Description
RowFeatureIndex

self, the RowFeatureIndex after the concatenations.

Source code in bionemo/scdl/index/row_feature_index.py
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
def concat(self, other_row_index: RowFeatureIndex, fail_on_empty_index: bool = True) -> RowFeatureIndex:
    """Concatenates the other FeatureIndex to this one.

    Returns the new, updated index. Warning: modifies this index in-place.

    Args:
        other_row_index: another RowFeatureIndex
        fail_on_empty_index: A boolean flag that sets whether to raise an
            error if an empty row index is passed in.

    Returns:
        self, the RowFeatureIndex after the concatenations.

    Raises:
        TypeError if other_row_index is not a RowFeatureIndex
        ValueError if an empty RowFeatureIndex is passed and the function is
        set to fail in this case.
    """
    if not isinstance(other_row_index, self.__class__):
        raise TypeError("Error: trying to concatenate something that's not a RowFeatureIndex.")

    if fail_on_empty_index and not len(other_row_index._feature_arr) > 0:
        raise ValueError("Error: Cannot append empty FeatureIndex.")

    # Snapshot the other index so that concatenating an index with itself
    # does not iterate over the entries being appended.
    other_bounds = other_row_index._cumulative_sum_index
    other_labels = other_row_index._labels
    for i, feats in enumerate(list(other_row_index._feature_arr)):
        # append_features expects the number of rows in the block, not the
        # running total, so take the difference of consecutive boundaries
        # (the leading sentinel -1 counts as 0).
        n_obs = other_bounds[i + 1] - max(other_bounds[i], 0)
        self.append_features(n_obs, feats, other_labels[i])

    return self

load(datapath) staticmethod

Loads the data from datapath.

Parameters:

Name Type Description Default
datapath str

the path to load from

required

Returns: An instance of RowFeatureIndex

Source code in bionemo/scdl/index/row_feature_index.py
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
@staticmethod
def load(datapath: str) -> RowFeatureIndex:
    """Loads the data from datapath.

    Args:
        datapath: the path to load from
    Returns:
        An instance of RowFeatureIndex
    """
    root = Path(datapath)
    new_row_feat_index = RowFeatureIndex()
    # Sorting the file names fixes the order in which dataframes are restored.
    parquet_data_paths = sorted(root.rglob("*.parquet"))
    new_row_feat_index._feature_arr = [pd.read_parquet(parquet_path) for parquet_path in parquet_data_paths]
    new_row_feat_index._cumulative_sum_index = np.load(root / "cumulative_sum_index.npy")
    # NOTE: allow_pickle=True is needed when labels were saved as an object
    # array, but unpickling can execute arbitrary code; only load indexes
    # from trusted locations.
    # Convert back to a list so that list operations such as .append keep
    # working on a loaded index (an ndarray has no .append).
    new_row_feat_index._labels = np.load(root / "labels.npy", allow_pickle=True).tolist()
    new_row_feat_index._version = np.load(root / "version.npy").item()
    return new_row_feat_index

lookup(row, select_features=None)

Find the features at a given row.

It is assumed that the row is non-negative. _cumulative_sum_index contains pointers to which rows correspond to given dataframes. To obtain a specific row, we determine where it is located in _cumulative_sum_index and then look up that dataframe in _feature_arr.

Args: row (int): The row in the feature index. select_features (List[str]): a list of features to select.

Returns: pd.DataFrame: dataframe of features in that row; str: optional label for the row.

Raises: IndexError: An error occurred due to the input row being negative or exceeding the largest row in the index. It is also raised if there are no entries in the index yet.

Source code in bionemo/scdl/index/row_feature_index.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
def lookup(self, row: int, select_features: Optional[List[str]] = None) -> Tuple[pd.DataFrame, str]:
    """Find the features at a given row.

    It is assumed that the row is non-negative. _cumulative_sum_index
    contains pointers to which rows correspond to given dataframes. To
    obtain a specific row, we determine where it is located in
    _cumulative_sum_index and then look up that dataframe in _feature_arr.

    Args:
        row (int): The row in the feature index.
        select_features (List[str]): a list of features to select

    Returns:
        pd.DataFrame: dataframe of features in that row
        str: optional label for the row

    Raises:
        IndexError: If the input row is negative, if it is not smaller than
            the number of rows in the index, or if there are no entries in
            the index yet.
    """
    if row < 0:
        raise IndexError(f"Row index {row} is not valid. It must be non-negative.")
    if len(self._cumulative_sum_index) < 2:
        raise IndexError("There are no dataframes to lookup.")

    # Valid rows are 0 .. n_rows - 1, so row == n_rows is already out of range.
    n_rows = self._cumulative_sum_index[-1]
    if row >= n_rows:
        raise IndexError(
            f"Row index {row} is larger than number of rows in FeatureIndex ({n_rows})."
        )
    # Count the boundaries that are <= row (the leading -1 always counts);
    # subtracting one gives the position of the block containing the row.
    d_id = int(np.count_nonzero(self._cumulative_sum_index <= row)) - 1

    # Retrieve the features for the identified block.
    features = self._feature_arr[d_id]

    # If specific features are to be selected, filter the features.
    if select_features is not None:
        features = features[select_features]

    return features, self._labels[d_id]

number_of_rows()

The number of rows in the dataframe.

Returns:

Type Description
int

An integer corresponding to the number of rows in the index

Source code in bionemo/scdl/index/row_feature_index.py
167
168
169
170
171
172
173
def number_of_rows(self) -> int:
    """Report how many rows the index covers.

    Returns:
        The last cumulative boundary as a plain int, or 0 when that
        boundary is negative (nothing has been appended yet).
    """
    last_boundary = self._cumulative_sum_index[-1]
    if last_boundary < 0:
        return 0
    return int(last_boundary)

number_of_values()

Get the total number of values in the array.

For each row, the length of the corresponding dataframe is counted.

Returns:

Type Description
List[int]

A list containing the lengths of the features in every block of rows

Source code in bionemo/scdl/index/row_feature_index.py
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
def number_of_values(self) -> List[int]:
    """Count the values held by each block of rows.

    For every block, the number of rows in the block is multiplied by the
    length of the block's feature dataframe.

    Returns:
        One count per block of rows, or [0] when no dataframe is stored.
    """
    if len(self._feature_arr) == 0:
        return [0]

    boundaries = self._cumulative_sum_index
    totals = []
    for block, upper in enumerate(boundaries[1:]):
        # The previous boundary is clamped at 0 so the initial -1 entry
        # does not inflate the first block.
        lower = max(boundaries[block], 0)
        n_rows = upper - lower
        totals.append(n_rows * len(self._feature_arr[block]))
    return totals

number_vars_at_row(row)

Return number of variables (length of the dataframe) in a given row.

Parameters:

Name Type Description Default
row int

The row in the feature index.

required

Returns:

Type Description
int

The length of the features at the row

Source code in bionemo/scdl/index/row_feature_index.py
125
126
127
128
129
130
131
132
133
134
135
def number_vars_at_row(self, row: int) -> int:
    """Return the number of variables (length of the dataframe) for a row.

    Args:
        row (int): The row in the feature index.

    Returns:
        The length of the feature dataframe that lookup() returns for the row.
    """
    frame, _label = self.lookup(row=row)
    n_vars = len(frame)
    return n_vars

save(datapath)

Saves the RowFeatureIndex to a given path.

Parameters:

Name Type Description Default
datapath str

path to save the index

required
Source code in bionemo/scdl/index/row_feature_index.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
def save(self, datapath: str) -> None:
    """Write the index to a directory, creating the directory if needed.

    Every feature dataframe is stored as its own parquet file whose name
    carries the dataframe's zero-padded position; the cumulative index, the
    labels and the version are stored as .npy files next to them.

    Args:
        datapath: path to save the index
    """
    target_dir = Path(datapath)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Pad positions to the number of digits in the dataframe count.
    num_digits = len(str(len(self._feature_arr)))
    for position, dataframe in enumerate(self._feature_arr):
        dataframe_str_index = str(position).zfill(num_digits)
        dataframe.to_parquet(f"{datapath}/dataframe_{dataframe_str_index}.parquet", index=False)

    arrays_to_store = (
        ("cumulative_sum_index.npy", self._cumulative_sum_index),
        ("labels.npy", self._labels),
        ("version.npy", np.array(self._version)),
    )
    for file_name, payload in arrays_to_store:
        np.save(target_dir / file_name, payload)

version()

Returns a version number.

(following the `<major>.<minor>.<point>` convention).

Source code in bionemo/scdl/index/row_feature_index.py
52
53
54
55
56
57
def version(self) -> str:
    """Return the version string stored on the index.

    The value is expected to follow the <major>.<minor>.<point> convention.
    """
    stored_version = self._version
    return stored_version