From 9d4c3ccf31b2c84c3d16585d94eb1b2682bf6773 Mon Sep 17 00:00:00 2001
From: Agisilaos Kounelis
Date: Mon, 10 Nov 2025 09:48:43 -0500
Subject: [PATCH 1/2] Add label index support for aggregation

---
 tiledb/aggregation.py         |  6 +++
 tiledb/multirange_indexing.py | 71 +++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)

diff --git a/tiledb/aggregation.py b/tiledb/aggregation.py
index b84360bb2f..31e9b575fd 100644
--- a/tiledb/aggregation.py
+++ b/tiledb/aggregation.py
@@ -79,6 +79,12 @@ def __getitem__(self, selection):

         return result

+    def label_index(self, labels):
+        """Apply Array.label_index with query parameters."""
+        from .multirange_indexing import LabelAggregation
+
+        return LabelAggregation(self.query.array, tuple(labels), query=self)
+
     @property
     def multi_index(self):
         """Apply Array.multi_index with query parameters."""
diff --git a/tiledb/multirange_indexing.py b/tiledb/multirange_indexing.py
index 2ec8c5be21..537caf22dd 100644
--- a/tiledb/multirange_indexing.py
+++ b/tiledb/multirange_indexing.py
@@ -453,6 +453,77 @@ def _run_query(self) -> Dict[str, np.ndarray]:
         return result


+class LabelAggregation(MultiRangeAggregation):
+    """
+    Implements multi-range aggregation indexing by label.
+    """
+
+    def __init__(
+        self,
+        array: Array,
+        labels: Sequence[str],
+        query: Optional[AggregationProxy] = None,
+    ):
+        if array.schema.sparse:
+            raise NotImplementedError(
+                "querying sparse arrays by label is not yet implemented"
+            )
+        super().__init__(array, query)
+        self.label_query: Optional[Query] = None
+        self._labels: Dict[int, str] = {}
+        for label_name in labels:
+            dim_label = array.schema.dim_label(label_name)
+            dim_idx = dim_label.dim_index
+            if dim_idx in self._labels:
+                raise TileDBError(
+                    f"cannot set labels `{self._labels[dim_idx]}` and "
+                    f"`{label_name}` defined on the same dimension"
+                )
+            self._labels[dim_idx] = label_name
+
+    def _set_ranges(self, idx):
+        dim_ranges, label_ranges = getitem_ranges_with_labels(
+            self.array, self._labels, idx
+        )
+        if label_ranges is None:
+            with timing("add_ranges"):
+                self.subarray.add_ranges(tuple(dim_ranges))
+            # No label query.
+            self.label_query = None
+            # All ranges are finalized: set shape and subarray now.
+            self._set_shape(dim_ranges)
+            self.pyquery.set_subarray(self.subarray)
+        else:
+            label_subarray = Subarray(self.array)
+            with timing("add_ranges"):
+                self.subarray.add_ranges(dim_ranges=dim_ranges)
+                label_subarray.add_ranges(label_ranges=label_ranges)
+            self.label_query = Query(self.array)
+            self.label_query.set_subarray(label_subarray)
+
+    def _run_query(self) -> Dict[str, np.ndarray]:
+        # If querying by label and the label query is not yet complete, run the label
+        # query and update the pyquery with the actual dimensions.
+        if self.label_query is not None and not self.label_query.is_complete():
+            self.label_query._submit()
+
+            if not self.label_query.is_complete():
+                raise TileDBError("failed to get dimension ranges from labels")
+            label_subarray = self.label_query.subarray()
+            # Check that the label query returned results for all dimensions.
+            if any(
+                label_subarray.num_dim_ranges(dim_idx) == 0 for dim_idx in self._labels
+            ):
+                self.pyquery = None
+            else:
+                # Copy the ranges resolved by the label query into the main subarray.
+                self.subarray.copy_ranges(
+                    self.label_query.subarray(), self._labels.keys()
+                )
+                self.pyquery.set_subarray(self.subarray)
+        return super()._run_query()
+
+
 class DataFrameIndexer(_BaseIndexer):
     """
     Implements `.df[]` indexing to directly return a dataframe

From 6d95d4d01882be4d98a4ba4778057136b71a3c3f Mon Sep 17 00:00:00 2001
From: Agisilaos Kounelis
Date: Mon, 10 Nov 2025 09:48:45 -0500
Subject: [PATCH 2/2] Add test

---
 tiledb/tests/test_dimension_label.py | 74 ++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/tiledb/tests/test_dimension_label.py b/tiledb/tests/test_dimension_label.py
index d553c43876..379091b15a 100644
--- a/tiledb/tests/test_dimension_label.py
+++ b/tiledb/tests/test_dimension_label.py
@@ -483,3 +483,77 @@ def test_dimension_label_on_query(self):
                     },
                 ),
             )
+
+    @pytest.mark.skipif(
+        tiledb.libtiledb.version() < (2, 15),
+        reason="dimension labels requires libtiledb version 2.15 or greater",
+    )
+    def test_dimension_label_on_aggregation(self):
+        uri = self.path("aggregation_label_index")
+
+        dim1 = tiledb.Dim("d1", domain=(0, 3), dtype=np.int32)
+        dim2 = tiledb.Dim("d2", domain=(0, 2), dtype=np.int32)
+        dom = tiledb.Domain(dim1, dim2)
+        att = tiledb.Attr("a1", dtype=np.int64)
+        dim_labels = {
+            0: {"l1": dim1.create_label_schema("increasing", np.int64)},
+            1: {"l2": dim2.create_label_schema("increasing", np.float64)},
+        }
+        schema = tiledb.ArraySchema(domain=dom, attrs=(att,), dim_labels=dim_labels)
+        tiledb.Array.create(uri, schema)
+
+        # Create data: [[10, 20, 30], [40, 50, 60], [70, 80, 90], [100, 110, 120]]
+        a1_data = np.reshape(np.arange(10, 130, 10), (4, 3))
+        l1_data = np.array([100, 200, 300, 400], dtype=np.int64)
+        l2_data = np.array([1.0, 2.0, 3.0], dtype=np.float64)
+
+        with tiledb.open(uri, "w") as A:
+            A[:] = {"a1": a1_data, "l1": l1_data, "l2": l2_data}
+
+        with tiledb.open(uri, "r") as A:
+            # Test sum aggregation with single dimension label
+            q = A.query(attrs="", dims=["d1"])
+            result = q.agg("sum").label_index(["l1"])[200:300]
+            # Sum of rows 1 and 2: [40, 50, 60] + [70, 80, 90] = 390
+            assert result == 390
+
+            # Test count aggregation
+            result = q.agg("count").label_index(["l1"])[100:400]
+            # All 4 rows, 3 columns each = 12 elements
+            assert result == 12
+
+            # Test mean aggregation
+            result = q.agg("mean").label_index(["l1"])[200:300]
+            # Mean of [40, 50, 60, 70, 80, 90] = 65.0
+            assert result == 65.0
+
+            # Test min aggregation
+            result = q.agg("min").label_index(["l1"])[200:300]
+            # Min of [40, 50, 60, 70, 80, 90] = 40
+            assert result == 40
+
+            # Test max aggregation
+            result = q.agg("max").label_index(["l1"])[200:300]
+            # Max of [40, 50, 60, 70, 80, 90] = 90
+            assert result == 90
+
+            # Test with second dimension label (floating point)
+            result = q.agg("sum").label_index(["l2"])[:, 2.0:3.0]
+            # Sum of columns 1 and 2: [20, 50, 80, 110] + [30, 60, 90, 120] = 560
+            assert result == 560
+
+            # Test with multiple dimension labels
+            result = q.agg("sum").label_index(["l1", "l2"])[200:300, 1.0:2.0]
+            # Sum of rows 1-2, columns 0-1: [40, 50, 70, 80] = 240
+            assert result == 240
+
+            # Test single point selection
+            result = q.agg("sum").label_index(["l1"])[200:200]
+            # Sum of row 1: [40, 50, 60] = 150
+            assert result == 150
+
+            # Test with multiple aggregations
+            result = q.agg(["sum", "mean"]).label_index(["l1"])[100:200]
+            # Rows 0-1: [10, 20, 30, 40, 50, 60]
+            assert result["sum"] == 210
+            assert result["mean"] == 35.0