56 changes: 38 additions & 18 deletions bigframes/display/anywidget.py
@@ -23,6 +23,7 @@
import pandas as pd

import bigframes
import bigframes.core.blocks
import bigframes.display.html

# anywidget and traitlets are optional dependencies. We don't want the import of this
@@ -45,8 +46,10 @@


class TableWidget(WIDGET_BASE):
"""
An interactive, paginated table widget for BigFrames DataFrames.
"""An interactive, paginated table widget for BigFrames DataFrames.

This widget provides a user-friendly way to display and navigate through
large BigQuery DataFrames within a Jupyter environment.
"""

def __init__(self, dataframe: bigframes.dataframe.DataFrame):
@@ -60,32 +63,37 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
"Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use TableWidget."
)

super().__init__()
self._dataframe = dataframe
self._initializing = True
super().__init__()

# Initialize attributes that might be needed by observers FIRST
# Initialize attributes that might be needed by observers first
self._table_id = str(uuid.uuid4())
self._all_data_loaded = False
self._batch_iter: Optional[Iterator[pd.DataFrame]] = None
self._cached_batches: List[pd.DataFrame] = []

# respect display options for initial page size
# Respect display options for initial page size
initial_page_size = bigframes.options.display.max_rows

# Initialize data fetching attributes.
self._batches = dataframe.to_pandas_batches(page_size=initial_page_size)
execute_result = dataframe._block.session._executor.execute(
dataframe._block.expr,
ordered=True,
use_explicit_destination=True,
Collaborator review comment: I don't think we want to use an explicit destination here. This would result in creating a BigQuery job every time, which is not desirable. We want to keep the faster, job-optional code paths available.

)

# set traitlets properties that trigger observers
self.page_size = initial_page_size
# The query issued by `to_pandas_batches()` already contains metadata
# about how many results there were. Use that to avoid doing an extra
# COUNT(*) query that `len(...)` would do.
self.row_count = execute_result.total_rows or 0
Collaborator review comment: Why did you switch it to use the execute result instead of the PandasBatches object returned by to_pandas_batches()? (A sketch of that alternative follows the __init__ body below.)

# len(dataframe) is expensive, since it will trigger a
# SELECT COUNT(*) query. It is a must have however.
# TODO(b/428238610): Start iterating over the result of `to_pandas_batches()`
# before we get here so that the count might already be cached.
self.row_count = len(dataframe)
# Create pandas batches from the ExecuteResult
self._batches = execute_result.to_pandas_batches(page_size=initial_page_size)

self.page_size = initial_page_size

# get the initial page
self._set_table_html()
self._initializing = False
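
Both review comments above point at the same alternative: the PandasBatches iterator returned by to_pandas_batches() already carries the row-count metadata from the query, so the widget could avoid both the explicit destination table and a separate COUNT(*). A minimal sketch of that approach, assuming the installed bigframes version exposes total_rows on the PandasBatches object:

    # Sketch only: total_rows on the PandasBatches object is an assumption
    # about the installed bigframes version.
    initial_page_size = bigframes.options.display.max_rows

    # No explicit destination is forced here, so the faster, job-optional
    # execution paths remain available.
    batches = dataframe.to_pandas_batches(page_size=initial_page_size)

    # The row count comes from metadata already attached to the batches,
    # avoiding the SELECT COUNT(*) query that len(dataframe) would issue.
    row_count = batches.total_rows or 0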

@functools.cached_property
def _esm(self):
@@ -167,8 +175,7 @@ def _get_next_batch(self) -> bool:
@property
def _batch_iterator(self) -> Iterator[pd.DataFrame]:
"""Lazily initializes and returns the batch iterator."""
if self._batch_iter is None:
self._batch_iter = iter(self._batches)
self._batch_iter = iter(self._batches)
return self._batch_iter

@property
@@ -180,7 +187,16 @@ def _cached_data(self) -> pd.DataFrame:

def _reset_batches_for_new_page_size(self):
"""Reset the batch iterator when page size changes."""
self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size)
# Execute with explicit destination for consistency with __init__
execute_result = self._dataframe._block.session._executor.execute(
self._dataframe._block.expr,
ordered=True,
use_explicit_destination=True,
)

# Create pandas batches from the ExecuteResult
self._batches = execute_result.to_pandas_batches(page_size=self.page_size)

self._cached_batches = []
self._batch_iter = None
self._all_data_loaded = False
Expand Down Expand Up @@ -210,11 +226,15 @@ def _set_table_html(self):
@traitlets.observe("page")
def _page_changed(self, _change: Dict[str, Any]):
"""Handler for when the page number is changed from the frontend."""
if self._initializing:
return
self._set_table_html()

@traitlets.observe("page_size")
def _page_size_changed(self, _change: Dict[str, Any]):
"""Handler for when the page size is changed from the frontend."""
if self._initializing:
return
# Reset the page to 0 when page size changes to avoid invalid page states
self.page = 0
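
For reference, a minimal usage sketch of the widget this file implements. The sample table is an illustrative public dataset; the TableWidget import path and constructor match the module above, and the anywidget extras must be installed (pip install 'bigframes[anywidget]'):

    import bigframes.pandas as bpd
    from bigframes.display.anywidget import TableWidget

    # Illustrative public table; any BigQuery DataFrames object works.
    df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")

    # Construct the widget directly and display it in a notebook cell.
    widget = TableWidget(df)
    widget  # paginated table; widget.row_count and widget.page_size drive paging

Alternatively, if the installed version supports it, setting bigframes.options.display.repr_mode to "anywidget" renders DataFrame reprs through this widget automatically.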

74 changes: 45 additions & 29 deletions notebooks/dataframes/anywidget_mode.ipynb
@@ -73,18 +73,6 @@
"id": "f289d250",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Query job a643d120-4af9-44fc-ba3c-ed461cf1092b is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:a643d120-4af9-44fc-ba3c-ed461cf1092b&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
@@ -139,15 +127,27 @@
"id": "ce250157",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Query job 1171b7b3-3f65-4165-a69d-69dad5a100d1 is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:1171b7b3-3f65-4165-a69d-69dad5a100d1&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d2d4ef22ea9f414b89ea5bd85f0e6635",
"model_id": "6b70bf0e30a04a3cab11e03b2ed80856",
"version_major": 2,
"version_minor": 1
},
"text/plain": [
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"table table-striped table-ho…"
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
]
},
"metadata": {},
Expand Down Expand Up @@ -183,6 +183,18 @@
"id": "6920d49b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Query job 3100859b-c57c-42fe-a5fb-abb4f2f25db2 is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:3100859b-c57c-42fe-a5fb-abb4f2f25db2&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
@@ -193,12 +205,12 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "121e3d2f28004036a922e3a11a08d4b7",
"model_id": "4714b0794f55435a8d3e136517158a5c",
"version_major": 2,
"version_minor": 1
},
"text/plain": [
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"table table-striped table-ho…"
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
]
},
"execution_count": 7,
@@ -273,10 +285,22 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/array_value.py:230: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n",
"/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:230: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n",
" warnings.warn(msg, bfe.AmbiguousWindowWarning)\n"
]
},
{
"data": {
"text/html": [
"Query job b4143f15-4bac-44a5-bb29-c5056f95b30b is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:b4143f15-4bac-44a5-bb29-c5056f95b30b&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
@@ -287,12 +311,12 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5ed335bbbc064e5391ea06a9a218642e",
"model_id": "c70b5611db6b4e6a806a16d0a8287cd3",
"version_major": 2,
"version_minor": 1
},
"text/plain": [
"TableWidget(page_size=10, row_count=5, table_html='<table border=\"1\" class=\"table table-striped table-hover\" i…"
"TableWidget(page_size=10, row_count=5, table_html='<table border=\"1\" class=\"dataframe table table-striped tabl…"
]
},
"execution_count": 9,
@@ -307,19 +331,11 @@
"print(f\"Small dataset pages: {math.ceil(small_widget.row_count / small_widget.page_size)}\")\n",
"small_widget"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4e5836b-c872-4a9c-b9ec-14f6f338176d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "venv",
"language": "python",
"name": "python3"
},
@@ -333,7 +349,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
"version": "3.10.15"
}
},
"nbformat": 4,
4 changes: 2 additions & 2 deletions notebooks/dataframes/dataframe.ipynb
@@ -5366,7 +5366,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "venv",
"language": "python",
"name": "python3"
},
@@ -5380,7 +5380,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
"version": "3.10.15"
}
},
"nbformat": 4,
26 changes: 21 additions & 5 deletions tests/benchmark/read_gbq_colab/aggregate_output.py
Expand Up @@ -26,8 +26,14 @@ def aggregate_output(*, project_id, dataset_id, table_id):
df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")

# Simulate getting the first page, since we'll always do that first in the UI.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
execute_result = df._block.session._executor.execute(
df._block.expr,
ordered=True,
use_explicit_destination=True,
)
assert execute_result.total_rows is not None and execute_result.total_rows >= 0
batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
next(iter(batches))

# To simulate very small rows that can only fit a boolean,
# some tables don't have an integer column. If an integer column is available,
@@ -42,9 +48,19 @@
.groupby("rounded")
.sum(numeric_only=True)
)

df_aggregated.shape
next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)))
execute_result_aggregated = df_aggregated._block.session._executor.execute(
df_aggregated._block.expr,
ordered=True,
use_explicit_destination=True,
)
assert (
execute_result_aggregated.total_rows is not None
and execute_result_aggregated.total_rows >= 0
)
batches_aggregated = execute_result_aggregated.to_pandas_batches(
page_size=PAGE_SIZE
)
next(iter(batches_aggregated))


if __name__ == "__main__":
24 changes: 20 additions & 4 deletions tests/benchmark/read_gbq_colab/filter_output.py
@@ -31,16 +31,32 @@ def filter_output(
df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")

# Simulate getting the first page, since we'll always do that first in the UI.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
# Force BigQuery execution to get total_rows metadata
execute_result = df._block.session._executor.execute(
df._block.expr,
ordered=True,
use_explicit_destination=True,
)
batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
next(iter(batches))

# Simulate the user filtering by a column and visualizing those results
df_filtered = df[df["col_bool_0"]]
rows, _ = df_filtered.shape
# Force BigQuery execution for filtered DataFrame to get total_rows metadata
execute_result_filtered = df_filtered._block.session._executor.execute(
df_filtered._block.expr,
ordered=True,
use_explicit_destination=True,
)

rows = execute_result_filtered.total_rows or 0
assert rows >= 0

batches_filtered = execute_result_filtered.to_pandas_batches(page_size=PAGE_SIZE)

# It's possible we don't have any pages at all, since we filtered out all
# matching rows.
first_page = next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
first_page = next(iter(batches_filtered))
assert len(first_page.index) <= rows


11 changes: 9 additions & 2 deletions tests/benchmark/read_gbq_colab/first_page.py
@@ -28,8 +28,15 @@ def first_page(*, project_id, dataset_id, table_id):
)

# Get number of rows (to calculate number of pages) and the first page.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
execute_result = df._block.session._executor.execute(
df._block.expr,
ordered=True,
use_explicit_destination=True,
)
assert execute_result.total_rows is not None and execute_result.total_rows >= 0
batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
first_page = next(iter(batches))
assert first_page is not None


if __name__ == "__main__":
4 changes: 2 additions & 2 deletions tests/benchmark/read_gbq_colab/last_page.py
@@ -28,8 +28,8 @@ def last_page(*, project_id, dataset_id, table_id):
)

# Get number of rows (to calculate number of pages) and then all pages.
df.shape
for _ in df.to_pandas_batches(page_size=PAGE_SIZE):
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
for _ in batches:
pass

