diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml new file mode 100644 index 000000000..b308b54fb --- /dev/null +++ b/.github/workflows/pytest.yml @@ -0,0 +1,43 @@ +name: pytest + +on: [push] + +env: + PROJECT_ID: 'cal-itp-data-infra-staging' + WORKLOAD_IDENTITY_PROVIDER: 'projects/473674835135/locations/global/workloadIdentityPools/github-actions/providers/data-analyses' + SERVICE_ACCOUNT: 'github-actions-service-account@cal-itp-data-infra-staging.iam.gserviceaccount.com' + +jobs: + test: + name: Run tests + runs-on: ubuntu-latest + + permissions: + contents: read + id-token: write + + steps: + - name: Checkout + uses: actions/checkout@v5 + + - name: Authenticate Google Service Account + uses: google-github-actions/auth@v2 + with: + create_credentials_file: true + project_id: ${{ env.PROJECT_ID }} + workload_identity_provider: ${{ env.WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ env.SERVICE_ACCOUNT }} + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install share_utils dependencies + working-directory: _shared_utils/ + run: pip install -r requirements.txt + + - name: Run shared_utils tests + working-directory: _shared_utils/ + run: pytest tests diff --git a/_shared_utils/requirements.txt b/_shared_utils/requirements.txt index 2b706a34b..2d1978cb6 100644 --- a/_shared_utils/requirements.txt +++ b/_shared_utils/requirements.txt @@ -1,10 +1,17 @@ -e . 
altair-transform==0.2.0 +calitp-data-analysis==2025.8.10 great_tables==0.16.1 +intake==0.6.4 +numba (>=0.62.1, <0.63.0) +numpy (>=1.26.4, <2.0.0) omegaconf==2.3.0 # better yaml configuration polars==1.22.0 pytest (>=8.4.1, <9.0.0) -quarto-cli==1.6.40 +pytest-mock (>=3.15.1, <4.0.0) +pytest-recording (>=0.13.4,<0.14.0) +pytest-unordered (>=0.7.0,<0.8.0) quarto==0.1.0 +quarto-cli==1.6.40 vegafusion==2.0.2 vl-convert-python>=1.6.0 diff --git a/_shared_utils/shared_utils/__init__.py b/_shared_utils/shared_utils/__init__.py index a0de7ba01..7fd4bac4f 100644 --- a/_shared_utils/shared_utils/__init__.py +++ b/_shared_utils/shared_utils/__init__.py @@ -1,27 +1,32 @@ -from . import ( - arcgis_query, - catalog_utils, - dask_utils, - geo_utils, - gtfs_utils_v2, - portfolio_utils, - publish_utils, - rt_dates, - rt_utils, - schedule_rt_utils, - time_helpers, -) +import sys -__all__ = [ - "arcgis_query", - "catalog_utils", - "dask_utils", - "geo_utils", - "gtfs_utils_v2", - "portfolio_utils", - "publish_utils", - "rt_dates", - "rt_utils", - "schedule_rt_utils", - "time_helpers", -] +if hasattr(sys, "_called_from_test"): + pass +else: + from . 
import ( + arcgis_query, + catalog_utils, + dask_utils, + geo_utils, + gtfs_utils_v2, + portfolio_utils, + publish_utils, + rt_dates, + rt_utils, + schedule_rt_utils, + time_helpers, + ) + + __all__ = [ + "arcgis_query", + "catalog_utils", + "dask_utils", + "geo_utils", + "gtfs_utils_v2", + "portfolio_utils", + "publish_utils", + "rt_dates", + "rt_utils", + "schedule_rt_utils", + "time_helpers", + ] diff --git a/_shared_utils/shared_utils/catalog_utils.py b/_shared_utils/shared_utils/catalog_utils.py index 1d543df10..1213daf02 100644 --- a/_shared_utils/shared_utils/catalog_utils.py +++ b/_shared_utils/shared_utils/catalog_utils.py @@ -11,11 +11,19 @@ shared_utils_directory = "data-analyses/_shared_utils/shared_utils/" -def get_catalog_file(catalog_name): +def get_catalog_file(catalog_name, home_path=None, current_path=None): + """ + Walk up from current_path towards home_path looking for the catalog file. + home_path and current_path default to Path.home() and Path.cwd(); they are + resolved on every call because defaults written in the signature are + evaluated once at import time, which would freeze the working directory. + """ + home_path = Path.home() if home_path is None else home_path + current_path = Path.cwd() if current_path is None else current_path filename = f"{shared_utils_directory}{catalog_name}.yml" - parent_directory = Path.cwd() + parent_directory = current_path - if Path.home() not in Path.cwd().parents: + if home_path not in current_path.parents: raise RuntimeError("The data-analyses repo should be located in your home directory.") while True: @@ -24,7 +32,7 @@ def get_catalog_file(catalog_name): if test_path.is_file(): return test_path - if parent_directory == Path.home(): + if parent_directory == home_path: raise FileNotFoundError(f"No such catalog file found: {filename}") parent_directory = parent_directory.parent diff --git a/_shared_utils/shared_utils/gtfs_utils_v2.py b/_shared_utils/shared_utils/gtfs_utils_v2.py index e04696c76..3acdab58f 100644 --- a/_shared_utils/shared_utils/gtfs_utils_v2.py +++ b/_shared_utils/shared_utils/gtfs_utils_v2.py @@ -443,18 +443,6 @@ def hour_tuple_to_seconds(hour_tuple: tuple[int]) -> tuple[int]: return (start_sec, end_sec) -def filter_start_end_ts(time_filters: dict, time_col: Literal["arrival", "departure"]) -> siuba.dply.verbs.Pipeable: - """ - For arrival or departure, grab the hours to subset and - convert the (start_hour, 
end_hour) tuple into seconds, - and return the siuba filter - """ - desired_hour_tuple = time_filters[time_col] - (start_sec, end_sec) = hour_tuple_to_seconds(desired_hour_tuple) - - return filter(_[f"{time_col}_sec"] >= start_sec, _[f"{time_col}_sec"] <= end_sec) - - def get_stop_times( selected_date: Union[str, datetime.date], operator_feeds: list[str] = [], diff --git a/_shared_utils/shared_utils/models/bridge_organizations_x_headquarters_county_geography.py b/_shared_utils/shared_utils/models/bridge_organizations_x_headquarters_county_geography.py new file mode 100644 index 000000000..845bba279 --- /dev/null +++ b/_shared_utils/shared_utils/models/bridge_organizations_x_headquarters_county_geography.py @@ -0,0 +1,16 @@ +from sqlalchemy import Boolean, Column, DateTime, String +from sqlalchemy.orm import declarative_base + +Base = declarative_base() + + +class BridgeOrganizationsXHeadquartersCountyGeography(Base): + __tablename__ = "bridge_organizations_x_headquarters_county_geography" + + organization_key = Column(String, primary_key=True) + county_geography_key = Column(String) + organization_name = Column(String) + county_geography_name = Column(String) + _valid_from = Column(DateTime) + _valid_to = Column(DateTime) + _is_current = Column(Boolean) diff --git a/_shared_utils/shared_utils/models/dim_county_geography.py b/_shared_utils/shared_utils/models/dim_county_geography.py new file mode 100644 index 000000000..d963922a5 --- /dev/null +++ b/_shared_utils/shared_utils/models/dim_county_geography.py @@ -0,0 +1,22 @@ +from sqlalchemy import Boolean, Column, DateTime, Integer, String +from sqlalchemy.orm import declarative_base + +Base = declarative_base() + + +class DimCountyGeography(Base): + __tablename__ = "dim_county_geography" + + key = Column(String, primary_key=True) + source_record_id = Column(String) + name = Column(String) + fips = Column(Integer) + msa = Column(String) + caltrans_district = Column(Integer) + caltrans_district_name = Column(String) + 
place_geography = Column(String) + organization_key = Column(String) + service_key = Column(String) + _is_current = Column(Boolean) + _valid_from = Column(DateTime) + _valid_to = Column(DateTime) diff --git a/_shared_utils/shared_utils/models/dim_gtfs_dataset.py b/_shared_utils/shared_utils/models/dim_gtfs_dataset.py new file mode 100644 index 000000000..4f809091e --- /dev/null +++ b/_shared_utils/shared_utils/models/dim_gtfs_dataset.py @@ -0,0 +1,32 @@ +from sqlalchemy import Boolean, Column, Date, DateTime, String +from sqlalchemy.orm import declarative_base + +Base = declarative_base() + + +class DimGtfsDataset(Base): + __tablename__ = "dim_gtfs_datasets" + + key = Column(String, primary_key=True) + source_record_id = Column(String) + name = Column(String) + type = Column(String) + regional_feed_type = Column(String) + backdated_regional_feed_type = Column(String) + uri = Column(String) + future_uri = Column(String) + deprecated_date = Column(Date) + data_quality_pipeline = Column(Boolean) + manual_check__link_to_dataset_on_website = Column(String) + manual_check__accurate_shapes = Column(String) + manual_check__data_license = Column(String) + manual_check__authentication_acceptable = Column(String) + manual_check__stable_url = Column(String) + manual_check__localized_stop_tts = Column(String) + manual_check__grading_scheme_v1 = Column(String) + base64_url = Column(String) + private_dataset = Column(Boolean) + analysis_name = Column(String) + _is_current = Column(Boolean) + _valid_from = Column(DateTime) + _valid_to = Column(DateTime) diff --git a/_shared_utils/shared_utils/models/dim_organization.py b/_shared_utils/shared_utils/models/dim_organization.py new file mode 100644 index 000000000..fc29b331e --- /dev/null +++ b/_shared_utils/shared_utils/models/dim_organization.py @@ -0,0 +1,37 @@ +from sqlalchemy import Boolean, Column, DateTime, Integer, String +from sqlalchemy.orm import declarative_base + +Base = declarative_base() + + +class 
DimOrganization(Base): + __tablename__ = "dim_organizations" + + key = Column(String, primary_key=True) + source_record_id = Column(String) + name = Column(String) + organization_type = Column(String) + roles = Column(String) + itp_id = Column(Integer) + details = Column(String) + website = Column(String) + reporting_category = Column(String) + hubspot_company_record_id = Column(String) + gtfs_static_status = Column(String) + gtfs_realtime_status = Column(String) + _deprecated__assessment_status = Column(Boolean) + manual_check__contact_on_website = Column(String) + alias = Column(String) + is_public_entity = Column(Boolean) + ntd_id = Column(String) + ntd_agency_info_key = Column(String) + ntd_id_2022 = Column(String) + rtpa_key = Column(String) + rtpa_name = Column(String) + mpo_key = Column(String) + mpo_name = Column(String) + public_currently_operating = Column(Boolean) + public_currently_operating_fixed_route = Column(Boolean) + _is_current = Column(Boolean) + _valid_from = Column(DateTime) + _valid_to = Column(DateTime) diff --git a/_shared_utils/shared_utils/models/dim_provider_gtfs_data.py b/_shared_utils/shared_utils/models/dim_provider_gtfs_data.py new file mode 100644 index 000000000..161c05ad9 --- /dev/null +++ b/_shared_utils/shared_utils/models/dim_provider_gtfs_data.py @@ -0,0 +1,39 @@ +from sqlalchemy import Boolean, Column, DateTime, Integer, String +from sqlalchemy.orm import declarative_base + +Base = declarative_base() + + +class DimProviderGtfsData(Base): + __tablename__ = "dim_provider_gtfs_data" + + key = Column(String, primary_key=True) + public_customer_facing_fixed_route = Column(Boolean) + public_customer_facing_or_regional_subfeed_fixed_route = Column(Boolean) + organization_key = Column(String) + organization_name = Column(String) + organization_itp_id = Column(Integer) + organization_hubspot_company_record_id = Column(String) + organization_ntd_id = Column(String) + organization_source_record_id = Column(String) + service_key = 
Column(String) + service_name = Column(String) + service_source_record_id = Column(String) + gtfs_service_data_customer_facing = Column(Boolean) + regional_feed_type = Column(String) + associated_schedule_gtfs_dataset_key = Column(String) + schedule_gtfs_dataset_name = Column(String) + schedule_source_record_id = Column(String) + service_alerts_gtfs_dataset_name = Column(String) + service_alerts_source_record_id = Column(String) + vehicle_positions_gtfs_dataset_name = Column(String) + vehicle_positions_source_record_id = Column(String) + trip_updates_gtfs_dataset_name = Column(String) + trip_updates_source_record_id = Column(String) + schedule_gtfs_dataset_key = Column(String) + service_alerts_gtfs_dataset_key = Column(String) + vehicle_positions_gtfs_dataset_key = Column(String) + trip_updates_gtfs_dataset_key = Column(String) + _valid_from = Column(DateTime) + _valid_to = Column(DateTime) + _is_current = Column(Boolean) diff --git a/_shared_utils/shared_utils/models/fct_daily_feed_scheduled_service_summary.py b/_shared_utils/shared_utils/models/fct_daily_feed_scheduled_service_summary.py new file mode 100644 index 000000000..44e075081 --- /dev/null +++ b/_shared_utils/shared_utils/models/fct_daily_feed_scheduled_service_summary.py @@ -0,0 +1,21 @@ +from sqlalchemy import Boolean, Column, DateTime, Float, Integer, String +from sqlalchemy.orm import declarative_base + +Base = declarative_base() + + +class FctDailyFeedScheduledServiceSummary(Base): + __tablename__ = "fct_daily_feed_scheduled_service_summary" + + service_date = Column(DateTime, primary_key=True) + feed_key = Column(String, primary_key=True) + gtfs_dataset_key = Column(String, primary_key=True) + ttl_service_hours = Column(Float) + n_trips = Column(Integer) + first_departure_sec = Column(Integer) + last_arrival_sec = Column(Integer) + num_stop_times = Column(Integer) + n_routes = Column(Integer) + contains_warning_duplicate_stop_times_primary_key = Column(Boolean) + 
contains_warning_duplicate_trip_primary_key = Column(Boolean) + contains_warning_missing_foreign_key_stop_id = Column(Boolean) diff --git a/_shared_utils/shared_utils/schedule_rt_utils.py b/_shared_utils/shared_utils/schedule_rt_utils.py index c7f9061d6..cc2c97124 100644 --- a/_shared_utils/shared_utils/schedule_rt_utils.py +++ b/_shared_utils/shared_utils/schedule_rt_utils.py @@ -2,18 +2,45 @@ Functions to bridge GTFS schedule and RT. """ +import os from typing import Literal, Union import dask.dataframe as dd import dask_geopandas as dg import geopandas as gpd import pandas as pd -import siuba # for type hints -from calitp_data_analysis.tables import tbls -from shared_utils import gtfs_utils_v2 -from siuba import * +from shared_utils.models.bridge_organizations_x_headquarters_county_geography import ( + BridgeOrganizationsXHeadquartersCountyGeography, +) +from shared_utils.models.dim_county_geography import DimCountyGeography +from shared_utils.models.dim_gtfs_dataset import DimGtfsDataset +from shared_utils.models.dim_organization import DimOrganization +from shared_utils.models.dim_provider_gtfs_data import DimProviderGtfsData +from shared_utils.models.fct_daily_feed_scheduled_service_summary import ( + FctDailyFeedScheduledServiceSummary, +) +from sqlalchemy import String, create_engine, select +from sqlalchemy.orm import Session +from sqlalchemy.sql.expression import and_, cast, func PACIFIC_TIMEZONE = "US/Pacific" +CALITP_BQ_MAX_BYTES = os.environ.get("CALITP_BQ_MAX_BYTES", 5_000_000_000) +CALITP_BQ_LOCATION = os.environ.get("CALITP_BQ_LOCATION", "us-west2") + + +def _get_engine(max_bytes=None, project="cal-itp-data-infra", dataset=None): + # TODO: update calitp_data_analysis.sql.get_engine to accept dataset arg + max_bytes = CALITP_BQ_MAX_BYTES if max_bytes is None else max_bytes + + cred_path = os.environ.get("CALITP_SERVICE_KEY_PATH") + + # Note that we should be able to add location as a uri parameter, but + # it is not being picked up, so passing as a 
separate argument for now. + return create_engine( + f"bigquery://{project}/{dataset if dataset else ''}?maximum_bytes_billed={max_bytes}", # noqa: E231 + location=CALITP_BQ_LOCATION, + credentials_path=cred_path, + ) def localize_timestamp_col(df: dd.DataFrame, timestamp_col: Union[str, list]) -> dd.DataFrame: @@ -43,48 +70,32 @@ def localize_timestamp_col(df: dd.DataFrame, timestamp_col: Union[str, list]) -> return df -def get_rt_schedule_feeds_crosswalk( - date: str, keep_cols: list, get_df: bool = True, custom_filtering: dict = None -) -> Union[pd.DataFrame, siuba.sql.verbs.LazyTbl]: - """ - Get fct_daily_rt_feeds, which provides the schedule_feed_key - to use when merging with schedule data. - """ - fct_rt_feeds = tbls.mart_gtfs.fct_daily_rt_feed_files() >> filter(_.date == date) - - if get_df: - fct_rt_feeds = ( - fct_rt_feeds - >> collect() - >> gtfs_utils_v2.filter_custom_col(custom_filtering) - >> gtfs_utils_v2.subset_cols(keep_cols) - ) - - return fct_rt_feeds >> gtfs_utils_v2.subset_cols(keep_cols) - - -def get_schedule_gtfs_dataset_key(date: str, get_df: bool = True) -> Union[pd.DataFrame, siuba.sql.verbs.LazyTbl]: +def get_schedule_gtfs_dataset_key(date: str, get_df: bool = True, **kwargs) -> pd.DataFrame: """ Use fct_daily_feed_scheduled_service to find the gtfs_dataset_key that corresponds to the feed_key. 
""" - schedule_feed_to_rt_key = ( - tbls.mart_gtfs.fct_daily_feed_scheduled_service_summary() - >> filter(_.service_date == date) - >> select(_.gtfs_dataset_key, _.feed_key) + project = kwargs.get("project", "cal-itp-data-infra") + dataset = kwargs.get("dataset", "mart_gtfs") + + db_engine = _get_engine(project=project, dataset=dataset) + session = Session(db_engine) + statement = select(FctDailyFeedScheduledServiceSummary).where( + FctDailyFeedScheduledServiceSummary.service_date == date ) if get_df: - schedule_feed_to_rt_key = schedule_feed_to_rt_key >> collect() - - return schedule_feed_to_rt_key + return pd.read_sql(statement, session.bind) + else: + return session.scalars(statement) def filter_dim_gtfs_datasets( keep_cols: list[str] = ["key", "name", "type", "regional_feed_type", "uri", "base64_url"], custom_filtering: dict = None, get_df: bool = True, -) -> Union[pd.DataFrame, siuba.sql.verbs.LazyTbl]: + **kwargs, +) -> pd.DataFrame: """ Filter mart_transit_database.dim_gtfs_dataset table and keep only the valid rows that passed data quality checks. 
@@ -92,29 +103,40 @@ def filter_dim_gtfs_datasets( if "key" not in keep_cols: raise KeyError("Include key in keep_cols list") - dim_gtfs_datasets = ( - tbls.mart_transit_database.dim_gtfs_datasets() - >> filter(_.data_quality_pipeline == True) # if True, we can use - >> gtfs_utils_v2.filter_custom_col(custom_filtering) - >> gtfs_utils_v2.subset_cols(keep_cols) - ) + project = kwargs.get("project", "cal-itp-data-infra") + dataset = kwargs.get("dataset", "mart_transit_database") - # rename columns to match our convention - if "key" in keep_cols: - dim_gtfs_datasets = dim_gtfs_datasets >> rename(gtfs_dataset_key="key") - if "name" in keep_cols: - dim_gtfs_datasets = dim_gtfs_datasets >> rename(gtfs_dataset_name="name") + db_engine = _get_engine(project=project, dataset=dataset) + session = Session(db_engine) - if get_df: - dim_gtfs_datasets = dim_gtfs_datasets >> collect() + dim_gtfs_dataset_columns = [] + + for column in keep_cols: + new_column = getattr(DimGtfsDataset, column) + + if column in ["key", "name"]: + new_column = new_column.label(f"gtfs_dataset_{column}") + + dim_gtfs_dataset_columns.append(new_column) + + search_conditions = [DimGtfsDataset.data_quality_pipeline == True] - return dim_gtfs_datasets + for k, v in (custom_filtering or {}).items(): + search_conditions.append(getattr(DimGtfsDataset, k).in_(v)) + + statement = select(*dim_gtfs_dataset_columns).where(and_(*search_conditions)) + + if get_df: + return pd.read_sql(statement, session.bind) + else: + return session.scalars(statement) def get_organization_id( df: pd.DataFrame, date: str, merge_cols: list = [], + **kwargs, ) -> pd.DataFrame: """ Instead of using the GTFS dataset name (of the quartet), usually @@ -144,14 +166,23 @@ def get_organization_id( "trip_updates, or service_alerts." 
) else: - dim_provider_gtfs_data = tbls.mart_transit_database.dim_provider_gtfs_data() >> distinct() >> collect() - - dim_provider_gtfs_data = localize_timestamp_col(dim_provider_gtfs_data, ["_valid_from", "_valid_to"]) - - dim_provider_gtfs_data2 = dim_provider_gtfs_data >> filter( - _._valid_from_local <= pd.to_datetime(date), _._valid_to_local >= pd.to_datetime(date) + project = kwargs.get("project", "cal-itp-data-infra") + dataset = kwargs.get("dataset", "mart_transit_database") + + db_engine = _get_engine(project=project, dataset=dataset) + session = Session(db_engine) + + statement = ( + select(DimProviderGtfsData) + .distinct() + .where( # separate criteria are joined with AND + func.datetime(DimProviderGtfsData._valid_from, PACIFIC_TIMEZONE) <= func.datetime(date), + func.datetime(DimProviderGtfsData._valid_to, PACIFIC_TIMEZONE) >= func.datetime(date), + ) ) + dim_provider_gtfs_data = pd.read_sql(statement, session.bind) + sorting = [True for c in merge_cols] keep_cols = ["organization_source_record_id"] @@ -159,18 +190,19 @@ def get_organization_id( # but we should handle it by selectig a preferred # rather than alphabetical. # (organization names Foothill Transit and City of Duarte) - dim_provider_gtfs_data2 = dim_provider_gtfs_data2.sort_values( + dim_provider_gtfs_data = dim_provider_gtfs_data.sort_values( merge_cols + ["_valid_to", "_valid_from"], ascending=sorting + [False, False] ).reset_index(drop=True)[merge_cols + keep_cols] - df2 = pd.merge(df, dim_provider_gtfs_data2, on=merge_cols, how="inner") + df2 = pd.merge(df, dim_provider_gtfs_data, on=merge_cols, how="inner") return df2 def filter_dim_county_geography( date: str, - keep_cols: list[str] = ["caltrans_district"], + keep_cols: list[str] = [], + **kwargs, ) -> pd.DataFrame: """ Merge mart_transit_database.dim_county_geography with @@ -183,65 +215,85 @@ def filter_dim_county_geography( Use this merge to get caltrans_district. Organizations belong to county, and counties are assigned to districts. 
""" - bridge_orgs_county_geog = ( - tbls.mart_transit_database.bridge_organizations_x_headquarters_county_geography() - >> gtfs_utils_v2.subset_cols([_.organization_name, _.county_geography_key, _._valid_from, _._valid_to]) - >> collect() - ) - - keep_cols2 = list(set(keep_cols + ["county_geography_key", "caltrans_district_name"])) - - dim_county_geography = ( - tbls.mart_transit_database.dim_county_geography() - >> rename(county_geography_key=_.key) - >> gtfs_utils_v2.subset_cols(keep_cols2) - >> collect() - ) - - bridge_orgs_county_geog = localize_timestamp_col(bridge_orgs_county_geog, ["_valid_from", "_valid_to"]) + project = kwargs.get("project", "cal-itp-data-infra") + dataset = kwargs.get("dataset", "mart_transit_database") + + db_engine = _get_engine(project=project, dataset=dataset) + session = Session(db_engine) + + columns = [ + BridgeOrganizationsXHeadquartersCountyGeography.organization_name, + func.concat( + func.lpad(cast(DimCountyGeography.caltrans_district, String), 2, "0"), + " - ", + DimCountyGeography.caltrans_district_name, + ).label("caltrans_district"), + ] + + for column in keep_cols: + attribute = ( + getattr(BridgeOrganizationsXHeadquartersCountyGeography, column) + if hasattr(BridgeOrganizationsXHeadquartersCountyGeography, column) + else getattr(DimCountyGeography, column) + ) + columns.append(attribute) - bridge_orgs_county_geog2 = bridge_orgs_county_geog >> filter( - _._valid_from_local <= pd.to_datetime(date), _._valid_to_local >= pd.to_datetime(date) + statement = ( + select(*columns) + .join( + DimCountyGeography, + DimCountyGeography.key == BridgeOrganizationsXHeadquartersCountyGeography.county_geography_key, + ) + .where( + and_( + func.datetime(BridgeOrganizationsXHeadquartersCountyGeography._valid_from, PACIFIC_TIMEZONE) + <= func.datetime(date), + func.datetime(BridgeOrganizationsXHeadquartersCountyGeography._valid_to, PACIFIC_TIMEZONE) + >= func.datetime(date), + ) + ) ) - # Merge organization-county with caltrans_district info 
- # it appears to be a 1:1 merge. checked whether organization can belong to multiple districts, - # and that doesn't appear to happen - df = pd.merge(bridge_orgs_county_geog2, dim_county_geography, on="county_geography_key", how="inner") - - df2 = ( - df.assign(caltrans_district=df.caltrans_district.astype(str).str.zfill(2) + " - " + df.caltrans_district_name)[ - ["organization_name"] + keep_cols - ] - .drop_duplicates() - .reset_index(drop=True) - ) + df = pd.read_sql(statement, session.bind) - return df2 + return df[["organization_name", "caltrans_district"] + keep_cols].drop_duplicates().reset_index(drop=True) def filter_dim_organizations( - date: str, keep_cols: list[str] = ["source_record_id"], custom_filtering: dict = None, get_df: bool = True, -) -> Union[pd.DataFrame, siuba.sql.verbs.LazyTbl]: + **kwargs, +) -> pd.DataFrame: """ Filter dim_organizations down to current record for organization. Caltrans district is associated with organization_source_record_id. """ - dim_orgs = ( - tbls.mart_transit_database.dim_organizations() - >> gtfs_utils_v2.filter_custom_col(custom_filtering) - >> filter(_._is_current == True) - >> gtfs_utils_v2.subset_cols(keep_cols) - >> rename(organization_source_record_id="source_record_id") - ) + project = kwargs.get("project", "cal-itp-data-infra") + dataset = kwargs.get("dataset", "mart_transit_database") - if get_df: - dim_orgs = dim_orgs >> collect() + db_engine = _get_engine(project=project, dataset=dataset) + session = Session(db_engine) + + dim_organization_columns = [] + + for column in keep_cols: + if column == "source_record_id": + dim_organization_columns.append(DimOrganization.source_record_id.label("organization_source_record_id")) + else: + dim_organization_columns.append(getattr(DimOrganization, column)) - return dim_orgs + search_conditions = [DimOrganization._is_current == True] + + for k, v in (custom_filtering or {}).items(): + search_conditions.append(getattr(DimOrganization, k).in_(v)) + + statement = 
select(*dim_organization_columns).where(and_(*search_conditions)) + + if get_df: + return pd.read_sql(statement, session.bind) + else: + return session.scalars(statement) def sample_gtfs_dataset_key_to_organization_crosswalk( @@ -259,12 +311,16 @@ def sample_gtfs_dataset_key_to_organization_crosswalk( ], dim_organization_cols: list[str] = ["source_record_id", "name"], dim_county_geography_cols: list[str] = ["caltrans_district"], + **kwargs, ) -> pd.DataFrame: """ Get crosswalk from gtfs_dataset_key to certain quartet data identifiers like base64_url, uri, and organization identifiers like organization_source_record_id and caltrans_district. """ + project = kwargs.get("project", "cal-itp-data-infra") + dataset = kwargs.get("dataset", "mart_transit_database") + id_cols = ["gtfs_dataset_key"] # If schedule feed_key is present, include it our crosswalk output @@ -276,7 +332,11 @@ def sample_gtfs_dataset_key_to_organization_crosswalk( # (1) Filter dim_gtfs_datasets by quartet and merge with the # gtfs_dataset_keys in df. 
dim_gtfs_datasets = filter_dim_gtfs_datasets( - keep_cols=dim_gtfs_dataset_cols, custom_filtering={"type": [quartet_data]}, get_df=True + keep_cols=dim_gtfs_dataset_cols, + custom_filtering={"type": [quartet_data]}, + get_df=True, + project=project, + dataset=dataset, ) feeds_with_quartet_info = pd.merge( @@ -298,50 +358,29 @@ def sample_gtfs_dataset_key_to_organization_crosswalk( # (3) From quartet, get to organization name merge_cols = [i for i in feeds_with_quartet_info.columns if quartet_data in i] - feeds_with_org_id = get_organization_id(feeds_with_quartet_info, date, merge_cols=merge_cols) + feeds_with_org_id = get_organization_id( + feeds_with_quartet_info, + date, + merge_cols=merge_cols, + project=project, + dataset=dataset, + ) # (4) Merge in dim_orgs to get organization info - everything except caltrans_district is found here ORG_RENAME_DICT = {"source_record_id": "organization_source_record_id", "name": "organization_name"} - orgs = filter_dim_organizations(date, keep_cols=dim_organization_cols, get_df=True).rename(columns=ORG_RENAME_DICT) + orgs = filter_dim_organizations( + keep_cols=dim_organization_cols, + get_df=True, + project=project, + dataset=dataset, + ).rename(columns=ORG_RENAME_DICT) feeds_with_org_info = pd.merge(feeds_with_org_id, orgs, on="organization_source_record_id") # (5) Merge in dim_county_geography to get caltrans_district # https://github.com/cal-itp/data-analyses/issues/1282 - district = filter_dim_county_geography(date, dim_county_geography_cols) + district = filter_dim_county_geography(date, dim_county_geography_cols, project=project, dataset=dataset) feeds_with_district = pd.merge(feeds_with_org_info, district, on="organization_name") return feeds_with_district - - -def sample_schedule_feed_key_to_organization_crosswalk( - df: pd.DataFrame, - date: str, - quartet_data: Literal["schedule", "vehicle_positions", "service_alerts", "trip_updates"] = "schedule", - **kwargs, -) -> pd.DataFrame: - """ - From schedule data, using 
feed_key as primary key, - grab the gtfs_dataset_key associated. - Pass this through function to attach quartet data identifier columns - and organization info. - """ - # Start with schedule feed_key, and grab gtfs_dataset_key associated - # with that feed_key - feeds = df[["feed_key"]].drop_duplicates().reset_index(drop=True) - - crosswalk_feed_to_gtfs_dataset_key = get_schedule_gtfs_dataset_key(date, get_df=True) - - feeds_with_gtfs_dataset_key = pd.merge( - feeds, - crosswalk_feed_to_gtfs_dataset_key, - on="feed_key", - how="inner", - ) - - feeds_with_district = sample_gtfs_dataset_key_to_organization_crosswalk( - feeds_with_gtfs_dataset_key, date, quartet_data=quartet_data, **kwargs - ) - - return feeds_with_district diff --git a/_shared_utils/tests/conftest.py b/_shared_utils/tests/conftest.py new file mode 100644 index 000000000..387416c81 --- /dev/null +++ b/_shared_utils/tests/conftest.py @@ -0,0 +1,31 @@ +import sys + +import pytest + + +def pytest_configure(config): + sys._called_from_test = True + + +def pytest_unconfigure(config): + del sys._called_from_test + + +@pytest.fixture(scope="module") +def vcr_config(): + return { + "filter_headers": [ + ("cookie", "FILTERED"), + ("Authorization", "FILTERED"), + ("apikey", "FILTERED"), + ("X-CKAN-API-Key", "FILTERED"), + ], + "ignore_hosts": [ + "run-actions-1-azure-eastus.actions.githubusercontent.com", + "run-actions-2-azure-eastus.actions.githubusercontent.com", + "run-actions-3-azure-eastus.actions.githubusercontent.com", + "sts.googleapis.com", + "iamcredentials.googleapis.com", + "oauth2.googleapis.com", + ], + } diff --git a/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_county_geography.yaml b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_county_geography.yaml new file mode 100644 index 000000000..49c3a545d --- /dev/null +++ 
b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_county_geography.yaml @@ -0,0 +1,75 @@ +interactions: +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [{"parameterType": {"type": + "INT64"}, "parameterValue": {"value": "2"}, "name": "lpad_1"}, {"parameterType": + {"type": "STRING"}, "parameterValue": {"value": "0"}, "name": "lpad_2"}, {"parameterType": + {"type": "STRING"}, "parameterValue": {"value": " - "}, "name": "concat_1"}, + {"parameterType": {"type": "STRING"}, "parameterValue": {"value": "US/Pacific"}, + "name": "datetime_1"}, {"parameterType": {"type": "STRING"}, "parameterValue": + {"value": "2025-06-17"}, "name": "datetime_2"}, {"parameterType": {"type": "STRING"}, + "parameterValue": {"value": "US/Pacific"}, "name": "datetime_3"}, {"parameterType": + {"type": "STRING"}, "parameterValue": {"value": "2025-06-17"}, "name": "datetime_4"}], + "parameterMode": "NAMED", "useLegacySql": false, "formatOptions": {"useInt64Timestamp": + true}, "location": "us-west2", "query": "SELECT `bridge_organizations_x_headquarters_county_geography`.`organization_name`, + concat(lpad(CAST(`dim_county_geography`.`caltrans_district` AS STRING), @`lpad_1`, + @`lpad_2`), @`concat_1`, `dim_county_geography`.`caltrans_district_name`) AS + `caltrans_district` \nFROM `bridge_organizations_x_headquarters_county_geography` + JOIN `dim_county_geography` ON `dim_county_geography`.`key` = `bridge_organizations_x_headquarters_county_geography`.`county_geography_key` + \nWHERE datetime(`bridge_organizations_x_headquarters_county_geography`.`_valid_from`, + @`datetime_1`) <= datetime(@`datetime_2`) AND datetime(`bridge_organizations_x_headquarters_county_geography`.`_valid_to`, + @`datetime_3`) >= datetime(@`datetime_4`)", "maxResults": 5000, "requestId": + "d01eb459-9309-45f6-9308-4d02af777609"}' + headers: + 
Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '1767' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: '{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"organization_name","type":"STRING","mode":"NULLABLE"},{"name":"caltrans_district","type":"STRING","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_bc7RmPR6Yp3WcOaEAKCeRO1m58ac","location":"us-west2"},"totalRows":"2","rows":[{"f":[{"v":"City + of Rosemead"},{"v":"07 - Los Angeles / Ventura"}]},{"f":[{"v":"Via / Remix + Inc."},{"v":"07 - Los Angeles / Ventura"}]}],"totalBytesProcessed":"0","jobComplete":true,"cacheHit":true,"queryId":"job_bc7RmPR6Yp3WcOaEAKCeRO1m58ac","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"0","location":"us-west2","creationTime":"1762219677769","startTime":"1762219677879","endTime":"1762219677950"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Tue, 04 Nov 2025 01:27:58 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +version: 1 diff --git a/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_county_geography_additional_keep_cols.yaml 
b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_county_geography_additional_keep_cols.yaml new file mode 100644 index 000000000..39e18c4e9 --- /dev/null +++ b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_county_geography_additional_keep_cols.yaml @@ -0,0 +1,79 @@ +interactions: +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [{"parameterType": {"type": + "INT64"}, "parameterValue": {"value": "2"}, "name": "lpad_1"}, {"parameterType": + {"type": "STRING"}, "parameterValue": {"value": "0"}, "name": "lpad_2"}, {"parameterType": + {"type": "STRING"}, "parameterValue": {"value": " - "}, "name": "concat_1"}, + {"parameterType": {"type": "STRING"}, "parameterValue": {"value": "US/Pacific"}, + "name": "datetime_1"}, {"parameterType": {"type": "STRING"}, "parameterValue": + {"value": "2025-06-17"}, "name": "datetime_2"}, {"parameterType": {"type": "STRING"}, + "parameterValue": {"value": "US/Pacific"}, "name": "datetime_3"}, {"parameterType": + {"type": "STRING"}, "parameterValue": {"value": "2025-06-17"}, "name": "datetime_4"}], + "parameterMode": "NAMED", "useLegacySql": false, "formatOptions": {"useInt64Timestamp": + true}, "location": "us-west2", "query": "SELECT `bridge_organizations_x_headquarters_county_geography`.`organization_name`, + concat(lpad(CAST(`dim_county_geography`.`caltrans_district` AS STRING), @`lpad_1`, + @`lpad_2`), @`concat_1`, `dim_county_geography`.`caltrans_district_name`) AS + `caltrans_district`, `dim_county_geography`.`caltrans_district` AS `caltrans_district_1`, + `bridge_organizations_x_headquarters_county_geography`.`county_geography_name`, + `dim_county_geography`.`msa`, `dim_county_geography`.`fips` \nFROM `bridge_organizations_x_headquarters_county_geography` + JOIN `dim_county_geography` ON 
`dim_county_geography`.`key` = `bridge_organizations_x_headquarters_county_geography`.`county_geography_key` + \nWHERE datetime(`bridge_organizations_x_headquarters_county_geography`.`_valid_from`, + @`datetime_1`) <= datetime(@`datetime_2`) AND datetime(`bridge_organizations_x_headquarters_county_geography`.`_valid_to`, + @`datetime_3`) >= datetime(@`datetime_4`)", "maxResults": 5000, "requestId": + "12ce61d3-76d2-4f87-b74a-ad3290ebb1e9"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '1977' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: '{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"organization_name","type":"STRING","mode":"NULLABLE"},{"name":"caltrans_district","type":"STRING","mode":"NULLABLE"},{"name":"caltrans_district_1","type":"INTEGER","mode":"NULLABLE"},{"name":"county_geography_name","type":"STRING","mode":"NULLABLE"},{"name":"msa","type":"STRING","mode":"NULLABLE"},{"name":"fips","type":"INTEGER","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_RXrknnhkzPDbVcV6eFboEiJkIq5V","location":"us-west2"},"totalRows":"2","rows":[{"f":[{"v":"City + of Rosemead"},{"v":"07 - Los Angeles / Ventura"},{"v":"7"},{"v":"Los Angeles"},{"v":"Los + Angeles-Long Beach-Anaheim"},{"v":"6037"}]},{"f":[{"v":"Via / Remix Inc."},{"v":"07 + - Los Angeles / Ventura"},{"v":"7"},{"v":"Los Angeles"},{"v":"Los Angeles-Long + 
Beach-Anaheim"},{"v":"6037"}]}],"totalBytesProcessed":"359","jobComplete":true,"cacheHit":false,"queryId":"job_RXrknnhkzPDbVcV6eFboEiJkIq5V","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"20971520","totalSlotMs":"22","location":"us-west2","creationTime":"1762219919256","startTime":"1762219919358","endTime":"1762219919489"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Tue, 04 Nov 2025 01:31:59 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +version: 1 diff --git a/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_county_geography_date_unavailable.yaml b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_county_geography_date_unavailable.yaml new file mode 100644 index 000000000..053634ed9 --- /dev/null +++ b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_county_geography_date_unavailable.yaml @@ -0,0 +1,73 @@ +interactions: +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [{"parameterType": {"type": + "INT64"}, "parameterValue": {"value": "2"}, "name": "lpad_1"}, {"parameterType": + {"type": "STRING"}, "parameterValue": {"value": "0"}, "name": "lpad_2"}, {"parameterType": + {"type": "STRING"}, "parameterValue": {"value": " - "}, "name": "concat_1"}, + {"parameterType": {"type": "STRING"}, "parameterValue": {"value": "US/Pacific"}, + "name": "datetime_1"}, {"parameterType": {"type": "STRING"}, "parameterValue": + {"value": "2024-01-17"}, "name": "datetime_2"}, 
{"parameterType": {"type": "STRING"}, + "parameterValue": {"value": "US/Pacific"}, "name": "datetime_3"}, {"parameterType": + {"type": "STRING"}, "parameterValue": {"value": "2024-01-17"}, "name": "datetime_4"}], + "parameterMode": "NAMED", "useLegacySql": false, "formatOptions": {"useInt64Timestamp": + true}, "location": "us-west2", "query": "SELECT `bridge_organizations_x_headquarters_county_geography`.`organization_name`, + concat(lpad(CAST(`dim_county_geography`.`caltrans_district` AS STRING), @`lpad_1`, + @`lpad_2`), @`concat_1`, `dim_county_geography`.`caltrans_district_name`) AS + `caltrans_district` \nFROM `bridge_organizations_x_headquarters_county_geography` + JOIN `dim_county_geography` ON `dim_county_geography`.`key` = `bridge_organizations_x_headquarters_county_geography`.`county_geography_key` + \nWHERE datetime(`bridge_organizations_x_headquarters_county_geography`.`_valid_from`, + @`datetime_1`) <= datetime(@`datetime_2`) AND datetime(`bridge_organizations_x_headquarters_county_geography`.`_valid_to`, + @`datetime_3`) >= datetime(@`datetime_4`)", "maxResults": 5000, "requestId": + "c99f0f70-db22-48a0-a352-b16913cb5e2e"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '1767' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: 
'{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"organization_name","type":"STRING","mode":"NULLABLE"},{"name":"caltrans_district","type":"STRING","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_7LiSDgWwoHfoPVnJwfr5Zs3h0JCY","location":"us-west2"},"totalRows":"0","totalBytesProcessed":"0","jobComplete":true,"cacheHit":true,"queryId":"job_7LiSDgWwoHfoPVnJwfr5Zs3h0JCY","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"0","location":"us-west2","creationTime":"1762219678648","startTime":"1762219678725","endTime":"1762219678791"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Tue, 04 Nov 2025 01:27:58 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +version: 1 diff --git a/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_gtfs_datasets.yaml b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_gtfs_datasets.yaml new file mode 100644 index 000000000..cfdc3a50d --- /dev/null +++ b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_gtfs_datasets.yaml @@ -0,0 +1,64 @@ +interactions: +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [], "useLegacySql": false, + "formatOptions": {"useInt64Timestamp": true}, "location": "us-west2", "query": + "SELECT `dim_gtfs_datasets`.`key` AS `gtfs_dataset_key`, `dim_gtfs_datasets`.`name` + AS `gtfs_dataset_name`, `dim_gtfs_datasets`.`type`, `dim_gtfs_datasets`.`regional_feed_type`, + 
`dim_gtfs_datasets`.`uri`, `dim_gtfs_datasets`.`base64_url` \nFROM `dim_gtfs_datasets` + \nWHERE `dim_gtfs_datasets`.`data_quality_pipeline` = true", "maxResults": 5000, + "requestId": "8af8972b-2335-491e-a071-60baea7b97e1"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '657' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: '{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"gtfs_dataset_name","type":"STRING","mode":"NULLABLE"},{"name":"type","type":"STRING","mode":"NULLABLE"},{"name":"regional_feed_type","type":"STRING","mode":"NULLABLE"},{"name":"uri","type":"STRING","mode":"NULLABLE"},{"name":"base64_url","type":"STRING","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_1LIZyHBhK53ZBb5VvHXfVIKZkWbb","location":"us-west2"},"totalRows":"3","rows":[{"f":[{"v":"c51dbfdd47838f86074c4ef3179cc9ed"},{"v":"Santa + Ynez Mecatran Schedule"},{"v":"schedule"},{"v":null},{"v":"http://app.mecatran.com/urb/ws/feed/c2l0ZT1zeXZ0O2NsaWVudD1zZWxmO2V4cGlyZT07dHlwZT1ndGZzO2tleT00MjcwNzQ0ZTY4NTAzOTMyMDIxMDdjNzI0MDRkMzYyNTM4MzI0YzI0"},{"v":"aHR0cDovL2FwcC5tZWNhdHJhbi5jb20vdXJiL3dzL2ZlZWQvYzJsMFpUMXplWFowTzJOc2FXVnVkRDF6Wld4bU8yVjRjR2x5WlQwN2RIbHdaVDFuZEdaek8ydGxlVDAwTWpjd056UTBaVFk0TlRBek9UTXlNREl4TURkak56STBNRFJrTXpZeU5UTTRNekkwWXpJMA=="}]},{"f":[{"v":"372a06b593e1716d1c911b1d1d35bedd"},{"v":"Santa + Ynez Mecatran 
Schedule"},{"v":"schedule"},{"v":null},{"v":"http://app.mecatran.com/urb/ws/feed/c2l0ZT1zeXZ0O2NsaWVudD1zZWxmO2V4cGlyZT07dHlwZT1ndGZzO2tleT00MjcwNzQ0ZTY4NTAzOTMyMDIxMDdjNzI0MDRkMzYyNTM4MzI0YzI0"},{"v":"aHR0cDovL2FwcC5tZWNhdHJhbi5jb20vdXJiL3dzL2ZlZWQvYzJsMFpUMXplWFowTzJOc2FXVnVkRDF6Wld4bU8yVjRjR2x5WlQwN2RIbHdaVDFuZEdaek8ydGxlVDAwTWpjd056UTBaVFk0TlRBek9UTXlNREl4TURkak56STBNRFJrTXpZeU5UTTRNekkwWXpJMA=="}]},{"f":[{"v":"8aed0709366badf9342e03d0a2d72b8d"},{"v":"SLO + Trip Updates"},{"v":"trip_updates"},{"v":null},{"v":"http://data.peaktransit.com/gtfsrt/1/TripUpdate.pb"},{"v":"aHR0cDovL2RhdGEucGVha3RyYW5zaXQuY29tL2d0ZnNydC8xL1RyaXBVcGRhdGUucGI="}]}],"totalBytesProcessed":"0","jobComplete":true,"cacheHit":true,"queryId":"job_1LIZyHBhK53ZBb5VvHXfVIKZkWbb","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"0","location":"us-west2","creationTime":"1762208243784","startTime":"1762208243857","endTime":"1762208243921"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Mon, 03 Nov 2025 22:17:23 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +version: 1 diff --git a/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_gtfs_datasets_custom_filtering.yaml b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_gtfs_datasets_custom_filtering.yaml new file mode 100644 index 000000000..3f9a9ccc4 --- /dev/null +++ b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_gtfs_datasets_custom_filtering.yaml @@ -0,0 +1,65 @@ +interactions: +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": 
"cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [{"parameterType": {"type": + "ARRAY", "arrayType": {"type": "STRING"}}, "parameterValue": {"arrayValues": + [{"value": "trip_updates"}]}, "name": "type_1"}], "parameterMode": "NAMED", + "useLegacySql": false, "formatOptions": {"useInt64Timestamp": true}, "location": + "us-west2", "query": "SELECT `dim_gtfs_datasets`.`key` AS `gtfs_dataset_key`, + `dim_gtfs_datasets`.`name` AS `gtfs_dataset_name`, `dim_gtfs_datasets`.`type`, + `dim_gtfs_datasets`.`regional_feed_type`, `dim_gtfs_datasets`.`uri`, `dim_gtfs_datasets`.`base64_url` + \nFROM `dim_gtfs_datasets` \nWHERE `dim_gtfs_datasets`.`data_quality_pipeline` + = true AND `dim_gtfs_datasets`.`type` IN UNNEST(@`type_1`)", "maxResults": 5000, + "requestId": "b0b5231e-bc39-4250-93e3-f5ae201111d0"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '886' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: 
'{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"gtfs_dataset_name","type":"STRING","mode":"NULLABLE"},{"name":"type","type":"STRING","mode":"NULLABLE"},{"name":"regional_feed_type","type":"STRING","mode":"NULLABLE"},{"name":"uri","type":"STRING","mode":"NULLABLE"},{"name":"base64_url","type":"STRING","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_ebnHMg1M4OyzE1wkL7gl0T91nGWH","location":"us-west2"},"totalRows":"1","rows":[{"f":[{"v":"8aed0709366badf9342e03d0a2d72b8d"},{"v":"SLO + Trip Updates"},{"v":"trip_updates"},{"v":null},{"v":"http://data.peaktransit.com/gtfsrt/1/TripUpdate.pb"},{"v":"aHR0cDovL2RhdGEucGVha3RyYW5zaXQuY29tL2d0ZnNydC8xL1RyaXBVcGRhdGUucGI="}]}],"totalBytesProcessed":"0","jobComplete":true,"cacheHit":true,"queryId":"job_ebnHMg1M4OyzE1wkL7gl0T91nGWH","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"0","location":"us-west2","creationTime":"1762208245263","startTime":"1762208245316","endTime":"1762208245373"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Mon, 03 Nov 2025 22:17:25 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +version: 1 diff --git a/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_gtfs_datasets_keep_cols_subset.yaml b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_gtfs_datasets_keep_cols_subset.yaml new file mode 100644 index 000000000..7b42e3fec --- /dev/null +++ 
b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_gtfs_datasets_keep_cols_subset.yaml @@ -0,0 +1,62 @@ +interactions: +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [], "useLegacySql": false, + "formatOptions": {"useInt64Timestamp": true}, "location": "us-west2", "query": + "SELECT `dim_gtfs_datasets`.`key` AS `gtfs_dataset_key`, `dim_gtfs_datasets`.`name` + AS `gtfs_dataset_name` \nFROM `dim_gtfs_datasets` \nWHERE `dim_gtfs_datasets`.`data_quality_pipeline` + = true", "maxResults": 5000, "requestId": "47029e4f-7838-41c6-86df-93740b07550e"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '526' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: '{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"gtfs_dataset_name","type":"STRING","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_AUVWd6Fm-JQ-Frp2tcUuXjtan3wI","location":"us-west2"},"totalRows":"3","rows":[{"f":[{"v":"c51dbfdd47838f86074c4ef3179cc9ed"},{"v":"Santa + Ynez Mecatran Schedule"}]},{"f":[{"v":"372a06b593e1716d1c911b1d1d35bedd"},{"v":"Santa + Ynez Mecatran Schedule"}]},{"f":[{"v":"8aed0709366badf9342e03d0a2d72b8d"},{"v":"SLO + Trip 
Updates"}]}],"totalBytesProcessed":"0","jobComplete":true,"cacheHit":true,"queryId":"job_AUVWd6Fm-JQ-Frp2tcUuXjtan3wI","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"0","location":"us-west2","creationTime":"1762208244536","startTime":"1762208244592","endTime":"1762208244659"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Mon, 03 Nov 2025 22:17:24 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +version: 1 diff --git a/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_organizations.yaml b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_organizations.yaml new file mode 100644 index 000000000..5a69b849d --- /dev/null +++ b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_organizations.yaml @@ -0,0 +1,59 @@ +interactions: +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [], "useLegacySql": false, + "formatOptions": {"useInt64Timestamp": true}, "location": "us-west2", "query": + "SELECT `dim_organizations`.`source_record_id` AS `organization_source_record_id` + \nFROM `dim_organizations` \nWHERE `dim_organizations`.`_is_current` = true", + "maxResults": 5000, "requestId": "fdea5969-f3fd-4e48-a6c2-efc4ae7141a3"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '491' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 
gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: '{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"organization_source_record_id","type":"STRING","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_RI7jjD6r5zHgRoMbU9az15QDP9Aw","location":"us-west2"},"totalRows":"3","rows":[{"f":[{"v":"reckGS8egMZryjbX7"}]},{"f":[{"v":"recyqZ1zbZMkeA7Vf"}]},{"f":[{"v":"recOT4QO6t6mRhUEu"}]}],"totalBytesProcessed":"60","jobComplete":true,"cacheHit":false,"queryId":"job_RI7jjD6r5zHgRoMbU9az15QDP9Aw","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"10485760","totalSlotMs":"19","location":"us-west2","creationTime":"1762221451925","startTime":"1762221451976","endTime":"1762221452093"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Tue, 04 Nov 2025 01:57:32 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +version: 1 diff --git a/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_organizations_additional_keep_cols.yaml b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_organizations_additional_keep_cols.yaml new file mode 100644 index 000000000..d9a2c885d --- /dev/null +++ b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_organizations_additional_keep_cols.yaml @@ -0,0 +1,62 @@ 
+interactions: +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [], "useLegacySql": false, + "formatOptions": {"useInt64Timestamp": true}, "location": "us-west2", "query": + "SELECT `dim_organizations`.`key`, `dim_organizations`.`name`, `dim_organizations`.`organization_type` + \nFROM `dim_organizations` \nWHERE `dim_organizations`.`_is_current` = true", + "maxResults": 5000, "requestId": "8a36dcec-7885-458c-9d74-694ae378ae54"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '512' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: '{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"key","type":"STRING","mode":"NULLABLE"},{"name":"name","type":"STRING","mode":"NULLABLE"},{"name":"organization_type","type":"STRING","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job__yhodyOpJAjm7sfj52CyteNx3vCz","location":"us-west2"},"totalRows":"3","rows":[{"f":[{"v":"35448956533b3ff4f8c9cf4e7886c974"},{"v":"City + of Mission Viejo"},{"v":"City/Town"}]},{"f":[{"v":"02a0e06b1ddb80e5695fc82fcc0c3ccc"},{"v":"City + of Patterson"},{"v":"City/Town"}]},{"f":[{"v":"4cb90bb76f9cd9472a2df6dd9014b4fa"},{"v":"City + of Chula 
Vista"},{"v":"City/Town"}]}],"totalBytesProcessed":"201","jobComplete":true,"cacheHit":false,"queryId":"job__yhodyOpJAjm7sfj52CyteNx3vCz","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"10485760","totalSlotMs":"16","location":"us-west2","creationTime":"1762221452731","startTime":"1762221452806","endTime":"1762221452921"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Tue, 04 Nov 2025 01:57:32 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +version: 1 diff --git a/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_organizations_custom_filtering.yaml b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_organizations_custom_filtering.yaml new file mode 100644 index 000000000..7142a7314 --- /dev/null +++ b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_filter_dim_organizations_custom_filtering.yaml @@ -0,0 +1,63 @@ +interactions: +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [{"parameterType": {"type": + "ARRAY", "arrayType": {"type": "STRING"}}, "parameterValue": {"arrayValues": + [{"value": "City of Mission Viejo"}, {"value": "City of Patterson"}]}, "name": + "name_1"}], "parameterMode": "NAMED", "useLegacySql": false, "formatOptions": + {"useInt64Timestamp": true}, "location": "us-west2", "query": "SELECT `dim_organizations`.`name`, + `dim_organizations`.`source_record_id` AS `organization_source_record_id` \nFROM + `dim_organizations` \nWHERE `dim_organizations`.`_is_current` = 
true AND `dim_organizations`.`name` + IN UNNEST(@`name_1`)", "maxResults": 5000, "requestId": "5297ac44-0c46-4450-8162-c285ea4c4947"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '789' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: '{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"name","type":"STRING","mode":"NULLABLE"},{"name":"organization_source_record_id","type":"STRING","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_zx3xRarRZLSXItczfig2bXvKAmlZ","location":"us-west2"},"totalRows":"2","rows":[{"f":[{"v":"City + of Mission Viejo"},{"v":"reckGS8egMZryjbX7"}]},{"f":[{"v":"City of Patterson"},{"v":"recyqZ1zbZMkeA7Vf"}]}],"totalBytesProcessed":"123","jobComplete":true,"cacheHit":false,"queryId":"job_zx3xRarRZLSXItczfig2bXvKAmlZ","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"10485760","totalSlotMs":"19","location":"us-west2","creationTime":"1762221453625","startTime":"1762221453683","endTime":"1762221453824"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Tue, 04 Nov 2025 01:57:33 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +version: 1 diff --git 
a/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_get_organization_id_no_merge_cols.yaml b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_get_organization_id_no_merge_cols.yaml new file mode 100644 index 000000000..941e8f29a --- /dev/null +++ b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_get_organization_id_no_merge_cols.yaml @@ -0,0 +1,86 @@ +interactions: +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [{"parameterType": {"type": + "STRING"}, "parameterValue": {"value": "US/Pacific"}, "name": "datetime_1"}, + {"parameterType": {"type": "STRING"}, "parameterValue": {"value": "2025-10-12"}, + "name": "datetime_2"}, {"parameterType": {"type": "STRING"}, "parameterValue": + {"value": "US/Pacific"}, "name": "datetime_3"}, {"parameterType": {"type": "STRING"}, + "parameterValue": {"value": "2025-10-12"}, "name": "datetime_4"}], "parameterMode": + "NAMED", "useLegacySql": false, "formatOptions": {"useInt64Timestamp": true}, + "location": "us-west2", "query": "SELECT DISTINCT `dim_provider_gtfs_data`.`key`, + `dim_provider_gtfs_data`.`public_customer_facing_fixed_route`, `dim_provider_gtfs_data`.`public_customer_facing_or_regional_subfeed_fixed_route`, + `dim_provider_gtfs_data`.`organization_key`, `dim_provider_gtfs_data`.`organization_name`, + `dim_provider_gtfs_data`.`organization_itp_id`, `dim_provider_gtfs_data`.`organization_hubspot_company_record_id`, + `dim_provider_gtfs_data`.`organization_ntd_id`, `dim_provider_gtfs_data`.`organization_source_record_id`, + `dim_provider_gtfs_data`.`service_key`, `dim_provider_gtfs_data`.`service_name`, + `dim_provider_gtfs_data`.`service_source_record_id`, `dim_provider_gtfs_data`.`gtfs_service_data_customer_facing`, + `dim_provider_gtfs_data`.`regional_feed_type`, 
`dim_provider_gtfs_data`.`associated_schedule_gtfs_dataset_key`, + `dim_provider_gtfs_data`.`schedule_gtfs_dataset_name`, `dim_provider_gtfs_data`.`schedule_source_record_id`, + `dim_provider_gtfs_data`.`service_alerts_gtfs_dataset_name`, `dim_provider_gtfs_data`.`service_alerts_source_record_id`, + `dim_provider_gtfs_data`.`vehicle_positions_gtfs_dataset_name`, `dim_provider_gtfs_data`.`vehicle_positions_source_record_id`, + `dim_provider_gtfs_data`.`trip_updates_gtfs_dataset_name`, `dim_provider_gtfs_data`.`trip_updates_source_record_id`, + `dim_provider_gtfs_data`.`schedule_gtfs_dataset_key`, `dim_provider_gtfs_data`.`service_alerts_gtfs_dataset_key`, + `dim_provider_gtfs_data`.`vehicle_positions_gtfs_dataset_key`, `dim_provider_gtfs_data`.`trip_updates_gtfs_dataset_key`, + `dim_provider_gtfs_data`.`_valid_from`, `dim_provider_gtfs_data`.`_valid_to`, + `dim_provider_gtfs_data`.`_is_current` \nFROM `dim_provider_gtfs_data` \nWHERE + datetime(`dim_provider_gtfs_data`.`_valid_from`, @`datetime_1`) <= datetime(@`datetime_2`) + AND datetime(`dim_provider_gtfs_data`.`_valid_to`, @`datetime_3`) >= datetime(@`datetime_4`)", + "maxResults": 5000, "requestId": "639e8a6a-cb45-4231-9caf-2b3d80da4204"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '2638' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: 
'{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"key","type":"STRING","mode":"NULLABLE"},{"name":"public_customer_facing_fixed_route","type":"BOOLEAN","mode":"NULLABLE"},{"name":"public_customer_facing_or_regional_subfeed_fixed_route","type":"BOOLEAN","mode":"NULLABLE"},{"name":"organization_key","type":"STRING","mode":"NULLABLE"},{"name":"organization_name","type":"STRING","mode":"NULLABLE"},{"name":"organization_itp_id","type":"INTEGER","mode":"NULLABLE"},{"name":"organization_hubspot_company_record_id","type":"STRING","mode":"NULLABLE"},{"name":"organization_ntd_id","type":"STRING","mode":"NULLABLE"},{"name":"organization_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"service_key","type":"STRING","mode":"NULLABLE"},{"name":"service_name","type":"STRING","mode":"NULLABLE"},{"name":"service_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"gtfs_service_data_customer_facing","type":"BOOLEAN","mode":"NULLABLE"},{"name":"regional_feed_type","type":"STRING","mode":"NULLABLE"},{"name":"associated_schedule_gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"schedule_gtfs_dataset_name","type":"STRING","mode":"NULLABLE"},{"name":"schedule_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"service_alerts_gtfs_dataset_name","type":"STRING","mode":"NULLABLE"},{"name":"service_alerts_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"vehicle_positions_gtfs_dataset_name","type":"STRING","mode":"NULLABLE"},{"name":"vehicle_positions_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"trip_updates_gtfs_dataset_name","type":"STRING","mode":"NULLABLE"},{"name":"trip_updates_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"schedule_gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"service_alerts_gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"vehicle_positions_gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"trip_updates_gtfs_dataset_key","type":"STR
ING","mode":"NULLABLE"},{"name":"_valid_from","type":"TIMESTAMP","mode":"NULLABLE"},{"name":"_valid_to","type":"TIMESTAMP","mode":"NULLABLE"},{"name":"_is_current","type":"BOOLEAN","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_nh9hbc4iSOMEZAZ_GT83CLIJgWwj","location":"us-west2"},"totalRows":"3","rows":[{"f":[{"v":"145279bf2bb891a6f43abd24403a46d7"},{"v":"false"},{"v":"false"},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":"b6280b24ea50c714c38f0c02bd8d3a2e"},{"v":"Rosemead + Passio Schedule"},{"v":"reccLvZxTMz1FuNii"},{"v":"Rosemead Passio Alerts"},{"v":"recm7XryAyH4sP0ZF"},{"v":"Rosemead + Passio Vehicle Positions"},{"v":"rec1RocbLnc4A9eE4"},{"v":"Rosemead Passio + Trip Updates"},{"v":"recQtNJAv4UyQjMh4"},{"v":"b6280b24ea50c714c38f0c02bd8d3a2e"},{"v":"c766db2bb5dd244b0e2176f4dfc35999"},{"v":"ad67eda06e383bbd5c174eab700915fb"},{"v":"70b85693df485b227c4a1a87ac46d065"},{"v":"1753401600000000"},{"v":"4070908799999999"},{"v":"true"}]},{"f":[{"v":"5beb8a6bf200cfdf00da66ac1521d526"},{"v":"false"},{"v":"false"},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":"1ba3e20e8edb9c64866206d138aaca83"},{"v":"Ridgecrest + Schedule"},{"v":"recb7v2QiflfKM77w"},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":"1ba3e20e8edb9c64866206d138aaca83"},{"v":null},{"v":null},{"v":null},{"v":"1749859200000000"},{"v":"4070908799999999"},{"v":"true"}]},{"f":[{"v":"90ebcb3f9a649985be20bc10aa3ae3d4"},{"v":"false"},{"v":"false"},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":"794534a42bc6cbb1f4f7bc4d4bb1be16"},{"v":"Avalon + Schedule Historic"},{"v":"recdcpS2s3wuT3QAG"},{"v":"Avalon Alerts Historic"},{"v":"recgHXDRuyrKbOD47"},{"v":"Avalon + Vehicle Positions"},{"v":"recvwGOdqwtvPzKhZ"},{"v":"Avalon Trip 
Updates"},{"v":"recaDdOPYGgkoCOoT"},{"v":"794534a42bc6cbb1f4f7bc4d4bb1be16"},{"v":"688fc08edb6dbba6ce92a11b36929993"},{"v":"66f7f7dfc9290a80d99b5b2e645d8659"},{"v":"b070831f700176fc88e2464cb353c24e"},{"v":"1749859200000000"},{"v":"4070908799999999"},{"v":"true"}]}],"totalBytesProcessed":"1407","jobComplete":true,"cacheHit":false,"queryId":"job_nh9hbc4iSOMEZAZ_GT83CLIJgWwj","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"10485760","totalSlotMs":"26","location":"us-west2","creationTime":"1762216517074","startTime":"1762216517169","endTime":"1762216517337"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Tue, 04 Nov 2025 00:35:17 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +version: 1 diff --git a/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_get_schedule_gtfs_dataset_key.yaml b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_get_schedule_gtfs_dataset_key.yaml new file mode 100644 index 000000000..aa7c0b1b6 --- /dev/null +++ b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_get_schedule_gtfs_dataset_key.yaml @@ -0,0 +1,67 @@ +interactions: +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [{"parameterType": {"type": + "STRING"}, "parameterValue": {"value": "2025-09-01"}, "name": "service_date_1"}], + "parameterMode": "NAMED", "useLegacySql": false, "formatOptions": {"useInt64Timestamp": + true}, "location": "us-west2", "query": "SELECT `fct_daily_feed_scheduled_service_summary`.`service_date`, + 
`fct_daily_feed_scheduled_service_summary`.`feed_key`, `fct_daily_feed_scheduled_service_summary`.`gtfs_dataset_key`, + `fct_daily_feed_scheduled_service_summary`.`ttl_service_hours`, `fct_daily_feed_scheduled_service_summary`.`n_trips`, + `fct_daily_feed_scheduled_service_summary`.`first_departure_sec`, `fct_daily_feed_scheduled_service_summary`.`last_arrival_sec`, + `fct_daily_feed_scheduled_service_summary`.`num_stop_times`, `fct_daily_feed_scheduled_service_summary`.`n_routes`, + `fct_daily_feed_scheduled_service_summary`.`contains_warning_duplicate_stop_times_primary_key`, + `fct_daily_feed_scheduled_service_summary`.`contains_warning_duplicate_trip_primary_key`, + `fct_daily_feed_scheduled_service_summary`.`contains_warning_missing_foreign_key_stop_id` + \nFROM `fct_daily_feed_scheduled_service_summary` \nWHERE `fct_daily_feed_scheduled_service_summary`.`service_date` + = @`service_date_1`", "maxResults": 5000, "requestId": "9953e7d0-1be5-4c33-85c3-905c2b766bba"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '1425' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: 
'{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"service_date","type":"DATETIME","mode":"NULLABLE"},{"name":"feed_key","type":"STRING","mode":"NULLABLE"},{"name":"gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"ttl_service_hours","type":"FLOAT","mode":"NULLABLE"},{"name":"n_trips","type":"INTEGER","mode":"NULLABLE"},{"name":"first_departure_sec","type":"INTEGER","mode":"NULLABLE"},{"name":"last_arrival_sec","type":"INTEGER","mode":"NULLABLE"},{"name":"num_stop_times","type":"INTEGER","mode":"NULLABLE"},{"name":"n_routes","type":"INTEGER","mode":"NULLABLE"},{"name":"contains_warning_duplicate_stop_times_primary_key","type":"BOOLEAN","mode":"NULLABLE"},{"name":"contains_warning_duplicate_trip_primary_key","type":"BOOLEAN","mode":"NULLABLE"},{"name":"contains_warning_missing_foreign_key_stop_id","type":"BOOLEAN","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_mN8R7pcY7vH1oyXjRt2lNWDsuTI-","location":"us-west2"},"totalRows":"1","rows":[{"f":[{"v":"2025-09-01T00:00:00"},{"v":"10f6bb537140b7e52c8f313f3d611f71"},{"v":"0089bd1b0a2b78a8590d8749737d7146"},{"v":"0.0"},{"v":"0"},{"v":null},{"v":null},{"v":"0"},{"v":"0"},{"v":null},{"v":null},{"v":null}]}],"totalBytesProcessed":"0","jobComplete":true,"cacheHit":true,"queryId":"job_mN8R7pcY7vH1oyXjRt2lNWDsuTI-","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"0","location":"us-west2","creationTime":"1761955815437","startTime":"1761955815531","endTime":"1761955815593"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Sat, 01 Nov 2025 00:10:15 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +version: 1 diff --git 
a/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_sample_gtfs_dataset_key_to_organization_crosswalk.yaml b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_sample_gtfs_dataset_key_to_organization_crosswalk.yaml new file mode 100644 index 000000000..902aefc11 --- /dev/null +++ b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_sample_gtfs_dataset_key_to_organization_crosswalk.yaml @@ -0,0 +1,288 @@ +interactions: +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [{"parameterType": {"type": + "ARRAY", "arrayType": {"type": "STRING"}}, "parameterValue": {"arrayValues": + [{"value": "schedule"}]}, "name": "type_1"}], "parameterMode": "NAMED", "useLegacySql": + false, "formatOptions": {"useInt64Timestamp": true}, "location": "us-west2", + "query": "SELECT `dim_gtfs_datasets`.`key` AS `gtfs_dataset_key`, `dim_gtfs_datasets`.`name` + AS `gtfs_dataset_name`, `dim_gtfs_datasets`.`type`, `dim_gtfs_datasets`.`source_record_id`, + `dim_gtfs_datasets`.`regional_feed_type`, `dim_gtfs_datasets`.`base64_url`, + `dim_gtfs_datasets`.`uri` \nFROM `dim_gtfs_datasets` \nWHERE `dim_gtfs_datasets`.`data_quality_pipeline` + = true AND `dim_gtfs_datasets`.`type` IN UNNEST(@`type_1`)", "maxResults": 5000, + "requestId": "c9163231-4621-448f-a8ea-a05224a02dd2"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '922' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: 
https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: '{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"gtfs_dataset_name","type":"STRING","mode":"NULLABLE"},{"name":"type","type":"STRING","mode":"NULLABLE"},{"name":"source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"regional_feed_type","type":"STRING","mode":"NULLABLE"},{"name":"base64_url","type":"STRING","mode":"NULLABLE"},{"name":"uri","type":"STRING","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_4lRh0Z1QCHP3iIGH4vZyqKfYIeSg","location":"us-west2"},"totalRows":"2","rows":[{"f":[{"v":"c51dbfdd47838f86074c4ef3179cc9ed"},{"v":"Santa + Ynez Mecatran Schedule"},{"v":"schedule"},{"v":"recuWhPXfxMatv6rL"},{"v":null},{"v":"aHR0cDovL2FwcC5tZWNhdHJhbi5jb20vdXJiL3dzL2ZlZWQvYzJsMFpUMXplWFowTzJOc2FXVnVkRDF6Wld4bU8yVjRjR2x5WlQwN2RIbHdaVDFuZEdaek8ydGxlVDAwTWpjd056UTBaVFk0TlRBek9UTXlNREl4TURkak56STBNRFJrTXpZeU5UTTRNekkwWXpJMA=="},{"v":"http://app.mecatran.com/urb/ws/feed/c2l0ZT1zeXZ0O2NsaWVudD1zZWxmO2V4cGlyZT07dHlwZT1ndGZzO2tleT00MjcwNzQ0ZTY4NTAzOTMyMDIxMDdjNzI0MDRkMzYyNTM4MzI0YzI0"}]},{"f":[{"v":"372a06b593e1716d1c911b1d1d35bedd"},{"v":"Santa + Ynez Mecatran 
Schedule"},{"v":"schedule"},{"v":"recuWhPXfxMatv6rL"},{"v":null},{"v":"aHR0cDovL2FwcC5tZWNhdHJhbi5jb20vdXJiL3dzL2ZlZWQvYzJsMFpUMXplWFowTzJOc2FXVnVkRDF6Wld4bU8yVjRjR2x5WlQwN2RIbHdaVDFuZEdaek8ydGxlVDAwTWpjd056UTBaVFk0TlRBek9UTXlNREl4TURkak56STBNRFJrTXpZeU5UTTRNekkwWXpJMA=="},{"v":"http://app.mecatran.com/urb/ws/feed/c2l0ZT1zeXZ0O2NsaWVudD1zZWxmO2V4cGlyZT07dHlwZT1ndGZzO2tleT00MjcwNzQ0ZTY4NTAzOTMyMDIxMDdjNzI0MDRkMzYyNTM4MzI0YzI0"}]}],"totalBytesProcessed":"1100","jobComplete":true,"cacheHit":false,"queryId":"job_4lRh0Z1QCHP3iIGH4vZyqKfYIeSg","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"10485760","totalSlotMs":"52","location":"us-west2","creationTime":"1762222189258","startTime":"1762222189333","endTime":"1762222189537"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Tue, 04 Nov 2025 02:09:49 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [{"parameterType": {"type": + "STRING"}, "parameterValue": {"value": "US/Pacific"}, "name": "datetime_1"}, + {"parameterType": {"type": "STRING"}, "parameterValue": {"value": "2025-10-16"}, + "name": "datetime_2"}, {"parameterType": {"type": "STRING"}, "parameterValue": + {"value": "US/Pacific"}, "name": "datetime_3"}, {"parameterType": {"type": "STRING"}, + "parameterValue": {"value": "2025-10-16"}, "name": "datetime_4"}], "parameterMode": + "NAMED", "useLegacySql": false, "formatOptions": {"useInt64Timestamp": true}, + "location": "us-west2", "query": "SELECT DISTINCT `dim_provider_gtfs_data`.`key`, + 
`dim_provider_gtfs_data`.`public_customer_facing_fixed_route`, `dim_provider_gtfs_data`.`public_customer_facing_or_regional_subfeed_fixed_route`, + `dim_provider_gtfs_data`.`organization_key`, `dim_provider_gtfs_data`.`organization_name`, + `dim_provider_gtfs_data`.`organization_itp_id`, `dim_provider_gtfs_data`.`organization_hubspot_company_record_id`, + `dim_provider_gtfs_data`.`organization_ntd_id`, `dim_provider_gtfs_data`.`organization_source_record_id`, + `dim_provider_gtfs_data`.`service_key`, `dim_provider_gtfs_data`.`service_name`, + `dim_provider_gtfs_data`.`service_source_record_id`, `dim_provider_gtfs_data`.`gtfs_service_data_customer_facing`, + `dim_provider_gtfs_data`.`regional_feed_type`, `dim_provider_gtfs_data`.`associated_schedule_gtfs_dataset_key`, + `dim_provider_gtfs_data`.`schedule_gtfs_dataset_name`, `dim_provider_gtfs_data`.`schedule_source_record_id`, + `dim_provider_gtfs_data`.`service_alerts_gtfs_dataset_name`, `dim_provider_gtfs_data`.`service_alerts_source_record_id`, + `dim_provider_gtfs_data`.`vehicle_positions_gtfs_dataset_name`, `dim_provider_gtfs_data`.`vehicle_positions_source_record_id`, + `dim_provider_gtfs_data`.`trip_updates_gtfs_dataset_name`, `dim_provider_gtfs_data`.`trip_updates_source_record_id`, + `dim_provider_gtfs_data`.`schedule_gtfs_dataset_key`, `dim_provider_gtfs_data`.`service_alerts_gtfs_dataset_key`, + `dim_provider_gtfs_data`.`vehicle_positions_gtfs_dataset_key`, `dim_provider_gtfs_data`.`trip_updates_gtfs_dataset_key`, + `dim_provider_gtfs_data`.`_valid_from`, `dim_provider_gtfs_data`.`_valid_to`, + `dim_provider_gtfs_data`.`_is_current` \nFROM `dim_provider_gtfs_data` \nWHERE + datetime(`dim_provider_gtfs_data`.`_valid_from`, @`datetime_1`) <= datetime(@`datetime_2`) + AND datetime(`dim_provider_gtfs_data`.`_valid_to`, @`datetime_3`) >= datetime(@`datetime_4`)", + "maxResults": 5000, "requestId": "d2f84226-bf07-4e0e-9b43-1990f717e60f"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + 
Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '2638' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: '{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"key","type":"STRING","mode":"NULLABLE"},{"name":"public_customer_facing_fixed_route","type":"BOOLEAN","mode":"NULLABLE"},{"name":"public_customer_facing_or_regional_subfeed_fixed_route","type":"BOOLEAN","mode":"NULLABLE"},{"name":"organization_key","type":"STRING","mode":"NULLABLE"},{"name":"organization_name","type":"STRING","mode":"NULLABLE"},{"name":"organization_itp_id","type":"INTEGER","mode":"NULLABLE"},{"name":"organization_hubspot_company_record_id","type":"STRING","mode":"NULLABLE"},{"name":"organization_ntd_id","type":"STRING","mode":"NULLABLE"},{"name":"organization_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"service_key","type":"STRING","mode":"NULLABLE"},{"name":"service_name","type":"STRING","mode":"NULLABLE"},{"name":"service_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"gtfs_service_data_customer_facing","type":"BOOLEAN","mode":"NULLABLE"},{"name":"regional_feed_type","type":"STRING","mode":"NULLABLE"},{"name":"associated_schedule_gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"schedule_gtfs_dataset_name","type":"STRING","mode":"NULLABLE"},{"name":"schedule_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"service_alerts_gtfs_dataset_name","type":"STRING","mode":"NULLABLE"},{"name":"service_alerts_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"vehicle_positions_gtfs_dataset_name"
,"type":"STRING","mode":"NULLABLE"},{"name":"vehicle_positions_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"trip_updates_gtfs_dataset_name","type":"STRING","mode":"NULLABLE"},{"name":"trip_updates_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"schedule_gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"service_alerts_gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"vehicle_positions_gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"trip_updates_gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"_valid_from","type":"TIMESTAMP","mode":"NULLABLE"},{"name":"_valid_to","type":"TIMESTAMP","mode":"NULLABLE"},{"name":"_is_current","type":"BOOLEAN","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_PLfR612j0eM9NCuGF5MAADgfil7_","location":"us-west2"},"totalRows":"4","rows":[{"f":[{"v":"57321774eb04c7c650ef4709fb25a2c8"},{"v":"true"},{"v":"true"},{"v":"624c361fd9a2a9ad100cbf4b729c2df9"},{"v":"City + of Solvang"},{"v":"312"},{"v":"8719942682"},{"v":"9R02-91028"},{"v":"reckp33bhAuZlmO1M"},{"v":"55e1099a8610904b4479be54b36875d9"},{"v":"Santa + Ynez Valley Transit"},{"v":"recTlfGUkpnZ8cICb"},{"v":"true"},{"v":null},{"v":"bb227dec4b51d9d438125e38d0214784"},{"v":"Santa + Ynez Mecatran Schedule"},{"v":"recuWhPXfxMatv6rL"},{"v":null},{"v":null},{"v":"Santa + Ynez Vehicle Positions"},{"v":"reccgBgsKC5J7BD8K"},{"v":null},{"v":null},{"v":"372a06b593e1716d1c911b1d1d35bedd"},{"v":null},{"v":"40e0488e732e680cc9d7002fad71adad"},{"v":null},{"v":"1760572800000000"},{"v":"1761264000000000"},{"v":"true"}]},{"f":[{"v":"5beb8a6bf200cfdf00da66ac1521d526"},{"v":"false"},{"v":"false"},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":"1ba3e20e8edb9c64866206d138aaca83"},{"v":"Ridgecrest + 
Schedule"},{"v":"recb7v2QiflfKM77w"},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":"1ba3e20e8edb9c64866206d138aaca83"},{"v":null},{"v":null},{"v":null},{"v":"1749859200000000"},{"v":"4070908799999999"},{"v":"true"}]},{"f":[{"v":"145279bf2bb891a6f43abd24403a46d7"},{"v":"false"},{"v":"false"},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":"b6280b24ea50c714c38f0c02bd8d3a2e"},{"v":"Rosemead + Passio Schedule"},{"v":"reccLvZxTMz1FuNii"},{"v":"Rosemead Passio Alerts"},{"v":"recm7XryAyH4sP0ZF"},{"v":"Rosemead + Passio Vehicle Positions"},{"v":"rec1RocbLnc4A9eE4"},{"v":"Rosemead Passio + Trip Updates"},{"v":"recQtNJAv4UyQjMh4"},{"v":"b6280b24ea50c714c38f0c02bd8d3a2e"},{"v":"c766db2bb5dd244b0e2176f4dfc35999"},{"v":"ad67eda06e383bbd5c174eab700915fb"},{"v":"70b85693df485b227c4a1a87ac46d065"},{"v":"1753401600000000"},{"v":"4070908799999999"},{"v":"true"}]},{"f":[{"v":"90ebcb3f9a649985be20bc10aa3ae3d4"},{"v":"false"},{"v":"false"},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":"794534a42bc6cbb1f4f7bc4d4bb1be16"},{"v":"Avalon + Schedule Historic"},{"v":"recdcpS2s3wuT3QAG"},{"v":"Avalon Alerts Historic"},{"v":"recgHXDRuyrKbOD47"},{"v":"Avalon + Vehicle Positions"},{"v":"recvwGOdqwtvPzKhZ"},{"v":"Avalon Trip Updates"},{"v":"recaDdOPYGgkoCOoT"},{"v":"794534a42bc6cbb1f4f7bc4d4bb1be16"},{"v":"688fc08edb6dbba6ce92a11b36929993"},{"v":"66f7f7dfc9290a80d99b5b2e645d8659"},{"v":"b070831f700176fc88e2464cb353c24e"},{"v":"1749859200000000"},{"v":"4070908799999999"},{"v":"true"}]}],"totalBytesProcessed":"1407","jobComplete":true,"cacheHit":false,"queryId":"job_PLfR612j0eM9NCuGF5MAADgfil7_","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"10485760","totalSlotMs":"56","location":"us-west2","creationTime":"1762222199962","startTime":"1762222200044","endTime":"1762222200209"}' + headers: + Alt-Svc: + - 
h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Tue, 04 Nov 2025 02:10:00 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [], "useLegacySql": false, + "formatOptions": {"useInt64Timestamp": true}, "location": "us-west2", "query": + "SELECT `dim_organizations`.`source_record_id` AS `organization_source_record_id`, + `dim_organizations`.`name` \nFROM `dim_organizations` \nWHERE `dim_organizations`.`_is_current` + = true", "maxResults": 5000, "requestId": "30e1cfd2-9327-4b74-8ec5-20ddcacdf507"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '519' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: '{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"organization_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"name","type":"STRING","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_PahzfUc8_KcBGIt1Nv9b95zd6wI4","location":"us-west2"},"totalRows":"4","rows":[{"f":[{"v":"reckp33bhAuZlmO1M"},{"v":"Santa + Ynez Band of Chumash Mission Indians of the Santa Ynez 
Reservation, California"}]},{"f":[{"v":"recyqZ1zbZMkeA7Vf"},{"v":"City + of Patterson"}]},{"f":[{"v":"recOT4QO6t6mRhUEu"},{"v":"City of Chula Vista"}]},{"f":[{"v":"reckGS8egMZryjbX7"},{"v":"City + of Mission Viejo"}]}],"totalBytesProcessed":"272","jobComplete":true,"cacheHit":false,"queryId":"job_PahzfUc8_KcBGIt1Nv9b95zd6wI4","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"10485760","totalSlotMs":"14","location":"us-west2","creationTime":"1762222201010","startTime":"1762222201126","endTime":"1762222201260"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Tue, 04 Nov 2025 02:10:01 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [{"parameterType": {"type": + "INT64"}, "parameterValue": {"value": "2"}, "name": "lpad_1"}, {"parameterType": + {"type": "STRING"}, "parameterValue": {"value": "0"}, "name": "lpad_2"}, {"parameterType": + {"type": "STRING"}, "parameterValue": {"value": " - "}, "name": "concat_1"}, + {"parameterType": {"type": "STRING"}, "parameterValue": {"value": "US/Pacific"}, + "name": "datetime_1"}, {"parameterType": {"type": "STRING"}, "parameterValue": + {"value": "2025-10-16"}, "name": "datetime_2"}, {"parameterType": {"type": "STRING"}, + "parameterValue": {"value": "US/Pacific"}, "name": "datetime_3"}, {"parameterType": + {"type": "STRING"}, "parameterValue": {"value": "2025-10-16"}, "name": "datetime_4"}], + "parameterMode": "NAMED", "useLegacySql": false, "formatOptions": {"useInt64Timestamp": + true}, "location": "us-west2", "query": "SELECT 
`bridge_organizations_x_headquarters_county_geography`.`organization_name`, + concat(lpad(CAST(`dim_county_geography`.`caltrans_district` AS STRING), @`lpad_1`, + @`lpad_2`), @`concat_1`, `dim_county_geography`.`caltrans_district_name`) AS + `caltrans_district`, `dim_county_geography`.`caltrans_district` AS `caltrans_district_1` + \nFROM `bridge_organizations_x_headquarters_county_geography` JOIN `dim_county_geography` + ON `dim_county_geography`.`key` = `bridge_organizations_x_headquarters_county_geography`.`county_geography_key` + \nWHERE datetime(`bridge_organizations_x_headquarters_county_geography`.`_valid_from`, + @`datetime_1`) <= datetime(@`datetime_2`) AND datetime(`bridge_organizations_x_headquarters_county_geography`.`_valid_to`, + @`datetime_3`) >= datetime(@`datetime_4`)", "maxResults": 5000, "requestId": + "647bb2ce-73e5-40c0-81b2-ab22f52e72de"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '1836' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: '{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"organization_name","type":"STRING","mode":"NULLABLE"},{"name":"caltrans_district","type":"STRING","mode":"NULLABLE"},{"name":"caltrans_district_1","type":"INTEGER","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_zl4sw6nWzTDk3mQdx3ihnk24IWJR","location":"us-west2"},"totalRows":"1","rows":[{"f":[{"v":"Santa + Ynez Band of Chumash Mission Indians of the Santa Ynez Reservation, California"},{"v":"05 + - San Luis 
Obispo / Santa Barbara"},{"v":"5"}]}],"totalBytesProcessed":"496","jobComplete":true,"cacheHit":false,"queryId":"job_zl4sw6nWzTDk3mQdx3ihnk24IWJR","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"20971520","totalSlotMs":"45","location":"us-west2","creationTime":"1762222201904","startTime":"1762222201975","endTime":"1762222202133"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Tue, 04 Nov 2025 02:10:02 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +version: 1 diff --git a/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_sample_gtfs_dataset_key_to_organization_crosswalk_subset_gtfs_dataset_cols.yaml b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_sample_gtfs_dataset_key_to_organization_crosswalk_subset_gtfs_dataset_cols.yaml new file mode 100644 index 000000000..11deb8831 --- /dev/null +++ b/_shared_utils/tests/shared_utils/cassettes/test_schedule_rt_utils/TestScheduleRtUtils.test_sample_gtfs_dataset_key_to_organization_crosswalk_subset_gtfs_dataset_cols.yaml @@ -0,0 +1,286 @@ +interactions: +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [{"parameterType": {"type": + "ARRAY", "arrayType": {"type": "STRING"}}, "parameterValue": {"arrayValues": + [{"value": "schedule"}]}, "name": "type_1"}], "parameterMode": "NAMED", "useLegacySql": + false, "formatOptions": {"useInt64Timestamp": true}, "location": "us-west2", + "query": "SELECT `dim_gtfs_datasets`.`key` AS `gtfs_dataset_key`, `dim_gtfs_datasets`.`name` + AS `gtfs_dataset_name`, 
`dim_gtfs_datasets`.`source_record_id` \nFROM `dim_gtfs_datasets` + \nWHERE `dim_gtfs_datasets`.`data_quality_pipeline` = true AND `dim_gtfs_datasets`.`type` + IN UNNEST(@`type_1`)", "maxResults": 5000, "requestId": "4ad53049-c5c4-4de2-976d-098818c8aa0a"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '791' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: '{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"gtfs_dataset_name","type":"STRING","mode":"NULLABLE"},{"name":"source_record_id","type":"STRING","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_KPiy6Hsqt6PebBTuY3ly7Hl-lnY4","location":"us-west2"},"totalRows":"2","rows":[{"f":[{"v":"c51dbfdd47838f86074c4ef3179cc9ed"},{"v":"Santa + Ynez Mecatran Schedule"},{"v":"recuWhPXfxMatv6rL"}]},{"f":[{"v":"372a06b593e1716d1c911b1d1d35bedd"},{"v":"Santa + Ynez Mecatran Schedule"},{"v":"recuWhPXfxMatv6rL"}]}],"totalBytesProcessed":"274","jobComplete":true,"cacheHit":false,"queryId":"job_KPiy6Hsqt6PebBTuY3ly7Hl-lnY4","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"10485760","totalSlotMs":"29","location":"us-west2","creationTime":"1762222202826","startTime":"1762222202875","endTime":"1762222202999"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Tue, 04 Nov 2025 02:10:03 GMT + Server: + - 
ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [{"parameterType": {"type": + "STRING"}, "parameterValue": {"value": "US/Pacific"}, "name": "datetime_1"}, + {"parameterType": {"type": "STRING"}, "parameterValue": {"value": "2025-10-16"}, + "name": "datetime_2"}, {"parameterType": {"type": "STRING"}, "parameterValue": + {"value": "US/Pacific"}, "name": "datetime_3"}, {"parameterType": {"type": "STRING"}, + "parameterValue": {"value": "2025-10-16"}, "name": "datetime_4"}], "parameterMode": + "NAMED", "useLegacySql": false, "formatOptions": {"useInt64Timestamp": true}, + "location": "us-west2", "query": "SELECT DISTINCT `dim_provider_gtfs_data`.`key`, + `dim_provider_gtfs_data`.`public_customer_facing_fixed_route`, `dim_provider_gtfs_data`.`public_customer_facing_or_regional_subfeed_fixed_route`, + `dim_provider_gtfs_data`.`organization_key`, `dim_provider_gtfs_data`.`organization_name`, + `dim_provider_gtfs_data`.`organization_itp_id`, `dim_provider_gtfs_data`.`organization_hubspot_company_record_id`, + `dim_provider_gtfs_data`.`organization_ntd_id`, `dim_provider_gtfs_data`.`organization_source_record_id`, + `dim_provider_gtfs_data`.`service_key`, `dim_provider_gtfs_data`.`service_name`, + `dim_provider_gtfs_data`.`service_source_record_id`, `dim_provider_gtfs_data`.`gtfs_service_data_customer_facing`, + `dim_provider_gtfs_data`.`regional_feed_type`, `dim_provider_gtfs_data`.`associated_schedule_gtfs_dataset_key`, + `dim_provider_gtfs_data`.`schedule_gtfs_dataset_name`, `dim_provider_gtfs_data`.`schedule_source_record_id`, + `dim_provider_gtfs_data`.`service_alerts_gtfs_dataset_name`, 
`dim_provider_gtfs_data`.`service_alerts_source_record_id`, + `dim_provider_gtfs_data`.`vehicle_positions_gtfs_dataset_name`, `dim_provider_gtfs_data`.`vehicle_positions_source_record_id`, + `dim_provider_gtfs_data`.`trip_updates_gtfs_dataset_name`, `dim_provider_gtfs_data`.`trip_updates_source_record_id`, + `dim_provider_gtfs_data`.`schedule_gtfs_dataset_key`, `dim_provider_gtfs_data`.`service_alerts_gtfs_dataset_key`, + `dim_provider_gtfs_data`.`vehicle_positions_gtfs_dataset_key`, `dim_provider_gtfs_data`.`trip_updates_gtfs_dataset_key`, + `dim_provider_gtfs_data`.`_valid_from`, `dim_provider_gtfs_data`.`_valid_to`, + `dim_provider_gtfs_data`.`_is_current` \nFROM `dim_provider_gtfs_data` \nWHERE + datetime(`dim_provider_gtfs_data`.`_valid_from`, @`datetime_1`) <= datetime(@`datetime_2`) + AND datetime(`dim_provider_gtfs_data`.`_valid_to`, @`datetime_3`) >= datetime(@`datetime_4`)", + "maxResults": 5000, "requestId": "f808da29-6929-424b-aaaf-44b6e83325b7"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '2638' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: 
'{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"key","type":"STRING","mode":"NULLABLE"},{"name":"public_customer_facing_fixed_route","type":"BOOLEAN","mode":"NULLABLE"},{"name":"public_customer_facing_or_regional_subfeed_fixed_route","type":"BOOLEAN","mode":"NULLABLE"},{"name":"organization_key","type":"STRING","mode":"NULLABLE"},{"name":"organization_name","type":"STRING","mode":"NULLABLE"},{"name":"organization_itp_id","type":"INTEGER","mode":"NULLABLE"},{"name":"organization_hubspot_company_record_id","type":"STRING","mode":"NULLABLE"},{"name":"organization_ntd_id","type":"STRING","mode":"NULLABLE"},{"name":"organization_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"service_key","type":"STRING","mode":"NULLABLE"},{"name":"service_name","type":"STRING","mode":"NULLABLE"},{"name":"service_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"gtfs_service_data_customer_facing","type":"BOOLEAN","mode":"NULLABLE"},{"name":"regional_feed_type","type":"STRING","mode":"NULLABLE"},{"name":"associated_schedule_gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"schedule_gtfs_dataset_name","type":"STRING","mode":"NULLABLE"},{"name":"schedule_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"service_alerts_gtfs_dataset_name","type":"STRING","mode":"NULLABLE"},{"name":"service_alerts_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"vehicle_positions_gtfs_dataset_name","type":"STRING","mode":"NULLABLE"},{"name":"vehicle_positions_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"trip_updates_gtfs_dataset_name","type":"STRING","mode":"NULLABLE"},{"name":"trip_updates_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"schedule_gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"service_alerts_gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"vehicle_positions_gtfs_dataset_key","type":"STRING","mode":"NULLABLE"},{"name":"trip_updates_gtfs_dataset_key","type":"STR
ING","mode":"NULLABLE"},{"name":"_valid_from","type":"TIMESTAMP","mode":"NULLABLE"},{"name":"_valid_to","type":"TIMESTAMP","mode":"NULLABLE"},{"name":"_is_current","type":"BOOLEAN","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_AQZpU_pJMrlYcZT4z7BY-op0WMyt","location":"us-west2"},"totalRows":"4","rows":[{"f":[{"v":"57321774eb04c7c650ef4709fb25a2c8"},{"v":"true"},{"v":"true"},{"v":"624c361fd9a2a9ad100cbf4b729c2df9"},{"v":"City + of Solvang"},{"v":"312"},{"v":"8719942682"},{"v":"9R02-91028"},{"v":"reckp33bhAuZlmO1M"},{"v":"55e1099a8610904b4479be54b36875d9"},{"v":"Santa + Ynez Valley Transit"},{"v":"recTlfGUkpnZ8cICb"},{"v":"true"},{"v":null},{"v":"bb227dec4b51d9d438125e38d0214784"},{"v":"Santa + Ynez Mecatran Schedule"},{"v":"recuWhPXfxMatv6rL"},{"v":null},{"v":null},{"v":"Santa + Ynez Vehicle Positions"},{"v":"reccgBgsKC5J7BD8K"},{"v":null},{"v":null},{"v":"372a06b593e1716d1c911b1d1d35bedd"},{"v":null},{"v":"40e0488e732e680cc9d7002fad71adad"},{"v":null},{"v":"1760572800000000"},{"v":"1761264000000000"},{"v":"true"}]},{"f":[{"v":"5beb8a6bf200cfdf00da66ac1521d526"},{"v":"false"},{"v":"false"},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":"1ba3e20e8edb9c64866206d138aaca83"},{"v":"Ridgecrest + Schedule"},{"v":"recb7v2QiflfKM77w"},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":"1ba3e20e8edb9c64866206d138aaca83"},{"v":null},{"v":null},{"v":null},{"v":"1749859200000000"},{"v":"4070908799999999"},{"v":"true"}]},{"f":[{"v":"145279bf2bb891a6f43abd24403a46d7"},{"v":"false"},{"v":"false"},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":"b6280b24ea50c714c38f0c02bd8d3a2e"},{"v":"Rosemead + Passio Schedule"},{"v":"reccLvZxTMz1FuNii"},{"v":"Rosemead Passio Alerts"},{"v":"recm7XryAyH4sP0ZF"},{"v":"Rosemead + Passio Vehicle 
Positions"},{"v":"rec1RocbLnc4A9eE4"},{"v":"Rosemead Passio + Trip Updates"},{"v":"recQtNJAv4UyQjMh4"},{"v":"b6280b24ea50c714c38f0c02bd8d3a2e"},{"v":"c766db2bb5dd244b0e2176f4dfc35999"},{"v":"ad67eda06e383bbd5c174eab700915fb"},{"v":"70b85693df485b227c4a1a87ac46d065"},{"v":"1753401600000000"},{"v":"4070908799999999"},{"v":"true"}]},{"f":[{"v":"90ebcb3f9a649985be20bc10aa3ae3d4"},{"v":"false"},{"v":"false"},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":null},{"v":"794534a42bc6cbb1f4f7bc4d4bb1be16"},{"v":"Avalon + Schedule Historic"},{"v":"recdcpS2s3wuT3QAG"},{"v":"Avalon Alerts Historic"},{"v":"recgHXDRuyrKbOD47"},{"v":"Avalon + Vehicle Positions"},{"v":"recvwGOdqwtvPzKhZ"},{"v":"Avalon Trip Updates"},{"v":"recaDdOPYGgkoCOoT"},{"v":"794534a42bc6cbb1f4f7bc4d4bb1be16"},{"v":"688fc08edb6dbba6ce92a11b36929993"},{"v":"66f7f7dfc9290a80d99b5b2e645d8659"},{"v":"b070831f700176fc88e2464cb353c24e"},{"v":"1749859200000000"},{"v":"4070908799999999"},{"v":"true"}]}],"totalBytesProcessed":"0","jobComplete":true,"cacheHit":true,"queryId":"job_AQZpU_pJMrlYcZT4z7BY-op0WMyt","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"0","location":"us-west2","creationTime":"1762222203561","startTime":"1762222203673","endTime":"1762222203739"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Tue, 04 Nov 2025 02:10:03 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [], "useLegacySql": false, + "formatOptions": {"useInt64Timestamp": true}, 
"location": "us-west2", "query": + "SELECT `dim_organizations`.`source_record_id` AS `organization_source_record_id`, + `dim_organizations`.`name` \nFROM `dim_organizations` \nWHERE `dim_organizations`.`_is_current` + = true", "maxResults": 5000, "requestId": "2fb96e6c-0ed8-4623-8d1c-e57c50682edf"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '519' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: '{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"organization_source_record_id","type":"STRING","mode":"NULLABLE"},{"name":"name","type":"STRING","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_kkCDlZxqgRvw-yupX4MBu3qq5DyL","location":"us-west2"},"totalRows":"4","rows":[{"f":[{"v":"reckp33bhAuZlmO1M"},{"v":"Santa + Ynez Band of Chumash Mission Indians of the Santa Ynez Reservation, California"}]},{"f":[{"v":"recyqZ1zbZMkeA7Vf"},{"v":"City + of Patterson"}]},{"f":[{"v":"recOT4QO6t6mRhUEu"},{"v":"City of Chula Vista"}]},{"f":[{"v":"reckGS8egMZryjbX7"},{"v":"City + of Mission Viejo"}]}],"totalBytesProcessed":"0","jobComplete":true,"cacheHit":true,"queryId":"job_kkCDlZxqgRvw-yupX4MBu3qq5DyL","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"0","location":"us-west2","creationTime":"1762222204432","startTime":"1762222204484","endTime":"1762222204543"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - 
Tue, 04 Nov 2025 02:10:04 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +- request: + body: '{"maximumBytesBilled": "5000000000", "defaultDataset": {"projectId": "cal-itp-data-infra-staging", + "datasetId": "test_shared_utils"}, "queryParameters": [{"parameterType": {"type": + "INT64"}, "parameterValue": {"value": "2"}, "name": "lpad_1"}, {"parameterType": + {"type": "STRING"}, "parameterValue": {"value": "0"}, "name": "lpad_2"}, {"parameterType": + {"type": "STRING"}, "parameterValue": {"value": " - "}, "name": "concat_1"}, + {"parameterType": {"type": "STRING"}, "parameterValue": {"value": "US/Pacific"}, + "name": "datetime_1"}, {"parameterType": {"type": "STRING"}, "parameterValue": + {"value": "2025-10-16"}, "name": "datetime_2"}, {"parameterType": {"type": "STRING"}, + "parameterValue": {"value": "US/Pacific"}, "name": "datetime_3"}, {"parameterType": + {"type": "STRING"}, "parameterValue": {"value": "2025-10-16"}, "name": "datetime_4"}], + "parameterMode": "NAMED", "useLegacySql": false, "formatOptions": {"useInt64Timestamp": + true}, "location": "us-west2", "query": "SELECT `bridge_organizations_x_headquarters_county_geography`.`organization_name`, + concat(lpad(CAST(`dim_county_geography`.`caltrans_district` AS STRING), @`lpad_1`, + @`lpad_2`), @`concat_1`, `dim_county_geography`.`caltrans_district_name`) AS + `caltrans_district`, `dim_county_geography`.`caltrans_district` AS `caltrans_district_1` + \nFROM `bridge_organizations_x_headquarters_county_geography` JOIN `dim_county_geography` + ON `dim_county_geography`.`key` = `bridge_organizations_x_headquarters_county_geography`.`county_geography_key` + \nWHERE datetime(`bridge_organizations_x_headquarters_county_geography`.`_valid_from`, + @`datetime_1`) <= datetime(@`datetime_2`) AND 
datetime(`bridge_organizations_x_headquarters_county_geography`.`_valid_to`, + @`datetime_3`) >= datetime(@`datetime_4`)", "maxResults": 5000, "requestId": + "ffa2a1c5-7a5d-4658-bcf5-d0c6a7951505"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip + Authorization: + - FILTERED + Connection: + - keep-alive + Content-Length: + - '1836' + Content-Type: + - application/json + User-Agent: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + X-Goog-API-Client: + - sqlalchemy/1.4.46 gl-python/3.11.13 grpc/1.76.0 gax/2.27.0 gapic/3.38.0 gccl/3.38.0 + x-goog-user-project: + - cal-itp-data-infra-staging + method: POST + uri: https://bigquery.googleapis.com/bigquery/v2/projects/cal-itp-data-infra-staging/queries?prettyPrint=false + response: + body: + string: '{"kind":"bigquery#queryResponse","schema":{"fields":[{"name":"organization_name","type":"STRING","mode":"NULLABLE"},{"name":"caltrans_district","type":"STRING","mode":"NULLABLE"},{"name":"caltrans_district_1","type":"INTEGER","mode":"NULLABLE"}]},"jobReference":{"projectId":"cal-itp-data-infra-staging","jobId":"job_srrtA0IksWjgeKI3I5Fh2r7a3kvp","location":"us-west2"},"totalRows":"1","rows":[{"f":[{"v":"Santa + Ynez Band of Chumash Mission Indians of the Santa Ynez Reservation, California"},{"v":"05 + - San Luis Obispo / Santa Barbara"},{"v":"5"}]}],"totalBytesProcessed":"0","jobComplete":true,"cacheHit":true,"queryId":"job_srrtA0IksWjgeKI3I5Fh2r7a3kvp","jobCreationReason":{"code":"REQUESTED"},"totalBytesBilled":"0","location":"us-west2","creationTime":"1762222205177","startTime":"1762222205235","endTime":"1762222205301"}' + headers: + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Content-Encoding: + - gzip + Content-Type: + - application/json; charset=UTF-8 + Date: + - Tue, 04 Nov 2025 02:10:05 GMT + Server: + - ESF + Transfer-Encoding: + - chunked + Vary: + - Origin + - X-Origin + - Referer + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - 
SAMEORIGIN + X-XSS-Protection: + - '0' + status: + code: 200 + message: OK +version: 1 diff --git a/_shared_utils/tests/shared_utils/test_catalog_utils.py b/_shared_utils/tests/shared_utils/test_catalog_utils.py index c77f8828d..239778419 100644 --- a/_shared_utils/tests/shared_utils/test_catalog_utils.py +++ b/_shared_utils/tests/shared_utils/test_catalog_utils.py @@ -7,7 +7,7 @@ class TestCatalogUtils: @pytest.fixture() - def setup(self, mocker, tmp_path: Path) -> Callable: + def setup(self, tmp_path: Path) -> Callable: def setup_with(nested_directory: str = "", in_home_directory: bool = True) -> list[Path]: home_path = tmp_path.joinpath("home") home_path.mkdir() @@ -24,10 +24,7 @@ def setup_with(nested_directory: str = "", in_home_directory: bool = True) -> li shared_utils_path = repo_path.joinpath("_shared_utils/shared_utils") shared_utils_path.mkdir(parents=True) - mocker.patch("shared_utils.catalog_utils.Path.home", return_value=home_path) - mocker.patch("shared_utils.catalog_utils.Path.cwd", return_value=current_path) - - return [current_path, shared_utils_path] + return [home_path, current_path, shared_utils_path] return setup_with @@ -35,36 +32,36 @@ def test_get_catalog_file_nonexistent( self, setup: Callable, ) -> None: - setup() + home_path, current_path, _ = setup() with pytest.raises(FileNotFoundError, match="No such catalog file found"): - catalog_utils.get_catalog_file("test-file") + catalog_utils.get_catalog_file("test-file", home_path, current_path) def test_get_catalog_file_repo_in_home_directory( self, setup: Callable, ) -> None: - _, shared_utils_path = setup() + home_path, current_path, shared_utils_path = setup() shared_utils_path.joinpath("test-file.yml").touch() - filename = catalog_utils.get_catalog_file("test-file") + filename = catalog_utils.get_catalog_file("test-file", home_path, current_path) assert filename == shared_utils_path.joinpath("test-file.yml") def test_get_catalog_file_repo_nested_in_home_directory( self, setup: Callable, ) 
-> None: - _, shared_utils_path = setup("caltrans/") + home_path, current_path, shared_utils_path = setup("caltrans/") shared_utils_path.joinpath("test-file.yml").touch() - filename = catalog_utils.get_catalog_file("test-file") + filename = catalog_utils.get_catalog_file("test-file", home_path, current_path) assert filename == shared_utils_path.joinpath("test-file.yml") def test_get_catalog_file_repo_outside_of_home_directory( self, setup: Callable, ) -> None: - setup(in_home_directory=False) + home_path, current_path, _ = setup(in_home_directory=False) with pytest.raises(RuntimeError, match="The data-analyses repo should be located in your home directory."): - catalog_utils.get_catalog_file("test-file") + catalog_utils.get_catalog_file("test-file", home_path, current_path) diff --git a/_shared_utils/tests/shared_utils/test_schedule_rt_utils.py b/_shared_utils/tests/shared_utils/test_schedule_rt_utils.py new file mode 100644 index 000000000..91bbd91a0 --- /dev/null +++ b/_shared_utils/tests/shared_utils/test_schedule_rt_utils.py @@ -0,0 +1,317 @@ +import pandas as pd +import pytest +from pytest_unordered import unordered +from shared_utils.schedule_rt_utils import ( + filter_dim_county_geography, + filter_dim_gtfs_datasets, + filter_dim_organizations, + get_organization_id, + get_schedule_gtfs_dataset_key, + sample_gtfs_dataset_key_to_organization_crosswalk, +) + + +class TestScheduleRtUtils: + @pytest.fixture + def project(self): + return "cal-itp-data-infra-staging" + + @pytest.fixture + def dataset(self): + return "test_shared_utils" + + @pytest.mark.vcr + def test_get_schedule_gtfs_dataset_key(self, project: str, dataset: str): + result = get_schedule_gtfs_dataset_key(project=project, dataset=dataset, date="2025-09-01") + + assert len(result) == 1 + assert result.gtfs_dataset_key.values[0] == "0089bd1b0a2b78a8590d8749737d7146" + assert result.feed_key.values[0] == "10f6bb537140b7e52c8f313f3d611f71" + + @pytest.mark.vcr + def test_filter_dim_gtfs_datasets(self, 
project: str, dataset: str): + result = filter_dim_gtfs_datasets(project=project, dataset=dataset) + + assert len(result) == 3 + assert result.to_dict(orient="records") == unordered( + [ + { + "gtfs_dataset_key": "c51dbfdd47838f86074c4ef3179cc9ed", + "gtfs_dataset_name": "Santa Ynez Mecatran Schedule", + "type": "schedule", + "regional_feed_type": None, + "uri": "http://app.mecatran.com/urb/ws/feed/c2l0ZT1zeXZ0O2NsaWVudD1zZWxmO2V4cGlyZT07dHlwZT1ndGZzO2tleT00MjcwNzQ0ZTY4NTAzOTMyMDIxMDdjNzI0MDRkMzYyNTM4MzI0YzI0", + "base64_url": "aHR0cDovL2FwcC5tZWNhdHJhbi5jb20vdXJiL3dzL2ZlZWQvYzJsMFpUMXplWFowTzJOc2FXVnVkRDF6Wld4bU8yVjRjR2x5WlQwN2RIbHdaVDFuZEdaek8ydGxlVDAwTWpjd056UTBaVFk0TlRBek9UTXlNREl4TURkak56STBNRFJrTXpZeU5UTTRNekkwWXpJMA==", + }, + { + "gtfs_dataset_key": "372a06b593e1716d1c911b1d1d35bedd", + "gtfs_dataset_name": "Santa Ynez Mecatran Schedule", + "type": "schedule", + "regional_feed_type": None, + "uri": "http://app.mecatran.com/urb/ws/feed/c2l0ZT1zeXZ0O2NsaWVudD1zZWxmO2V4cGlyZT07dHlwZT1ndGZzO2tleT00MjcwNzQ0ZTY4NTAzOTMyMDIxMDdjNzI0MDRkMzYyNTM4MzI0YzI0", + "base64_url": "aHR0cDovL2FwcC5tZWNhdHJhbi5jb20vdXJiL3dzL2ZlZWQvYzJsMFpUMXplWFowTzJOc2FXVnVkRDF6Wld4bU8yVjRjR2x5WlQwN2RIbHdaVDFuZEdaek8ydGxlVDAwTWpjd056UTBaVFk0TlRBek9UTXlNREl4TURkak56STBNRFJrTXpZeU5UTTRNekkwWXpJMA==", + }, + { + "gtfs_dataset_key": "8aed0709366badf9342e03d0a2d72b8d", + "gtfs_dataset_name": "SLO Trip Updates", + "type": "trip_updates", + "regional_feed_type": None, + "uri": "http://data.peaktransit.com/gtfsrt/1/TripUpdate.pb", + "base64_url": "aHR0cDovL2RhdGEucGVha3RyYW5zaXQuY29tL2d0ZnNydC8xL1RyaXBVcGRhdGUucGI=", + }, + ] + ) + + @pytest.mark.vcr + def test_filter_dim_gtfs_datasets_keep_cols_subset(self, project: str, dataset: str): + result = filter_dim_gtfs_datasets(keep_cols=["key", "name"], project=project, dataset=dataset) + + assert len(result) == 3 + assert result.to_dict(orient="records") == unordered( + [ + { + "gtfs_dataset_key": "c51dbfdd47838f86074c4ef3179cc9ed", + 
"gtfs_dataset_name": "Santa Ynez Mecatran Schedule", + }, + { + "gtfs_dataset_key": "372a06b593e1716d1c911b1d1d35bedd", + "gtfs_dataset_name": "Santa Ynez Mecatran Schedule", + }, + {"gtfs_dataset_key": "8aed0709366badf9342e03d0a2d72b8d", "gtfs_dataset_name": "SLO Trip Updates"}, + ] + ) + + @pytest.mark.vcr + def test_filter_dim_gtfs_datasets_custom_filtering(self, project: str, dataset: str): + result = filter_dim_gtfs_datasets(custom_filtering={"type": ["trip_updates"]}, project=project, dataset=dataset) + + assert len(result) == 1 + assert result.to_dict(orient="records") == unordered( + [ + { + "gtfs_dataset_key": "8aed0709366badf9342e03d0a2d72b8d", + "gtfs_dataset_name": "SLO Trip Updates", + "type": "trip_updates", + "regional_feed_type": None, + "uri": "http://data.peaktransit.com/gtfsrt/1/TripUpdate.pb", + "base64_url": "aHR0cDovL2RhdGEucGVha3RyYW5zaXQuY29tL2d0ZnNydC8xL1RyaXBVcGRhdGUucGI=", + } + ] + ) + + def test_filter_dim_gtfs_datasets_keep_cols_key_missing(self, project: str, dataset: str): + with pytest.raises(KeyError, match="Include key in keep_cols list"): + filter_dim_gtfs_datasets( + keep_cols=["name", "type", "regional_feed_type", "uri", "base64_url"], project=project, dataset=dataset + ) + + @pytest.mark.vcr + def test_get_organization_id_no_merge_cols(self, project: str, dataset: str): + dataframe = pd.DataFrame( + data=[ + { + "feed_key": "bc76f45fb4d8a3c1be8349ad3d085c3c", + "schedule_gtfs_dataset_key": "372a06b593e1716d1c911b1d1d35bedd", + "schedule_gtfs_dataset_name": "Santa Ynez Mecatran Schedule", + "type": "schedule", + "schedule_source_record_id": "recuWhPXfxMatv6rL", + "regional_feed_type": None, + "base64_url": "anything", + "uri": "http://www.example.com", + } + ] + ) + + with pytest.raises(IndexError, match="list index out of range"): + get_organization_id(df=dataframe, date="2025-10-12", project=project, dataset=dataset) + + def test_get_organization_id_invalid_merge_cols(self, project: str, dataset: str): + with 
pytest.raises(KeyError, match="Unable to detect which GTFS quartet"): + get_organization_id( + df=pd.DataFrame(), date="2025-10-12", merge_cols=["notreal"], project=project, dataset=dataset + ) + + @pytest.mark.vcr + def test_filter_dim_county_geography(self, project: str, dataset: str): + result = filter_dim_county_geography(project=project, dataset=dataset, date="2025-06-17") + + assert len(result) == 2 + assert result.to_dict(orient="records") == unordered( + [ + { + "organization_name": "City of Rosemead", + "caltrans_district": "07 - Los Angeles / Ventura", + }, + {"organization_name": "Via / Remix Inc.", "caltrans_district": "07 - Los Angeles / Ventura"}, + ] + ) + + @pytest.mark.vcr + def test_filter_dim_county_geography_date_unavailable(self, project: str, dataset: str): + result = filter_dim_county_geography(project=project, dataset=dataset, date="2024-01-17") + + assert len(result) == 0 + + @pytest.mark.vcr + def test_filter_dim_county_geography_additional_keep_cols(self, project: str, dataset: str): + result = filter_dim_county_geography( + project=project, + dataset=dataset, + date="2025-06-17", + keep_cols=["caltrans_district", "county_geography_name", "msa", "fips"], + ) + + assert len(result) == 2 + assert result.to_dict(orient="records") == unordered( + [ + { + "organization_name": "City of Rosemead", + "caltrans_district": "07 - Los Angeles / Ventura", + "county_geography_name": "Los Angeles", + "msa": "Los Angeles-Long Beach-Anaheim", + "fips": 6037, + }, + { + "organization_name": "Via / Remix Inc.", + "caltrans_district": "07 - Los Angeles / Ventura", + "county_geography_name": "Los Angeles", + "msa": "Los Angeles-Long Beach-Anaheim", + "fips": 6037, + }, + ] + ) + + @pytest.mark.vcr + def test_filter_dim_organizations(self, project: str, dataset: str): + result = filter_dim_organizations(project=project, dataset=dataset) + + assert len(result) == 3 + assert result.to_dict(orient="records") == unordered( + [ + {"organization_source_record_id": 
"reckGS8egMZryjbX7"}, + {"organization_source_record_id": "recyqZ1zbZMkeA7Vf"}, + {"organization_source_record_id": "recOT4QO6t6mRhUEu"}, + ] + ) + assert "coolplace33333bX7" not in result.organization_source_record_id.values + + @pytest.mark.vcr + def test_filter_dim_organizations_additional_keep_cols(self, project: str, dataset: str): + result = filter_dim_organizations( + project=project, dataset=dataset, keep_cols=["key", "name", "organization_type"] + ) + + assert len(result) == 3 + assert result.to_dict(orient="records") == unordered( + [ + { + "key": "35448956533b3ff4f8c9cf4e7886c974", + "name": "City of Mission Viejo", + "organization_type": "City/Town", + }, + { + "key": "02a0e06b1ddb80e5695fc82fcc0c3ccc", + "name": "City of Patterson", + "organization_type": "City/Town", + }, + { + "key": "4cb90bb76f9cd9472a2df6dd9014b4fa", + "name": "City of Chula Vista", + "organization_type": "City/Town", + }, + ] + ) + + @pytest.mark.vcr + def test_filter_dim_organizations_custom_filtering(self, project: str, dataset: str): + result = filter_dim_organizations( + custom_filtering={"name": ["City of Mission Viejo", "City of Patterson"]}, + project=project, + keep_cols=["name", "source_record_id"], + dataset=dataset, + ) + + assert len(result) == 2 + assert result.to_dict(orient="records") == unordered( + [ + {"name": "City of Mission Viejo", "organization_source_record_id": "reckGS8egMZryjbX7"}, + {"name": "City of Patterson", "organization_source_record_id": "recyqZ1zbZMkeA7Vf"}, + ] + ) + + @pytest.mark.vcr + def test_sample_gtfs_dataset_key_to_organization_crosswalk(self, project: str, dataset: str): + dataframe = pd.DataFrame( + data=[ + { + "gtfs_dataset_key": "372a06b593e1716d1c911b1d1d35bedd", + "feed_key": "bc76f45fb4d8a3c1be8349ad3d085c3c", + "name": "Santa Ynez Mecatran Schedule", + } + ] + ) + + result = sample_gtfs_dataset_key_to_organization_crosswalk( + df=dataframe, date="2025-10-16", quartet_data="schedule", project=project, dataset=dataset + ) + + 
assert len(result) == 1 + assert result.to_dict(orient="records") == unordered( + [ + { + "feed_key": "bc76f45fb4d8a3c1be8349ad3d085c3c", + "schedule_gtfs_dataset_key": "372a06b593e1716d1c911b1d1d35bedd", + "schedule_gtfs_dataset_name": "Santa Ynez Mecatran Schedule", + "type": "schedule", + "schedule_source_record_id": "recuWhPXfxMatv6rL", + "regional_feed_type": None, + "base64_url": "aHR0cDovL2FwcC5tZWNhdHJhbi5jb20vdXJiL3dzL2ZlZWQvYzJsMFpUMXplWFowTzJOc2FXVnVkRDF6Wld4bU8yVjRjR2x5WlQwN2RIbHdaVDFuZEdaek8ydGxlVDAwTWpjd056UTBaVFk0TlRBek9UTXlNREl4TURkak56STBNRFJrTXpZeU5UTTRNekkwWXpJMA==", + "uri": "http://app.mecatran.com/urb/ws/feed/c2l0ZT1zeXZ0O2NsaWVudD1zZWxmO2V4cGlyZT07dHlwZT1ndGZzO2tleT00MjcwNzQ0ZTY4NTAzOTMyMDIxMDdjNzI0MDRkMzYyNTM4MzI0YzI0", + "organization_source_record_id": "reckp33bhAuZlmO1M", + "organization_name": "Santa Ynez Band of Chumash Mission Indians of the Santa Ynez Reservation, California", + "caltrans_district": "05 - San Luis Obispo / Santa Barbara", + } + ] + ) + + @pytest.mark.vcr + def test_sample_gtfs_dataset_key_to_organization_crosswalk_subset_gtfs_dataset_cols( + self, project: str, dataset: str + ): + dataframe = pd.DataFrame( + data=[ + { + "gtfs_dataset_key": "372a06b593e1716d1c911b1d1d35bedd", + "feed_key": "bc76f45fb4d8a3c1be8349ad3d085c3c", + "name": "Santa Ynez Mecatran Schedule", + } + ] + ) + + result = sample_gtfs_dataset_key_to_organization_crosswalk( + df=dataframe, + date="2025-10-16", + quartet_data="schedule", + dim_gtfs_dataset_cols=[ + "key", + "name", + "source_record_id", + ], + project=project, + dataset=dataset, + ) + + assert len(result) == 1 + assert result.to_dict(orient="records") == unordered( + [ + { + "feed_key": "bc76f45fb4d8a3c1be8349ad3d085c3c", + "schedule_gtfs_dataset_key": "372a06b593e1716d1c911b1d1d35bedd", + "schedule_gtfs_dataset_name": "Santa Ynez Mecatran Schedule", + "schedule_source_record_id": "recuWhPXfxMatv6rL", + "organization_source_record_id": "reckp33bhAuZlmO1M", + "organization_name": "Santa 
Ynez Band of Chumash Mission Indians of the Santa Ynez Reservation, California", + "caltrans_district": "05 - San Luis Obispo / Santa Barbara", + } + ] + ) diff --git a/ca_transit_speed_maps/update_vars_index.py b/ca_transit_speed_maps/update_vars_index.py index b947690a7..5cb90c919 100644 --- a/ca_transit_speed_maps/update_vars_index.py +++ b/ca_transit_speed_maps/update_vars_index.py @@ -3,7 +3,7 @@ import datetime as dt import yaml -from shared_utils import rt_dates, catalog_utils, schedule_rt_utils +from shared_utils import rt_dates, catalog_utils from segment_speed_utils.project_vars import ( COMPILED_CACHED_VIEWS, diff --git a/gtfs_digest/19_airtable_portfolio_name.ipynb b/gtfs_digest/19_airtable_portfolio_name.ipynb index 60a58aceb..50c281521 100644 --- a/gtfs_digest/19_airtable_portfolio_name.ipynb +++ b/gtfs_digest/19_airtable_portfolio_name.ipynb @@ -24,7 +24,7 @@ "from calitp_data_analysis.tables import tbls\n", "db_engine = get_engine()\n", "from segment_speed_utils import helpers\n", - "from shared_utils import catalog_utils, gtfs_utils_v2, schedule_rt_utils\n", + "from shared_utils import gtfs_utils_v2\n", "from update_vars import GTFS_DATA_DICT, SCHED_GCS\n", "\n", "db_engine = get_engine()" diff --git a/gtfs_funnel/09_test_debug_concatenate_vp.ipynb b/gtfs_funnel/09_test_debug_concatenate_vp.ipynb index b638d7e67..423ccefb2 100644 --- a/gtfs_funnel/09_test_debug_concatenate_vp.ipynb +++ b/gtfs_funnel/09_test_debug_concatenate_vp.ipynb @@ -45,7 +45,6 @@ "from dask import delayed, compute\n", "from loguru import logger\n", "\n", - "from shared_utils import schedule_rt_utils\n", "from calitp_data_analysis import utils\n", "from update_vars import GTFS_DATA_DICT, SEGMENT_GCS\n", "\n", diff --git a/gtfs_schedule/06_Amanda_route_identification_over_time.ipynb b/gtfs_schedule/06_Amanda_route_identification_over_time.ipynb index bd1a71b1d..e051e9ab1 100644 --- a/gtfs_schedule/06_Amanda_route_identification_over_time.ipynb +++ 
b/gtfs_schedule/06_Amanda_route_identification_over_time.ipynb @@ -41,7 +41,6 @@ "from calitp_data_analysis import utils\n", "from loguru import logger\n", "\n", - "from shared_utils import schedule_rt_utils\n", "from shared_utils import geography_utils\n", "\n" ] diff --git a/gtfs_schedule/07_route_variance_identification.ipynb b/gtfs_schedule/07_route_variance_identification.ipynb index 088ad365a..543cf9342 100644 --- a/gtfs_schedule/07_route_variance_identification.ipynb +++ b/gtfs_schedule/07_route_variance_identification.ipynb @@ -75,7 +75,6 @@ "from loguru import logger\n", "from siuba import *\n", "\n", - "from shared_utils import schedule_rt_utils\n", "from shared_utils import geography_utils\n", "\n", "#rt_segment_speeds/segment_speed_utils\n", diff --git a/open_data/open_data_utils.py b/open_data/open_data_utils.py index 85e4480ed..e566c7f4e 100644 --- a/open_data/open_data_utils.py +++ b/open_data/open_data_utils.py @@ -8,7 +8,6 @@ import yaml from calitp_data_analysis import geography_utils -from shared_utils import gtfs_utils_v2, schedule_rt_utils, portfolio_utils from update_vars import TRAFFIC_OPS_GCS, analysis_date, GTFS_DATA_DICT, SCHED_GCS catalog = intake.open_catalog( diff --git a/rt_scheduled_v_ran/03_metrics_all_ops.ipynb b/rt_scheduled_v_ran/03_metrics_all_ops.ipynb index dffca70eb..94d2c57bd 100644 --- a/rt_scheduled_v_ran/03_metrics_all_ops.ipynb +++ b/rt_scheduled_v_ran/03_metrics_all_ops.ipynb @@ -33,7 +33,6 @@ "from calitp_data_analysis import utils\n", "\n", "# cd rt_segment_speeds && pip install -r requirements.txt && cd\n", - "from shared_utils import portfolio_utils, schedule_rt_utils\n", "from segment_speed_utils import helpers, sched_rt_utils, wrangle_shapes, segment_calcs" ] },