Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
name: pytest

on: [push]

env:
PROJECT_ID: 'cal-itp-data-infra-staging'
WORKLOAD_IDENTITY_PROVIDER: 'projects/473674835135/locations/global/workloadIdentityPools/github-actions/providers/data-analyses'
SERVICE_ACCOUNT: 'github-actions-service-account@cal-itp-data-infra-staging.iam.gserviceaccount.com'

jobs:
test:
name: Run tests
runs-on: ubuntu-latest

permissions:
contents: read
id-token: write

steps:
- name: Checkout
uses: actions/checkout@v5

- name: Authenticate Google Service Account
uses: google-github-actions/auth@v2
with:
create_credentials_file: true
project_id: ${{ env.PROJECT_ID }}
workload_identity_provider: ${{ env.WORKLOAD_IDENTITY_PROVIDER }}
service_account: ${{ env.SERVICE_ACCOUNT }}

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip'

- name: Install shared_utils dependencies
working-directory: _shared_utils/
run: pip install -r requirements.txt

- name: Run shared_utils tests
working-directory: _shared_utils/
run: pytest tests
9 changes: 8 additions & 1 deletion _shared_utils/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
-e .
altair-transform==0.2.0
calitp-data-analysis==2025.8.10
great_tables==0.16.1
intake==0.6.4
numba (>=0.62.1, <0.63.0)
numpy (>=1.26.4, <2.0.0)
omegaconf==2.3.0 # better yaml configuration
polars==1.22.0
pytest (>=8.4.1, <9.0.0)
quarto-cli==1.6.40
pytest-mock (>=3.15.1, <4.0.0)
pytest-recording (>=0.13.4,<0.14.0)
pytest-unordered (>=0.7.0,<0.8.0)
quarto==0.1.0
quarto-cli==1.6.40
vegafusion==2.0.2
vl-convert-python>=1.6.0
57 changes: 31 additions & 26 deletions _shared_utils/shared_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,32 @@
from . import (
arcgis_query,
catalog_utils,
dask_utils,
geo_utils,
gtfs_utils_v2,
portfolio_utils,
publish_utils,
rt_dates,
rt_utils,
schedule_rt_utils,
time_helpers,
)
import sys

__all__ = [
"arcgis_query",
"catalog_utils",
"dask_utils",
"geo_utils",
"gtfs_utils_v2",
"portfolio_utils",
"publish_utils",
"rt_dates",
"rt_utils",
"schedule_rt_utils",
"time_helpers",
]
# Load the package's submodules unless `sys` carries the `_called_from_test`
# marker attribute. When the marker is present nothing is imported and
# `__all__` is left undefined.
# NOTE(review): the marker is presumably set by the test suite's conftest
# before this package is imported — confirm where it is assigned.
if not hasattr(sys, "_called_from_test"):
    from . import (
        arcgis_query,
        catalog_utils,
        dask_utils,
        geo_utils,
        gtfs_utils_v2,
        portfolio_utils,
        publish_utils,
        rt_dates,
        rt_utils,
        schedule_rt_utils,
        time_helpers,
    )

    # Public names of the package: one entry per submodule imported above.
    __all__ = [
        "arcgis_query",
        "catalog_utils",
        "dask_utils",
        "geo_utils",
        "gtfs_utils_v2",
        "portfolio_utils",
        "publish_utils",
        "rt_dates",
        "rt_utils",
        "schedule_rt_utils",
        "time_helpers",
    ]
8 changes: 4 additions & 4 deletions _shared_utils/shared_utils/catalog_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@
shared_utils_directory = "data-analyses/_shared_utils/shared_utils/"


def get_catalog_file(catalog_name):
def get_catalog_file(catalog_name, home_path=Path.home(), current_path=Path.cwd()):
filename = f"{shared_utils_directory}{catalog_name}.yml"
parent_directory = Path.cwd()
parent_directory = current_path

if Path.home() not in Path.cwd().parents:
if home_path not in current_path.parents:
raise RuntimeError("The data-analyses repo should be located in your home directory.")

while True:
Expand All @@ -24,7 +24,7 @@ def get_catalog_file(catalog_name):
if test_path.is_file():
return test_path

if parent_directory == Path.home():
if parent_directory == home_path:
raise FileNotFoundError(f"No such catalog file found: {filename}")

parent_directory = parent_directory.parent
Expand Down
12 changes: 0 additions & 12 deletions _shared_utils/shared_utils/gtfs_utils_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,18 +443,6 @@ def hour_tuple_to_seconds(hour_tuple: tuple[int]) -> tuple[int]:
return (start_sec, end_sec)


def filter_start_end_ts(time_filters: dict, time_col: Literal["arrival", "departure"]) -> siuba.dply.verbs.Pipeable:
    """
    Build the siuba filter that keeps rows whose `{time_col}_sec` value
    falls inside the requested hour window (both bounds inclusive).

    The (start_hour, end_hour) tuple is read from `time_filters[time_col]`
    and converted to seconds with hour_tuple_to_seconds.
    """
    lower_bound, upper_bound = hour_tuple_to_seconds(time_filters[time_col])
    seconds_column = f"{time_col}_sec"

    return filter(
        _[seconds_column] >= lower_bound,
        _[seconds_column] <= upper_bound,
    )


def get_stop_times(
selected_date: Union[str, datetime.date],
operator_feeds: list[str] = [],
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from sqlalchemy import Boolean, Column, DateTime, String
from sqlalchemy.orm import declarative_base

# Declarative base created by this module; the model below subclasses it.
# NOTE(review): each call to declarative_base() yields an independent base, so
# models built on other bases will not share this one's metadata — confirm
# that is intended.
Base = declarative_base()


class BridgeOrganizationsXHeadquartersCountyGeography(Base):
    """SQLAlchemy declarative model for the
    `bridge_organizations_x_headquarters_county_geography` table.

    `organization_key` is the only column flagged as primary key.
    """

    __tablename__ = "bridge_organizations_x_headquarters_county_geography"

    # Key/name pairs for the two sides of the bridge.
    # NOTE(review): presumably these reference the organization and county
    # geography dimension rows; no foreign keys are declared here — confirm.
    organization_key = Column(String, primary_key=True)
    county_geography_key = Column(String)
    organization_name = Column(String)
    county_geography_name = Column(String)
    # NOTE(review): presumably row-versioning metadata (validity window plus a
    # "current row" flag) — confirm against the warehouse schema.
    _valid_from = Column(DateTime)
    _valid_to = Column(DateTime)
    _is_current = Column(Boolean)
22 changes: 22 additions & 0 deletions _shared_utils/shared_utils/models/dim_county_geography.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from sqlalchemy import Boolean, Column, DateTime, Integer, String
from sqlalchemy.orm import declarative_base

# Declarative base created by this module; the model below subclasses it.
# NOTE(review): each call to declarative_base() yields an independent base, so
# models built on other bases will not share this one's metadata — confirm
# that is intended.
Base = declarative_base()


class DimCountyGeography(Base):
    """SQLAlchemy declarative model for the `dim_county_geography` table.

    `key` is the only column flagged as primary key.
    """

    __tablename__ = "dim_county_geography"

    key = Column(String, primary_key=True)
    source_record_id = Column(String)
    name = Column(String)
    # Stored as Integer.
    # NOTE(review): if this is a FIPS code, leading zeros cannot be
    # represented in an integer column — confirm this is acceptable.
    fips = Column(Integer)
    msa = Column(String)
    caltrans_district = Column(Integer)
    caltrans_district_name = Column(String)
    place_geography = Column(String)
    # NOTE(review): presumably keys into the organization and service
    # dimensions; no foreign keys are declared here — confirm.
    organization_key = Column(String)
    service_key = Column(String)
    # NOTE(review): presumably row-versioning metadata (current-row flag plus
    # validity window) — confirm against the warehouse schema.
    _is_current = Column(Boolean)
    _valid_from = Column(DateTime)
    _valid_to = Column(DateTime)
32 changes: 32 additions & 0 deletions _shared_utils/shared_utils/models/dim_gtfs_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from sqlalchemy import Boolean, Column, Date, DateTime, String
from sqlalchemy.orm import declarative_base

# Declarative base created by this module; the model below subclasses it.
# NOTE(review): each call to declarative_base() yields an independent base, so
# models built on other bases will not share this one's metadata — confirm
# that is intended.
Base = declarative_base()


class DimGtfsDataset(Base):
    """SQLAlchemy declarative model for the `dim_gtfs_datasets` table.

    `key` is the only column flagged as primary key. Note that the table name
    is plural while the class name is singular.
    """

    __tablename__ = "dim_gtfs_datasets"

    key = Column(String, primary_key=True)
    source_record_id = Column(String)
    name = Column(String)
    # This attribute name shadows the builtin `type` within the class body.
    type = Column(String)
    regional_feed_type = Column(String)
    backdated_regional_feed_type = Column(String)
    uri = Column(String)
    future_uri = Column(String)
    deprecated_date = Column(Date)
    data_quality_pipeline = Column(Boolean)
    # The manual_check__* columns are all declared as String.
    # NOTE(review): the set of values they hold is not visible here — confirm.
    manual_check__link_to_dataset_on_website = Column(String)
    manual_check__accurate_shapes = Column(String)
    manual_check__data_license = Column(String)
    manual_check__authentication_acceptable = Column(String)
    manual_check__stable_url = Column(String)
    manual_check__localized_stop_tts = Column(String)
    manual_check__grading_scheme_v1 = Column(String)
    base64_url = Column(String)
    private_dataset = Column(Boolean)
    analysis_name = Column(String)
    # NOTE(review): presumably row-versioning metadata (current-row flag plus
    # validity window) — confirm against the warehouse schema.
    _is_current = Column(Boolean)
    _valid_from = Column(DateTime)
    _valid_to = Column(DateTime)
37 changes: 37 additions & 0 deletions _shared_utils/shared_utils/models/dim_organization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from sqlalchemy import Boolean, Column, DateTime, Integer, String
from sqlalchemy.orm import declarative_base

# Declarative base created by this module; the model below subclasses it.
# NOTE(review): each call to declarative_base() yields an independent base, so
# models built on other bases will not share this one's metadata — confirm
# that is intended.
Base = declarative_base()


class DimOrganization(Base):
    """SQLAlchemy declarative model for the `dim_organizations` table.

    `key` is the only column flagged as primary key. Note that the table name
    is plural while the class name is singular.
    """

    __tablename__ = "dim_organizations"

    key = Column(String, primary_key=True)
    source_record_id = Column(String)
    name = Column(String)
    organization_type = Column(String)
    # Declared as a single String column.
    # NOTE(review): if multiple roles are stored, the encoding (delimiter,
    # JSON, ...) is not visible here — confirm.
    roles = Column(String)
    itp_id = Column(Integer)
    details = Column(String)
    website = Column(String)
    reporting_category = Column(String)
    hubspot_company_record_id = Column(String)
    gtfs_static_status = Column(String)
    gtfs_realtime_status = Column(String)
    # NOTE(review): the name marks this column as deprecated — confirm whether
    # it is still populated.
    _deprecated__assessment_status = Column(Boolean)
    manual_check__contact_on_website = Column(String)
    alias = Column(String)
    is_public_entity = Column(Boolean)
    ntd_id = Column(String)
    ntd_agency_info_key = Column(String)
    ntd_id_2022 = Column(String)
    rtpa_key = Column(String)
    rtpa_name = Column(String)
    mpo_key = Column(String)
    mpo_name = Column(String)
    public_currently_operating = Column(Boolean)
    public_currently_operating_fixed_route = Column(Boolean)
    # NOTE(review): presumably row-versioning metadata (current-row flag plus
    # validity window) — confirm against the warehouse schema.
    _is_current = Column(Boolean)
    _valid_from = Column(DateTime)
    _valid_to = Column(DateTime)
39 changes: 39 additions & 0 deletions _shared_utils/shared_utils/models/dim_provider_gtfs_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from sqlalchemy import Boolean, Column, DateTime, Integer, String
from sqlalchemy.orm import declarative_base

# Declarative base created by this module; the model below subclasses it.
# NOTE(review): each call to declarative_base() yields an independent base, so
# models built on other bases will not share this one's metadata — confirm
# that is intended.
Base = declarative_base()


class DimProviderGtfsData(Base):
    """SQLAlchemy declarative model for the `dim_provider_gtfs_data` table.

    `key` is the only column flagged as primary key.
    """

    __tablename__ = "dim_provider_gtfs_data"

    key = Column(String, primary_key=True)
    public_customer_facing_fixed_route = Column(Boolean)
    public_customer_facing_or_regional_subfeed_fixed_route = Column(Boolean)
    # Organization attributes.
    # NOTE(review): presumably denormalized from the organization dimension;
    # no foreign keys are declared here — confirm.
    organization_key = Column(String)
    organization_name = Column(String)
    organization_itp_id = Column(Integer)
    organization_hubspot_company_record_id = Column(String)
    organization_ntd_id = Column(String)
    organization_source_record_id = Column(String)
    # Service attributes.
    service_key = Column(String)
    service_name = Column(String)
    service_source_record_id = Column(String)
    gtfs_service_data_customer_facing = Column(Boolean)
    regional_feed_type = Column(String)
    associated_schedule_gtfs_dataset_key = Column(String)
    # Name and source-record-id columns for four GTFS dataset kinds: schedule,
    # service alerts, vehicle positions and trip updates.
    schedule_gtfs_dataset_name = Column(String)
    schedule_source_record_id = Column(String)
    service_alerts_gtfs_dataset_name = Column(String)
    service_alerts_source_record_id = Column(String)
    vehicle_positions_gtfs_dataset_name = Column(String)
    vehicle_positions_source_record_id = Column(String)
    trip_updates_gtfs_dataset_name = Column(String)
    trip_updates_source_record_id = Column(String)
    # Dataset-key columns for the same four dataset kinds.
    schedule_gtfs_dataset_key = Column(String)
    service_alerts_gtfs_dataset_key = Column(String)
    vehicle_positions_gtfs_dataset_key = Column(String)
    trip_updates_gtfs_dataset_key = Column(String)
    # NOTE(review): presumably row-versioning metadata (validity window plus a
    # "current row" flag) — confirm against the warehouse schema.
    _valid_from = Column(DateTime)
    _valid_to = Column(DateTime)
    _is_current = Column(Boolean)
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from sqlalchemy import Boolean, Column, DateTime, Float, Integer, String
from sqlalchemy.orm import declarative_base

# Declarative base created by this module; the model below subclasses it.
# NOTE(review): each call to declarative_base() yields an independent base, so
# models built on other bases will not share this one's metadata — confirm
# that is intended.
Base = declarative_base()


class FctDailyFeedScheduledServiceSummary(Base):
    """SQLAlchemy declarative model for the
    `fct_daily_feed_scheduled_service_summary` table.

    The primary key is composite: `service_date`, `feed_key` and
    `gtfs_dataset_key` are all flagged `primary_key=True`.
    """

    __tablename__ = "fct_daily_feed_scheduled_service_summary"

    # NOTE(review): declared as DateTime although the name suggests a calendar
    # date — confirm DateTime (rather than Date) is intended.
    service_date = Column(DateTime, primary_key=True)
    feed_key = Column(String, primary_key=True)
    gtfs_dataset_key = Column(String, primary_key=True)
    # Summary measures.
    # NOTE(review): units are only implied by the names (hours, seconds) —
    # confirm against the upstream table definition.
    ttl_service_hours = Column(Float)
    n_trips = Column(Integer)
    first_departure_sec = Column(Integer)
    last_arrival_sec = Column(Integer)
    num_stop_times = Column(Integer)
    n_routes = Column(Integer)
    # Boolean warning flags.
    contains_warning_duplicate_stop_times_primary_key = Column(Boolean)
    contains_warning_duplicate_trip_primary_key = Column(Boolean)
    contains_warning_missing_foreign_key_stop_id = Column(Boolean)
Loading
Loading