[CH] Add schema updater script and workflow to run it #6863

Draft · wants to merge 6 commits into main
44 changes: 44 additions & 0 deletions .github/workflows/check_clickhouse_schemas.yml
@@ -0,0 +1,44 @@
name: Check ClickHouse Schemas

on:
  schedule:
    # Every day
    - cron: "0 0 * * *"
  workflow_dispatch:
  pull_request:
    paths:
      - .github/workflows/check_clickhouse_schemas.yml
      - .github/scripts/check_clickhouse_schemas.py
      - clickhouse_db_schema/**

permissions:
  contents: write

jobs:
  check-clickhouse-schemas:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Check ClickHouse schemas
        run: |
          python3 -m pip install clickhouse-connect==0.8.18
          cd tools
          python3 -m torchci.clickhouse_database_schema_updater
        env:
          CLICKHOUSE_ENDPOINT: ${{ secrets.CLICKHOUSE_HUD_USER_URL }}
          CLICKHOUSE_USERNAME: ${{ secrets.CLICKHOUSE_HUD_USER_USERNAME }}
          CLICKHOUSE_PASSWORD: ${{ secrets.CLICKHOUSE_HUD_USER_PASSWORD }}

      - name: If there are changes, error
        run: |
          if [[ -n "$(git status --porcelain)" ]]; then
            echo "ClickHouse schemas are out of date. "\
              "Please run the update script (see previous step for the commands) and commit the changes. "\
              "If you are making a PR and the error is unrelated to your changes, you can ignore this error and merge the PR anyway. "\
              "If you want to exclude a certain file from this check, edit the exclusion list in the script."
            exit 1
          else
            echo "ClickHouse schemas are up to date."
          fi
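Note: the updater module invoked above (`tools/torchci/clickhouse_database_schema_updater.py`) is not included in this excerpt. The sketch below is only a rough illustration of how such a script could dump schemas with `clickhouse-connect` (the dependency pinned in the step above); the tracked-table list, exclusion handling, endpoint format, and output formatting are assumptions rather than the actual implementation.

```python
# Rough sketch only -- NOT the real torchci.clickhouse_database_schema_updater.
# Assumptions: CLICKHOUSE_ENDPOINT is a bare hostname, and each tracked table
# is written to clickhouse_db_schema/<db>.<table>/schema.sql (this repo's layout).
import os
from pathlib import Path

import clickhouse_connect

TRACKED_TABLES = ["default.workflow_run", "misc.oss_ci_cur"]  # illustrative list


def dump_schemas(repo_root: Path) -> None:
    client = clickhouse_connect.get_client(
        host=os.environ["CLICKHOUSE_ENDPOINT"],
        username=os.environ["CLICKHOUSE_USERNAME"],
        password=os.environ["CLICKHOUSE_PASSWORD"],
        secure=True,
    )
    for table in TRACKED_TABLES:
        # SHOW CREATE TABLE returns the full CREATE statement as a single string.
        ddl = client.query(f"SHOW CREATE TABLE {table}").result_rows[0][0]
        out = repo_root / "clickhouse_db_schema" / table / "schema.sql"
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(ddl + "\n")


if __name__ == "__main__":
    dump_schemas(Path(__file__).resolve().parents[2])  # assumes tools/torchci/ layout
```

With the schema files rewritten in place, the `git status --porcelain` check in the final workflow step is what actually fails the job when a tracked file has drifted.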
19 changes: 14 additions & 5 deletions clickhouse_db_schema/README.md
@@ -1,9 +1,18 @@
-# clickhouse db schema
-Table schemas used to create tables and materialized view tables in clickhouse.
+# ClickHouse Table Schemas
+Table schemas used to create tables and materialized view tables in ClickHouse.
 
-## Add new table
-Currently we do not have automation to upstream the table schema to clickhouse.
+Currently we do not have automation to upstream or downstream the table schema
+to ClickHouse. These are not synced or completely representative of what is in
+ClickHouse right now and should mainly be used as a reference.
+
+There is a script `tools/torchci/clickhouse_database_schema_updater.py` that
+makes it easier to update and add schemas for tracking.
+
+## Add new table
 Please follow [How-to-add-a-new-custom-table-on-ClickHouse](https://github.com/pytorch/test-infra/wiki/How-to-add-a-new-custom-table-on-ClickHouse).
 
-In order to create table or grant the permissions/roles in clickhouse, please reach out @clee2000 or @huydhn.
+In order to create a table or grant permissions/roles in ClickHouse, please reach out to @clee2000 or @huydhn.
 
+Page maintainers: @pytorch/pytorch-dev-infra
+<br>
+Last verified: 2025-06-24
238 changes: 238 additions & 0 deletions clickhouse_db_schema/default.workflow_run/schema.sql
@@ -0,0 +1,238 @@
CREATE TABLE default.workflow_run
(
`actor` Tuple(
avatar_url String,
events_url String,
followers_url String,
following_url String,
gists_url String,
gravatar_id String,
html_url String,
id Int64,
login String,
node_id String,
organizations_url String,
received_events_url String,
repos_url String,
site_admin Bool,
starred_url String,
subscriptions_url String,
type String,
url String),
`artifacts_url` String,
`cancel_url` String,
`check_suite_id` Int64,
`check_suite_node_id` String,
`check_suite_url` String,
`conclusion` String,
`created_at` DateTime64(9),
`display_title` String,
`dynamoKey` String,
`event` String,
`head_branch` String,
`head_commit` Tuple(
author Tuple(
email String,
name String),
committer Tuple(
email String,
name String),
id String,
message String,
timestamp DateTime64(9),
tree_id String) COMMENT 'Contains commit info including id (SHA) that matches workflow_job.head_sha',
`head_repository` Tuple(
archive_url String,
assignees_url String,
blobs_url String,
branches_url String,
collaborators_url String,
comments_url String,
commits_url String,
compare_url String,
contents_url String,
contributors_url String,
deployments_url String,
description String,
downloads_url String,
events_url String,
fork Bool,
forks_url String,
full_name String,
git_commits_url String,
git_refs_url String,
git_tags_url String,
hooks_url String,
html_url String,
id Int64,
issue_comment_url String,
issue_events_url String,
issues_url String,
keys_url String,
labels_url String,
languages_url String,
merges_url String,
milestones_url String,
name String,
node_id String,
notifications_url String,
owner Tuple(
avatar_url String,
events_url String,
followers_url String,
following_url String,
gists_url String,
gravatar_id String,
html_url String,
id Int64,
login String,
node_id String,
organizations_url String,
received_events_url String,
repos_url String,
site_admin Bool,
starred_url String,
subscriptions_url String,
type String,
url String),
private Bool,
pulls_url String,
releases_url String,
stargazers_url String,
statuses_url String,
subscribers_url String,
subscription_url String,
tags_url String,
teams_url String,
trees_url String,
url String),
`head_sha` String,
`html_url` String,
`id` Int64,
`jobs_url` String,
`logs_url` String,
`name` String,
`node_id` String,
`path` String,
`previous_attempt_url` String,
`pull_requests` Array(Tuple(
base Tuple(
ref String,
repo Tuple(
id Int64,
name String,
url String),
sha String),
head Tuple(
ref String,
repo Tuple(
id Int64,
name String,
url String),
sha String),
id Int64,
number Int64,
url String)),
`referenced_workflows` Array(Tuple(
path String,
ref String,
sha String)),
`repository` Tuple(
archive_url String,
assignees_url String,
blobs_url String,
branches_url String,
collaborators_url String,
comments_url String,
commits_url String,
compare_url String,
contents_url String,
contributors_url String,
deployments_url String,
description String,
downloads_url String,
events_url String,
fork Bool,
forks_url String,
full_name String,
git_commits_url String,
git_refs_url String,
git_tags_url String,
hooks_url String,
html_url String,
id Int64,
issue_comment_url String,
issue_events_url String,
issues_url String,
keys_url String,
labels_url String,
languages_url String,
merges_url String,
milestones_url String,
name String,
node_id String,
notifications_url String,
owner Tuple(
avatar_url String,
events_url String,
followers_url String,
following_url String,
gists_url String,
gravatar_id String,
html_url String,
id Int64,
login String,
node_id String,
organizations_url String,
received_events_url String,
repos_url String,
site_admin Bool,
starred_url String,
subscriptions_url String,
type String,
url String),
private Bool,
pulls_url String,
releases_url String,
stargazers_url String,
statuses_url String,
subscribers_url String,
subscription_url String,
tags_url String,
teams_url String,
trees_url String,
url String),
`rerun_url` String,
`run_attempt` Int64,
`run_number` Int64,
`run_started_at` DateTime64(9),
`status` String,
`triggering_actor` Tuple(
avatar_url String,
events_url String,
followers_url String,
following_url String,
gists_url String,
gravatar_id String,
html_url String,
id Int64,
login String,
node_id String,
organizations_url String,
received_events_url String,
repos_url String,
site_admin Bool,
starred_url String,
subscriptions_url String,
type String,
url String),
`updated_at` DateTime64(9),
`url` String,
`workflow_id` Int64,
`workflow_url` String,
`_inserted_at` DateTime MATERIALIZED now(),
INDEX status_index status TYPE bloom_filter GRANULARITY 1
)
ENGINE = SharedReplacingMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}')
ORDER BY (id, dynamoKey)
SETTINGS index_granularity = 8192
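The column comment on `head_commit` above notes that `head_commit.id` matches `workflow_job.head_sha`. As a hedged illustration of that relationship (the `default.workflow_job` table and the columns referenced here are assumptions; that table is not part of this diff), a lookup through `clickhouse-connect` might look like:

```python
# Illustrative join only: default.workflow_job and its head_sha/name/conclusion
# columns are assumed from the head_commit column comment, not from this PR.
import clickhouse_connect

client = clickhouse_connect.get_client(
    host="<clickhouse-host>", username="<user>", password="<password>"
)
rows = client.query(
    """
    SELECT r.name AS workflow, j.name AS job, j.conclusion
    FROM default.workflow_run AS r
    INNER JOIN default.workflow_job AS j
        ON j.head_sha = tupleElement(r.head_commit, 'id')
    WHERE r.status = 'completed'  -- the status bloom_filter index can help here
      AND r.created_at > now() - INTERVAL 1 DAY
    LIMIT 10
    """
).result_rows
for workflow, job, conclusion in rows:
    print(workflow, job, conclusion)
```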
clickhouse_db_schema/misc.oss_ci_cur/schema.sql
@@ -1,4 +1,5 @@
-CREATE TABLE misc.oss_ci_cur(
+CREATE TABLE misc.oss_ci_cur
+(
 `created` DateTime64(0, 'UTC'),
 `time` DateTime64(0, 'UTC'),
 `type` String,
@@ -7,16 +8,10 @@ CREATE TABLE misc.oss_ci_cur(
 `usage_type` String,
 `unit` String,
 `value` Float64,
-`extra_info` Map(String,String),
+`extra_info` Map(String, String),
 `tags` Array(String) DEFAULT []
 )
-ENGINE = ReplacingMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}')
+ENGINE = SharedReplacingMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}')
 PARTITION BY toYYYYMM(time)
-ORDER BY (
-type,
-time,
-granularity,
-instance_type,
-usage_type
-)
+ORDER BY (type, time, granularity, instance_type, usage_type)
 SETTINGS index_granularity = 8192
22 changes: 22 additions & 0 deletions clickhouse_db_schema/misc.oss_ci_queue_time_histogram/schema.sql
@@ -0,0 +1,22 @@
CREATE TABLE misc.oss_ci_queue_time_histogram
(
`created_time` DateTime64(0, 'UTC'),
`histogram_version` String,
`type` String,
`repo` String,
`time` DateTime64(0, 'UTC'),
`workflow_name` String,
`job_name` String,
`machine_type` String,
`histogram` Array(UInt64),
`total_count` UInt64,
`max_queue_time` UInt64,
`avg_queue_time` UInt64,
`runner_labels` Array(String),
`extra_info` Map(String, String)
)
ENGINE = SharedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}')
PARTITION BY toYYYYMM(time)
ORDER BY (type, repo, time, machine_type, job_name, workflow_name)
TTL toDate(time) + toIntervalYear(5)
SETTINGS index_granularity = 8192
23 changes: 11 additions & 12 deletions clickhouse_db_schema/misc.torchagent_feedback/schema.sql
@@ -1,12 +1,11 @@
-CREATE TABLE
-misc.torchagent_feedback (
-`user` String,
-`session_id` String,
-`history_key` String,
-`feedback` Int8,
-`time_inserted` DateTime64 (0, 'UTC')
-) ENGINE = SharedMergeTree ('/clickhouse/tables/{uuid}/{shard}', '{replica}')
-PARTITION BY
-toYYYYMM (time_inserted)
-ORDER BY
-(user, session_id, history_key, time_inserted) SETTINGS index_granularity = 8192
+CREATE TABLE misc.torchagent_feedback
+(
+`user` String,
+`session_id` String,
+`feedback` Int8,
+`time_inserted` DateTime64(0, 'UTC')
+)
+ENGINE = SharedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}')
+PARTITION BY toYYYYMM(time_inserted)
+ORDER BY (user, session_id, time_inserted)
+SETTINGS index_granularity = 8192