From ff6b33b60614b6ce584f2146e20f42a83f18debe Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Thu, 7 Aug 2025 14:32:58 -0700 Subject: [PATCH 01/60] Working prototype --- CODEOWNERS | 3 ++- filepush/.gitignore | 1 + filepush/REDME.md | 13 ++++++++++ .../databricks_template_schema.json | 23 +++++++++++++++++ .../filepush-template/library/variables.tmpl | 3 +++ .../template/__preamble.tmpl | 5 ++++ .../{{.connector_name}}/databricks.yml.tmpl | 25 +++++++++++++++++++ .../{{.connector_name}}/env.json.tmpl | 3 +++ .../{{.connector_name}}_job.yml.tmpl | 11 ++++++++ .../{{.connector_name}}_pipeline.yml.tmpl | 13 ++++++++++ .../{{.connector_name}}_volume.yml.tmpl | 9 +++++++ .../{{.connector_name}}_ingestion.py.tmpl | 13 ++++++++++ ...connector_name}}_readfiles_kernel.sql.tmpl | 6 +++++ 13 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 filepush/.gitignore create mode 100644 filepush/REDME.md create mode 100644 filepush/filepush-template/databricks_template_schema.json create mode 100644 filepush/filepush-template/library/variables.tmpl create mode 100644 filepush/filepush-template/template/__preamble.tmpl create mode 100644 filepush/filepush-template/template/{{.connector_name}}/databricks.yml.tmpl create mode 100644 filepush/filepush-template/template/{{.connector_name}}/env.json.tmpl create mode 100644 filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl create mode 100644 filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_pipeline.yml.tmpl create mode 100644 filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_volume.yml.tmpl create mode 100644 filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl create mode 100644 filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_readfiles_kernel.sql.tmpl diff --git a/CODEOWNERS b/CODEOWNERS index d984c513..6350370d 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -11,6 +11,7 @@ conversational-agent-app @vivian-xie-db @yuanchaoma-db database-diagram-builder @alexott downstreams @nfx @alexott feature-registry-app @yang-chengg @mparkhe @mingyangge-db @stephanielu5 +filepush @chi-yang-db go-libs @nfx @alexott ip_access_list_analyzer @alexott ka-chat-bot @taiga-db @@ -19,4 +20,4 @@ runtime-packages @nfx @alexott sql_migration_copilot @robertwhiffin tacklebox @Jonathan-Choi uc-catalog-cloning @esiol-db @vasco-lopes -.github @nfx @alexott @gueniai \ No newline at end of file +.github @nfx @alexott @gueniai diff --git a/filepush/.gitignore b/filepush/.gitignore new file mode 100644 index 00000000..722d5e71 --- /dev/null +++ b/filepush/.gitignore @@ -0,0 +1 @@ +.vscode diff --git a/filepush/REDME.md b/filepush/REDME.md new file mode 100644 index 00000000..195287c9 --- /dev/null +++ b/filepush/REDME.md @@ -0,0 +1,13 @@ +--- +title: "Managed File Push" +language: python +author: "Chi Yang" +date: 2025-08-07 + +tags: +- ingestion +- file +- nocode +--- + +# Managed File Push diff --git a/filepush/filepush-template/databricks_template_schema.json b/filepush/filepush-template/databricks_template_schema.json new file mode 100644 index 00000000..6ce6f13b --- /dev/null +++ b/filepush/filepush-template/databricks_template_schema.json @@ -0,0 +1,23 @@ +{ + "properties": { + "connector_name": { + "type": "string", + "default": "filepushconnector", + "description": "Name of the filepush connector.", + "order": 1 + }, + "catalog_name": { + "type": "string", + 
"default": "{{default_catalog}}", + "description": "Name of the catalog where tables and pipelines will be created.", + "order": 2 + }, + "schema_name": { + "type": "string", + "default": "default", + "description": "Name of the schema where tables and pipelines will be created.", + "order": 3 + } + }, + "success_message": "\nYour bundle '{{.connector_name}}' has been created." +} diff --git a/filepush/filepush-template/library/variables.tmpl b/filepush/filepush-template/library/variables.tmpl new file mode 100644 index 00000000..a8acc137 --- /dev/null +++ b/filepush/filepush-template/library/variables.tmpl @@ -0,0 +1,3 @@ +{{ define `volume_path` -}} + /Volumes/{{.catalog_name}}/{{.schema_name}}/{{.connector_name}}_volume +{{- end }} diff --git a/filepush/filepush-template/template/__preamble.tmpl b/filepush/filepush-template/template/__preamble.tmpl new file mode 100644 index 00000000..b538c75a --- /dev/null +++ b/filepush/filepush-template/template/__preamble.tmpl @@ -0,0 +1,5 @@ +# Preamble + +This file only template directives; it is skipped for the actual output. + +{{skip "__preamble"}} diff --git a/filepush/filepush-template/template/{{.connector_name}}/databricks.yml.tmpl b/filepush/filepush-template/template/{{.connector_name}}/databricks.yml.tmpl new file mode 100644 index 00000000..50042472 --- /dev/null +++ b/filepush/filepush-template/template/{{.connector_name}}/databricks.yml.tmpl @@ -0,0 +1,25 @@ +# databricks.yml +# This is the configuration for the Databricks Asset Bundle {{.connector_name}}. + +bundle: + name: {{.connector_name}} + +include: + - resources/*.yml + +targets: + # The deployment targets. See https://docs.databricks.com/en/dev-tools/bundles/deployment-modes.html + dev: + mode: development + default: true + workspace: + host: {{workspace_host}} + + prod: + mode: production + workspace: + host: {{workspace_host}} + root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target} + permissions: + - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}} + level: CAN_MANAGE diff --git a/filepush/filepush-template/template/{{.connector_name}}/env.json.tmpl b/filepush/filepush-template/template/{{.connector_name}}/env.json.tmpl new file mode 100644 index 00000000..2f5917c4 --- /dev/null +++ b/filepush/filepush-template/template/{{.connector_name}}/env.json.tmpl @@ -0,0 +1,3 @@ +{ + "volume_path": "{{template `volume_path` .}}" +} diff --git a/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl b/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl new file mode 100644 index 00000000..095518de --- /dev/null +++ b/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl @@ -0,0 +1,11 @@ +# {{.connector_name}}_job.yml +# The main job for {{.connector_name}} + +resources: + jobs: + {{.connector_name}}_job: + name: {{.connector_name}}_job + tasks: + - task_key: {{.connector_name}}_pipeline_refresh + pipeline_task: + pipeline_id: ${resources.pipelines.{{.connector_name}}_pipeline.id} diff --git a/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_pipeline.yml.tmpl b/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_pipeline.yml.tmpl new file mode 100644 index 00000000..4e45c1a7 --- /dev/null +++ b/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_pipeline.yml.tmpl @@ -0,0 +1,13 @@ +# 
{{.connector_name}}_pipeline.yml +# The refresh pipeline for {{.connector_name}} + +resources: + pipelines: + {{.connector_name}}_pipeline: + name: {{.connector_name}}_pipeline + catalog: {{.catalog_name}} + schema: {{.schema_name}} + serverless: true + libraries: + - file: + path: ../src/pipelines/{{.connector_name}}_ingestion.py diff --git a/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_volume.yml.tmpl b/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_volume.yml.tmpl new file mode 100644 index 00000000..b3456934 --- /dev/null +++ b/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_volume.yml.tmpl @@ -0,0 +1,9 @@ +# {{.connector_name}}_volume.yml +# The volume for {{.connector_name}} + +resources: + volumes: + {{.connector_name}}_volume: + name: {{.connector_name}}_volume + catalog_name: {{.catalog_name}} + schema_name: {{.schema_name}} diff --git a/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl b/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl new file mode 100644 index 00000000..a1af93f1 --- /dev/null +++ b/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl @@ -0,0 +1,13 @@ +import dlt + +@dlt.table( + name="{{.connector_name}}_raw", + comment="A streaming table created by filepush bundle {{.connector_name}}.", + table_properties={ + "volume_path": "{{template `volume_path` .}}" + } +) +def {{.connector_name}}_raw(): + with open(f"./{{.connector_name}}_readfiles_kernel.sql", "r") as f: + kernel_query = f.read() + return spark.sql(kernel_query) diff --git a/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_readfiles_kernel.sql.tmpl b/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_readfiles_kernel.sql.tmpl new file mode 100644 index 00000000..a04774eb --- /dev/null +++ b/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_readfiles_kernel.sql.tmpl @@ -0,0 +1,6 @@ +SELECT + * +FROM + read_files( + '{{template `volume_path` .}}' + ) From 02da2bf646c3cf71689e4f7003a164de12d6bf08 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Thu, 7 Aug 2025 16:02:26 -0700 Subject: [PATCH 02/60] Add basic tool scripts --- .../tools/get_push_endpoint_from_table.sh.tmpl | 2 ++ .../template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl | 2 ++ .../template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl | 2 ++ 3 files changed, 6 insertions(+) create mode 100644 filepush/filepush-template/template/{{.connector_name}}/tools/get_push_endpoint_from_table.sh.tmpl create mode 100644 filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl create mode 100644 filepush/filepush-template/template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/get_push_endpoint_from_table.sh.tmpl b/filepush/filepush-template/template/{{.connector_name}}/tools/get_push_endpoint_from_table.sh.tmpl new file mode 100644 index 00000000..30a953a3 --- /dev/null +++ b/filepush/filepush-template/template/{{.connector_name}}/tools/get_push_endpoint_from_table.sh.tmpl @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +databricks tables get {{template `raw_table_name` .}} --output json | jq '.properties.volume_path' \ No newline at end of file 
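
The tool scripts in this patch are thin wrappers around the Databricks CLI. As a minimal usage sketch, assuming the template is rendered with the default connector_name of filepushconnector and that the .tmpl suffix is dropped when the template is materialized, the flow might look like:

    # Render the template, then deploy the resulting bundle to the dev target
    databricks bundle init ./filepush-template      # prompts for connector_name, catalog_name, schema_name
    cd filepushconnector
    databricks bundle deploy --target dev

    # Push a local file into the staging volume, then read back the push endpoint
    ./tools/upload_to_volume.sh ./sample.csv
    ./tools/get_push_endpoint_from_table.sh         # prints the volume_path table property via jq

Here sample.csv and the prompt order are illustrative assumptions; only the underlying subcommands (bundle init, bundle deploy, fs cp, tables get) are standard Databricks CLI calls.
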
diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl b/filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl new file mode 100644 index 00000000..4eb18a5a --- /dev/null +++ b/filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +databricks pipelines start-update --pipeline-id ${resources.pipelines.{{.connector_name}}_pipeline.id} \ No newline at end of file diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl b/filepush/filepush-template/template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl new file mode 100644 index 00000000..9a52bde2 --- /dev/null +++ b/filepush/filepush-template/template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +databricks fs cp $1 dbfs:{{template `volume_path` .}} \ No newline at end of file From a3a7683c00aabc5510bf9f2498aa17838479f91c Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Thu, 7 Aug 2025 16:02:51 -0700 Subject: [PATCH 03/60] Parameterize raw table name --- filepush/filepush-template/library/variables.tmpl | 4 ++++ .../src/pipelines/{{.connector_name}}_ingestion.py.tmpl | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/filepush/filepush-template/library/variables.tmpl b/filepush/filepush-template/library/variables.tmpl index a8acc137..e15d2bf5 100644 --- a/filepush/filepush-template/library/variables.tmpl +++ b/filepush/filepush-template/library/variables.tmpl @@ -1,3 +1,7 @@ {{ define `volume_path` -}} /Volumes/{{.catalog_name}}/{{.schema_name}}/{{.connector_name}}_volume {{- end }} + +{{ define `raw_table_name` -}} + {{.catalog_name}}/{{.schema_name}}/{{.connector_name}}_raw +{{- end}} diff --git a/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl b/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl index a1af93f1..16c50973 100644 --- a/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl +++ b/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl @@ -1,13 +1,13 @@ import dlt @dlt.table( - name="{{.connector_name}}_raw", - comment="A streaming table created by filepush bundle {{.connector_name}}.", + name="{{template `raw_table_name` .}}", + comment="A streaming table created by filepush bundle {{.connector_name}}. 
This holds the raw data from the uploaded files.", table_properties={ "volume_path": "{{template `volume_path` .}}" } ) -def {{.connector_name}}_raw(): +def {{template `raw_table_name` .}}(): with open(f"./{{.connector_name}}_readfiles_kernel.sql", "r") as f: kernel_query = f.read() return spark.sql(kernel_query) From 1d79e2baefd7a5c9e42ec3d1b3a9907c7569ffb5 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Thu, 7 Aug 2025 16:52:11 -0700 Subject: [PATCH 04/60] Finish the dev scripts --- filepush/.gitignore | 3 +++ filepush/filepush-template/library/variables.tmpl | 2 +- .../template/{{.connector_name}}/tools/get_pipeline_id.sh.tmpl | 2 ++ .../tools/get_push_endpoint_from_table.sh.tmpl | 0 .../template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl | 2 +- .../{{.connector_name}}/tools/upload_to_volume.sh.tmpl | 0 6 files changed, 7 insertions(+), 2 deletions(-) create mode 100755 filepush/filepush-template/template/{{.connector_name}}/tools/get_pipeline_id.sh.tmpl mode change 100644 => 100755 filepush/filepush-template/template/{{.connector_name}}/tools/get_push_endpoint_from_table.sh.tmpl mode change 100644 => 100755 filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl mode change 100644 => 100755 filepush/filepush-template/template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl diff --git a/filepush/.gitignore b/filepush/.gitignore index 722d5e71..0e53a123 100644 --- a/filepush/.gitignore +++ b/filepush/.gitignore @@ -1 +1,4 @@ .vscode +up.sh +down.sh +conf.json diff --git a/filepush/filepush-template/library/variables.tmpl b/filepush/filepush-template/library/variables.tmpl index e15d2bf5..8bd4c960 100644 --- a/filepush/filepush-template/library/variables.tmpl +++ b/filepush/filepush-template/library/variables.tmpl @@ -3,5 +3,5 @@ {{- end }} {{ define `raw_table_name` -}} - {{.catalog_name}}/{{.schema_name}}/{{.connector_name}}_raw + {{.connector_name}}_raw {{- end}} diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/get_pipeline_id.sh.tmpl b/filepush/filepush-template/template/{{.connector_name}}/tools/get_pipeline_id.sh.tmpl new file mode 100755 index 00000000..da3fd63f --- /dev/null +++ b/filepush/filepush-template/template/{{.connector_name}}/tools/get_pipeline_id.sh.tmpl @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +databricks bundle summary --output json | jq -r .resources.pipelines.{{.connector_name}}_pipeline.id \ No newline at end of file diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/get_push_endpoint_from_table.sh.tmpl b/filepush/filepush-template/template/{{.connector_name}}/tools/get_push_endpoint_from_table.sh.tmpl old mode 100644 new mode 100755 diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl b/filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl old mode 100644 new mode 100755 index 4eb18a5a..87df798f --- a/filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl +++ b/filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl @@ -1,2 +1,2 @@ #!/usr/bin/env bash -databricks pipelines start-update --pipeline-id ${resources.pipelines.{{.connector_name}}_pipeline.id} \ No newline at end of file +databricks pipelines start-update $($(dirname $0)/get_pipeline_id.sh) \ No newline at end of file diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl 
b/filepush/filepush-template/template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl old mode 100644 new mode 100755 From 6f2b5a40cc3b9fa170cd51da25a5a5e7f1a9ead4 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Thu, 7 Aug 2025 17:13:43 -0700 Subject: [PATCH 05/60] Fix an identifier in script --- filepush/.gitignore | 1 + .../tools/get_push_endpoint_from_table.sh.tmpl | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/filepush/.gitignore b/filepush/.gitignore index 0e53a123..2a722220 100644 --- a/filepush/.gitignore +++ b/filepush/.gitignore @@ -2,3 +2,4 @@ up.sh down.sh conf.json +filepushconnector/ diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/get_push_endpoint_from_table.sh.tmpl b/filepush/filepush-template/template/{{.connector_name}}/tools/get_push_endpoint_from_table.sh.tmpl index 30a953a3..7b6e488c 100755 --- a/filepush/filepush-template/template/{{.connector_name}}/tools/get_push_endpoint_from_table.sh.tmpl +++ b/filepush/filepush-template/template/{{.connector_name}}/tools/get_push_endpoint_from_table.sh.tmpl @@ -1,2 +1,2 @@ #!/usr/bin/env bash -databricks tables get {{template `raw_table_name` .}} --output json | jq '.properties.volume_path' \ No newline at end of file +databricks tables get {{.catalog_name}}.{{.schema_name}}.{{template `raw_table_name` .}} --output json | jq '.properties.volume_path' \ No newline at end of file From 37700e80c86a87f2fd5fbf6b6de19786da5557b3 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Fri, 8 Aug 2025 10:05:52 -0700 Subject: [PATCH 06/60] User file trigger --- filepush/filepush-template/library/variables.tmpl | 4 ++++ .../resources/{{.connector_name}}_job.yml.tmpl | 3 +++ .../{{.connector_name}}/tools/get_pipeline_id.sh.tmpl | 2 +- ...le.sh.tmpl => get_volume_path_from_table_property.sh.tmpl} | 2 +- .../{{.connector_name}}/tools/trigger_refresh.sh.tmpl | 2 +- .../{{.connector_name}}/tools/upload_to_volume.sh.tmpl | 2 +- 6 files changed, 11 insertions(+), 4 deletions(-) rename filepush/filepush-template/template/{{.connector_name}}/tools/{get_push_endpoint_from_table.sh.tmpl => get_volume_path_from_table_property.sh.tmpl} (96%) diff --git a/filepush/filepush-template/library/variables.tmpl b/filepush/filepush-template/library/variables.tmpl index 8bd4c960..0e642063 100644 --- a/filepush/filepush-template/library/variables.tmpl +++ b/filepush/filepush-template/library/variables.tmpl @@ -2,6 +2,10 @@ /Volumes/{{.catalog_name}}/{{.schema_name}}/{{.connector_name}}_volume {{- end }} +{{ define `volume_path_url` -}} + dbfs:{{template `volume_path` .}} +{{- end }} + {{ define `raw_table_name` -}} {{.connector_name}}_raw {{- end}} diff --git a/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl b/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl index 095518de..4beb422c 100644 --- a/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl +++ b/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl @@ -9,3 +9,6 @@ resources: - task_key: {{.connector_name}}_pipeline_refresh pipeline_task: pipeline_id: ${resources.pipelines.{{.connector_name}}_pipeline.id} + trigger: + file_arrival: + url: {{template `volume_path_url` .}} diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/get_pipeline_id.sh.tmpl b/filepush/filepush-template/template/{{.connector_name}}/tools/get_pipeline_id.sh.tmpl index 
da3fd63f..92645102 100755 --- a/filepush/filepush-template/template/{{.connector_name}}/tools/get_pipeline_id.sh.tmpl +++ b/filepush/filepush-template/template/{{.connector_name}}/tools/get_pipeline_id.sh.tmpl @@ -1,2 +1,2 @@ #!/usr/bin/env bash -databricks bundle summary --output json | jq -r .resources.pipelines.{{.connector_name}}_pipeline.id \ No newline at end of file +databricks bundle summary --output json | jq -r .resources.pipelines.{{.connector_name}}_pipeline.id diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/get_push_endpoint_from_table.sh.tmpl b/filepush/filepush-template/template/{{.connector_name}}/tools/get_volume_path_from_table_property.sh.tmpl similarity index 96% rename from filepush/filepush-template/template/{{.connector_name}}/tools/get_push_endpoint_from_table.sh.tmpl rename to filepush/filepush-template/template/{{.connector_name}}/tools/get_volume_path_from_table_property.sh.tmpl index 7b6e488c..5ecf816f 100755 --- a/filepush/filepush-template/template/{{.connector_name}}/tools/get_push_endpoint_from_table.sh.tmpl +++ b/filepush/filepush-template/template/{{.connector_name}}/tools/get_volume_path_from_table_property.sh.tmpl @@ -1,2 +1,2 @@ #!/usr/bin/env bash -databricks tables get {{.catalog_name}}.{{.schema_name}}.{{template `raw_table_name` .}} --output json | jq '.properties.volume_path' \ No newline at end of file +databricks tables get {{.catalog_name}}.{{.schema_name}}.{{template `raw_table_name` .}} --output json | jq '.properties.volume_path' diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl b/filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl index 87df798f..7e297d6a 100755 --- a/filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl +++ b/filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl @@ -1,2 +1,2 @@ #!/usr/bin/env bash -databricks pipelines start-update $($(dirname $0)/get_pipeline_id.sh) \ No newline at end of file +databricks bundle run {{.connector_name}}_pipeline diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl b/filepush/filepush-template/template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl index 9a52bde2..8eaad52e 100755 --- a/filepush/filepush-template/template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl +++ b/filepush/filepush-template/template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl @@ -1,2 +1,2 @@ #!/usr/bin/env bash -databricks fs cp $1 dbfs:{{template `volume_path` .}} \ No newline at end of file +databricks fs cp $1 {{template `volume_path_url` .}} From 2e0cbbf628ff092fc3bc7c6c2f37068b6c9e48c7 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Fri, 8 Aug 2025 10:20:09 -0700 Subject: [PATCH 07/60] Fix file trigger path --- filepush/filepush-template/library/variables.tmpl | 2 +- .../template/{{.connector_name}}/env.json.tmpl | 3 --- .../resources/{{.connector_name}}_job.yml.tmpl | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) delete mode 100644 filepush/filepush-template/template/{{.connector_name}}/env.json.tmpl diff --git a/filepush/filepush-template/library/variables.tmpl b/filepush/filepush-template/library/variables.tmpl index 0e642063..073bc1ba 100644 --- a/filepush/filepush-template/library/variables.tmpl +++ b/filepush/filepush-template/library/variables.tmpl @@ -1,5 +1,5 @@ {{ define `volume_path` -}} - /Volumes/{{.catalog_name}}/{{.schema_name}}/{{.connector_name}}_volume 
+ /Volumes/{{.catalog_name}}/{{.schema_name}}/{{.connector_name}}_volume/ {{- end }} {{ define `volume_path_url` -}} diff --git a/filepush/filepush-template/template/{{.connector_name}}/env.json.tmpl b/filepush/filepush-template/template/{{.connector_name}}/env.json.tmpl deleted file mode 100644 index 2f5917c4..00000000 --- a/filepush/filepush-template/template/{{.connector_name}}/env.json.tmpl +++ /dev/null @@ -1,3 +0,0 @@ -{ - "volume_path": "{{template `volume_path` .}}" -} diff --git a/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl b/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl index 4beb422c..cc416f49 100644 --- a/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl +++ b/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl @@ -11,4 +11,4 @@ resources: pipeline_id: ${resources.pipelines.{{.connector_name}}_pipeline.id} trigger: file_arrival: - url: {{template `volume_path_url` .}} + url: {{template `volume_path` .}} From 4fa17eb039d0a1400f9fb549c82a26fd9f0e2bd8 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Fri, 8 Aug 2025 14:39:22 -0700 Subject: [PATCH 08/60] Fix gitignore --- filepush/.gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/filepush/.gitignore b/filepush/.gitignore index 2a722220..0e53a123 100644 --- a/filepush/.gitignore +++ b/filepush/.gitignore @@ -2,4 +2,3 @@ up.sh down.sh conf.json -filepushconnector/ From 2a6062bd5fc85733e58651849b50e890bcaea23a Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Fri, 8 Aug 2025 14:55:50 -0700 Subject: [PATCH 09/60] Unpause file trigger by default --- .../resources/{{.connector_name}}_job.yml.tmpl | 1 + 1 file changed, 1 insertion(+) diff --git a/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl b/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl index cc416f49..e31dde57 100644 --- a/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl +++ b/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl @@ -12,3 +12,4 @@ resources: trigger: file_arrival: url: {{template `volume_path` .}} + pause_status: UNPAUSED From 26ff855b46c3865da594e1bd51a0ae89008cd11f Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Mon, 11 Aug 2025 15:44:59 -0700 Subject: [PATCH 10/60] Switch to streaming query --- .../src/pipelines/{{.connector_name}}_ingestion.py.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl b/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl index 16c50973..ce8cbc94 100644 --- a/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl +++ b/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl @@ -9,5 +9,5 @@ import dlt ) def {{template `raw_table_name` .}}(): with open(f"./{{.connector_name}}_readfiles_kernel.sql", "r") as f: - kernel_query = f.read() + kernel_query = kernel_query.replace("read_files(", "STREAM read_files(") return spark.sql(kernel_query) From c5212cfe4bfa7fd047f3cf56853da2d668881919 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 12 Aug 2025 
08:38:34 -0700 Subject: [PATCH 11/60] Successful dynamic table prototype --- .../filepush-template/library/variables.tmpl | 12 +++++ .../{{.connector_name}}_ingestion.py.tmpl | 54 +++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/filepush/filepush-template/library/variables.tmpl b/filepush/filepush-template/library/variables.tmpl index 073bc1ba..35950f56 100644 --- a/filepush/filepush-template/library/variables.tmpl +++ b/filepush/filepush-template/library/variables.tmpl @@ -2,6 +2,18 @@ /Volumes/{{.catalog_name}}/{{.schema_name}}/{{.connector_name}}_volume/ {{- end }} +{{ define `volume_data_path` -}} + dbfs:{{template `volume_path` .}}data/ +{{- end }} + +{{ define `volume_baddata_path` -}} + dbfs:{{template `volume_path` .}}baddata/ +{{- end }} + +{{ define `volume_archive_path` -}} + dbfs:{{template `volume_path` .}}archive/ +{{- end }} + {{ define `volume_path_url` -}} dbfs:{{template `volume_path` .}} {{- end }} diff --git a/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl b/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl index ce8cbc94..719eecc2 100644 --- a/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl +++ b/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl @@ -1,4 +1,6 @@ import dlt +from dbruntime.dbutils import FileInfo +import re @dlt.table( name="{{template `raw_table_name` .}}", @@ -9,5 +11,57 @@ import dlt ) def {{template `raw_table_name` .}}(): with open(f"./{{.connector_name}}_readfiles_kernel.sql", "r") as f: + kernel_query = f.read() kernel_query = kernel_query.replace("read_files(", "STREAM read_files(") return spark.sql(kernel_query) + +# Dynamic Tables +def sanitize_table_name(name: str) -> str: + """ + Make a valid, reasonably human-friendly table name from a folder name. + - Lowercase + - Replace non [a-z0-9_] with underscores + - Ensure it doesn't start with a digit + """ + n = name.strip().lower() + n = re.sub(r"[^a-z0-9_]", "_", n) + if re.match(r"^[0-9]", n): + n = f"t_{n}" + n = re.sub(r"_+", "_", n).strip("_") + return n or "t_unnamed" + +def dbfs_is_dir(f: FileInfo): + is_dir_attr = getattr(f, "isDir", None) + return is_dir_attr() if callable(is_dir_attr) else f.name.endswith("/") + +def list_immediate_subdirs(path: str): + items = dbutils.fs.ls(path) + out = [] + for f in items: + if dbfs_is_dir(f): + # f.name often ends with '/', drop it for a clean folder name + clean_name = f.name[:-1] if f.name.endswith("/") else f.name + out.append((clean_name, f.path.removeprefix('dbfs:'))) + return out + +def make_dlt_table(subdir_name: str, subdir_path: str): + """ + Defines a DLT table for a given subfolder at import time. + Uses Auto Loader (streaming) if `streaming=True`, else batch reader. 
+ """ + + table_name = sanitize_table_name(subdir_name) + + if len(dbutils.fs.ls(subdir_path)) > 0: + @dlt.table( + name=table_name, + comment=f"Auto-created from subfolder: {subdir_path} (streaming via Auto Loader)", + table_properties={ + "volume_path": f"{subdir_path}" + } + ) + def _auto_loader_table(): + return spark.readStream.format("cloudFiles").option("cloudFiles.format","csv").load(subdir_path) + +for subdir_name, subdir_path in list_immediate_subdirs('{{template `volume_path` .}}'): + make_dlt_table(subdir_name, subdir_path) \ No newline at end of file From efb8e4aa620b4a02c0d443c3c4f6ce88c0e7a570 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 12 Aug 2025 10:19:37 -0700 Subject: [PATCH 12/60] Eliminate connector name --- .../databricks_template_schema.json | 8 +------- filepush/filepush-template/library/variables.tmpl | 4 ++-- .../resources/{{.connector_name}}_job.yml.tmpl | 15 --------------- .../{{.connector_name}}_pipeline.yml.tmpl | 13 ------------- .../resources/{{.connector_name}}_volume.yml.tmpl | 9 --------- .../get_volume_path_from_table_property.sh.tmpl | 2 -- .../tools/trigger_refresh.sh.tmpl | 2 -- .../tools/upload_to_volume.sh.tmpl | 2 -- .../databricks.yml.tmpl | 4 ++-- .../{{.schema_name}}/resources/job.yml.tmpl | 15 +++++++++++++++ .../{{.schema_name}}/resources/pipeline.yml.tmpl | 14 ++++++++++++++ .../{{.schema_name}}/resources/schema.yml.tmpl | 7 +++++++ .../{{.schema_name}}/resources/volume.yml.tmpl | 8 ++++++++ .../pipelines/{{.schema_name}}_ingestion.py.tmpl} | 13 ------------- .../{{.schema_name}}_readfiles_kernel.sql.tmpl} | 0 .../tools/get_pipeline_id.sh.tmpl | 2 +- .../get_volume_path_from_table_property.sh.tmpl | 2 ++ .../tools/trigger_refresh.sh.tmpl | 2 ++ .../tools/upload_to_volume.sh.tmpl | 2 ++ 19 files changed, 56 insertions(+), 68 deletions(-) delete mode 100644 filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl delete mode 100644 filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_pipeline.yml.tmpl delete mode 100644 filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_volume.yml.tmpl delete mode 100755 filepush/filepush-template/template/{{.connector_name}}/tools/get_volume_path_from_table_property.sh.tmpl delete mode 100755 filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl delete mode 100755 filepush/filepush-template/template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl rename filepush/filepush-template/template/{{{.connector_name}} => {{.schema_name}}}/databricks.yml.tmpl (83%) create mode 100644 filepush/filepush-template/template/{{.schema_name}}/resources/job.yml.tmpl create mode 100644 filepush/filepush-template/template/{{.schema_name}}/resources/pipeline.yml.tmpl create mode 100644 filepush/filepush-template/template/{{.schema_name}}/resources/schema.yml.tmpl create mode 100644 filepush/filepush-template/template/{{.schema_name}}/resources/volume.yml.tmpl rename filepush/filepush-template/template/{{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl => {{.schema_name}}/src/pipelines/{{.schema_name}}_ingestion.py.tmpl} (77%) rename filepush/filepush-template/template/{{{.connector_name}}/src/pipelines/{{.connector_name}}_readfiles_kernel.sql.tmpl => {{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl} (100%) rename filepush/filepush-template/template/{{{.connector_name}} => {{.schema_name}}}/tools/get_pipeline_id.sh.tmpl (69%) create 
mode 100755 filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh.tmpl create mode 100755 filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl create mode 100755 filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh.tmpl diff --git a/filepush/filepush-template/databricks_template_schema.json b/filepush/filepush-template/databricks_template_schema.json index 6ce6f13b..256f185a 100644 --- a/filepush/filepush-template/databricks_template_schema.json +++ b/filepush/filepush-template/databricks_template_schema.json @@ -1,11 +1,5 @@ { "properties": { - "connector_name": { - "type": "string", - "default": "filepushconnector", - "description": "Name of the filepush connector.", - "order": 1 - }, "catalog_name": { "type": "string", "default": "{{default_catalog}}", @@ -19,5 +13,5 @@ "order": 3 } }, - "success_message": "\nYour bundle '{{.connector_name}}' has been created." + "success_message": "\nYour file push bundle in {{.catalog_name}}.{{.schema_name}} has been created." } diff --git a/filepush/filepush-template/library/variables.tmpl b/filepush/filepush-template/library/variables.tmpl index 35950f56..34b8f79e 100644 --- a/filepush/filepush-template/library/variables.tmpl +++ b/filepush/filepush-template/library/variables.tmpl @@ -1,5 +1,5 @@ {{ define `volume_path` -}} - /Volumes/{{.catalog_name}}/{{.schema_name}}/{{.connector_name}}_volume/ + /Volumes/{{.catalog_name}}/{{.schema_name}}/{{.schema_name}}_volume/ {{- end }} {{ define `volume_data_path` -}} @@ -18,6 +18,6 @@ dbfs:{{template `volume_path` .}} {{- end }} -{{ define `raw_table_name` -}} +{{ define `raw_table_name_format` -}} {{.connector_name}}_raw {{- end}} diff --git a/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl b/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl deleted file mode 100644 index e31dde57..00000000 --- a/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_job.yml.tmpl +++ /dev/null @@ -1,15 +0,0 @@ -# {{.connector_name}}_job.yml -# The main job for {{.connector_name}} - -resources: - jobs: - {{.connector_name}}_job: - name: {{.connector_name}}_job - tasks: - - task_key: {{.connector_name}}_pipeline_refresh - pipeline_task: - pipeline_id: ${resources.pipelines.{{.connector_name}}_pipeline.id} - trigger: - file_arrival: - url: {{template `volume_path` .}} - pause_status: UNPAUSED diff --git a/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_pipeline.yml.tmpl b/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_pipeline.yml.tmpl deleted file mode 100644 index 4e45c1a7..00000000 --- a/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_pipeline.yml.tmpl +++ /dev/null @@ -1,13 +0,0 @@ -# {{.connector_name}}_pipeline.yml -# The refresh pipeline for {{.connector_name}} - -resources: - pipelines: - {{.connector_name}}_pipeline: - name: {{.connector_name}}_pipeline - catalog: {{.catalog_name}} - schema: {{.schema_name}} - serverless: true - libraries: - - file: - path: ../src/pipelines/{{.connector_name}}_ingestion.py diff --git a/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_volume.yml.tmpl b/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_volume.yml.tmpl deleted file mode 100644 index b3456934..00000000 --- 
a/filepush/filepush-template/template/{{.connector_name}}/resources/{{.connector_name}}_volume.yml.tmpl +++ /dev/null @@ -1,9 +0,0 @@ -# {{.connector_name}}_volume.yml -# The volume for {{.connector_name}} - -resources: - volumes: - {{.connector_name}}_volume: - name: {{.connector_name}}_volume - catalog_name: {{.catalog_name}} - schema_name: {{.schema_name}} diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/get_volume_path_from_table_property.sh.tmpl b/filepush/filepush-template/template/{{.connector_name}}/tools/get_volume_path_from_table_property.sh.tmpl deleted file mode 100755 index 5ecf816f..00000000 --- a/filepush/filepush-template/template/{{.connector_name}}/tools/get_volume_path_from_table_property.sh.tmpl +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -databricks tables get {{.catalog_name}}.{{.schema_name}}.{{template `raw_table_name` .}} --output json | jq '.properties.volume_path' diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl b/filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl deleted file mode 100755 index 7e297d6a..00000000 --- a/filepush/filepush-template/template/{{.connector_name}}/tools/trigger_refresh.sh.tmpl +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -databricks bundle run {{.connector_name}}_pipeline diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl b/filepush/filepush-template/template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl deleted file mode 100755 index 8eaad52e..00000000 --- a/filepush/filepush-template/template/{{.connector_name}}/tools/upload_to_volume.sh.tmpl +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -databricks fs cp $1 {{template `volume_path_url` .}} diff --git a/filepush/filepush-template/template/{{.connector_name}}/databricks.yml.tmpl b/filepush/filepush-template/template/{{.schema_name}}/databricks.yml.tmpl similarity index 83% rename from filepush/filepush-template/template/{{.connector_name}}/databricks.yml.tmpl rename to filepush/filepush-template/template/{{.schema_name}}/databricks.yml.tmpl index 50042472..e1fe600b 100644 --- a/filepush/filepush-template/template/{{.connector_name}}/databricks.yml.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/databricks.yml.tmpl @@ -1,8 +1,8 @@ # databricks.yml -# This is the configuration for the Databricks Asset Bundle {{.connector_name}}. +# This is the configuration for the file push DAB {{.schema_name}}. 
bundle: - name: {{.connector_name}} + name: {{.schema_name}} include: - resources/*.yml diff --git a/filepush/filepush-template/template/{{.schema_name}}/resources/job.yml.tmpl b/filepush/filepush-template/template/{{.schema_name}}/resources/job.yml.tmpl new file mode 100644 index 00000000..97d09b12 --- /dev/null +++ b/filepush/filepush-template/template/{{.schema_name}}/resources/job.yml.tmpl @@ -0,0 +1,15 @@ +# The main job for schema {{.schema_name}} +# This job will trigger in the schema pipeline + +resources: + jobs: + {{.schema_name}}_job: + name: {{.schema_name}}_job + tasks: + - task_key: {{.schema_name}}_pipeline_refresh + pipeline_task: + pipeline_id: ${resources.pipelines.{{.schema_name}}_pipeline.id} + trigger: + file_arrival: + url: /Volumes/{{.catalog_name}}/${resources.schemas.{{.schema_name}}.name}/{{.schema_name}}_volume/ + pause_status: UNPAUSED diff --git a/filepush/filepush-template/template/{{.schema_name}}/resources/pipeline.yml.tmpl b/filepush/filepush-template/template/{{.schema_name}}/resources/pipeline.yml.tmpl new file mode 100644 index 00000000..7c609a38 --- /dev/null +++ b/filepush/filepush-template/template/{{.schema_name}}/resources/pipeline.yml.tmpl @@ -0,0 +1,14 @@ +# The table refresh pipeline for schema {{.schema_name}} + +resources: + pipelines: + {{.schema_name}}_pipeline: + name: {{.schema_name}}_pipeline + catalog: {{.catalog_name}} + schema: {{.schema_name}} + serverless: true + libraries: + - file: + path: ../src/pipelines/{{.schema_name}}_ingestion.py + configuration: + volume_path: /Volumes/{{.catalog_name}}/${resources.schemas.{{.schema_name}}.name}/{{.schema_name}}_volume/ diff --git a/filepush/filepush-template/template/{{.schema_name}}/resources/schema.yml.tmpl b/filepush/filepush-template/template/{{.schema_name}}/resources/schema.yml.tmpl new file mode 100644 index 00000000..abfc1ba1 --- /dev/null +++ b/filepush/filepush-template/template/{{.schema_name}}/resources/schema.yml.tmpl @@ -0,0 +1,7 @@ +# The schema {{.schema_name}} + +resources: + schemas: + {{.schema_name}}: + name: {{.schema_name}} + catalog_name: {{.catalog_name}} \ No newline at end of file diff --git a/filepush/filepush-template/template/{{.schema_name}}/resources/volume.yml.tmpl b/filepush/filepush-template/template/{{.schema_name}}/resources/volume.yml.tmpl new file mode 100644 index 00000000..ce3782bd --- /dev/null +++ b/filepush/filepush-template/template/{{.schema_name}}/resources/volume.yml.tmpl @@ -0,0 +1,8 @@ +# The file staging volume for schema{{.schema_name}} + +resources: + volumes: + {{.schema_name}}_volume: + name: {{.schema_name}}_volume + catalog_name: {{.catalog_name}} + schema_name: {{.schema_name}} diff --git a/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl b/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_ingestion.py.tmpl similarity index 77% rename from filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl rename to filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_ingestion.py.tmpl index 719eecc2..05b7f44d 100644 --- a/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_ingestion.py.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_ingestion.py.tmpl @@ -2,19 +2,6 @@ import dlt from dbruntime.dbutils import FileInfo import re -@dlt.table( - name="{{template `raw_table_name` .}}", - 
comment="A streaming table created by filepush bundle {{.connector_name}}. This holds the raw data from the uploaded files.", - table_properties={ - "volume_path": "{{template `volume_path` .}}" - } -) -def {{template `raw_table_name` .}}(): - with open(f"./{{.connector_name}}_readfiles_kernel.sql", "r") as f: - kernel_query = f.read() - kernel_query = kernel_query.replace("read_files(", "STREAM read_files(") - return spark.sql(kernel_query) - # Dynamic Tables def sanitize_table_name(name: str) -> str: """ diff --git a/filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_readfiles_kernel.sql.tmpl b/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl similarity index 100% rename from filepush/filepush-template/template/{{.connector_name}}/src/pipelines/{{.connector_name}}_readfiles_kernel.sql.tmpl rename to filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl diff --git a/filepush/filepush-template/template/{{.connector_name}}/tools/get_pipeline_id.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/get_pipeline_id.sh.tmpl similarity index 69% rename from filepush/filepush-template/template/{{.connector_name}}/tools/get_pipeline_id.sh.tmpl rename to filepush/filepush-template/template/{{.schema_name}}/tools/get_pipeline_id.sh.tmpl index 92645102..a0ffe5e3 100755 --- a/filepush/filepush-template/template/{{.connector_name}}/tools/get_pipeline_id.sh.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/get_pipeline_id.sh.tmpl @@ -1,2 +1,2 @@ #!/usr/bin/env bash -databricks bundle summary --output json | jq -r .resources.pipelines.{{.connector_name}}_pipeline.id +databricks bundle summary --output json | jq -r .resources.pipelines.{{.schema_name}}_pipeline.id diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh.tmpl new file mode 100755 index 00000000..7e2ac123 --- /dev/null +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh.tmpl @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +databricks tables get {{.catalog_name}}.{{.schema_name}}.$1 --output json | jq '.properties.volume_path' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl new file mode 100755 index 00000000..2206e20f --- /dev/null +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +databricks bundle run {{.schema_name}}_pipeline diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh.tmpl new file mode 100755 index 00000000..285d1ae1 --- /dev/null +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh.tmpl @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +databricks fs cp $2 {{template `volume_path_url` .}}/$1 From 44e4ce6b4d90518154b2e39e00f5a62dd6e2ddde Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 12 Aug 2025 10:47:11 -0700 Subject: [PATCH 13/60] Fix shell script --- .../template/{{.schema_name}}/tools/env.sh.tmpl | 14 ++++++++++++++ .../{{.schema_name}}/tools/get_pipeline_id.sh.tmpl | 2 -- 
.../get_volume_path_from_table_property.sh.tmpl | 3 ++- .../tools/upload_to_volume.sh.tmpl | 4 +++- 4 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl delete mode 100755 filepush/filepush-template/template/{{.schema_name}}/tools/get_pipeline_id.sh.tmpl diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl new file mode 100644 index 00000000..619cef3c --- /dev/null +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# Prevent running directly; this file must be *sourced* +(return 0 2>/dev/null) || { echo "Source this file: . $(basename "$0")"; exit 1; } + +# Idempotent guard +if [[ -n "${_FILEPUSH_ENV_LOADED:-}" ]]; then + return 0 +fi +export _FILEPUSH_ENV_LOADED=1 + +summary=$(databricks bundle summary --output json) +export FILEPUSH_CATALOG_NAME={{.catalog_name}} +export FILEPUSH_SCHEMA_NAME=$(echo $summary | jq -r '.resources.schemas.{{.schema_name}}.name') +export FILEPUSH_VOLUME_PATH=/Volumes/{{.catalog_name}}/${FILEPUSH_SCHEMA_NAME}/{{.schema_name}}_volume/ diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/get_pipeline_id.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/get_pipeline_id.sh.tmpl deleted file mode 100755 index a0ffe5e3..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/get_pipeline_id.sh.tmpl +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -databricks bundle summary --output json | jq -r .resources.pipelines.{{.schema_name}}_pipeline.id diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh.tmpl index 7e2ac123..744a80b0 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh.tmpl @@ -1,2 +1,3 @@ #!/usr/bin/env bash -databricks tables get {{.catalog_name}}.{{.schema_name}}.$1 --output json | jq '.properties.volume_path' +. $(dirname $0)/env.sh +databricks tables get ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME}.$1 --output json | jq '.properties.volume_path' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh.tmpl index 285d1ae1..1ebd46d4 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh.tmpl @@ -1,2 +1,4 @@ #!/usr/bin/env bash -databricks fs cp $2 {{template `volume_path_url` .}}/$1 +. 
$(dirname $0)/env.sh +databricks fs mkdir dbfs:${FILEPUSH_VOLUME_PATH}$1/ +databricks fs cp $2 dbfs:${FILEPUSH_VOLUME_PATH}$1/ From 9a9e93ba68f9372b11515b26b03538f360cdce36 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 12 Aug 2025 14:27:29 -0700 Subject: [PATCH 14/60] Working prototype for dynamic table kernel --- .../resources/volume.yml.tmpl | 4 +- .../{{.schema_name}}_ingestion.py.tmpl | 42 +++++++++++++++---- ...{{.schema_name}}_readfiles_kernel.sql.tmpl | 9 +++- .../tools/trigger_refresh.sh.tmpl | 1 + 4 files changed, 46 insertions(+), 10 deletions(-) diff --git a/filepush/filepush-template/template/{{.schema_name}}/resources/volume.yml.tmpl b/filepush/filepush-template/template/{{.schema_name}}/resources/volume.yml.tmpl index ce3782bd..95904249 100644 --- a/filepush/filepush-template/template/{{.schema_name}}/resources/volume.yml.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/resources/volume.yml.tmpl @@ -1,8 +1,8 @@ -# The file staging volume for schema{{.schema_name}} +# The file staging volume for schema {{.schema_name}} resources: volumes: {{.schema_name}}_volume: name: {{.schema_name}}_volume catalog_name: {{.catalog_name}} - schema_name: {{.schema_name}} + schema_name: ${resources.schemas.{{.schema_name}}.name} diff --git a/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_ingestion.py.tmpl b/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_ingestion.py.tmpl index 05b7f44d..ff22d361 100644 --- a/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_ingestion.py.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_ingestion.py.tmpl @@ -1,11 +1,12 @@ import dlt from dbruntime.dbutils import FileInfo import re +import os # Dynamic Tables def sanitize_table_name(name: str) -> str: """ - Make a valid, reasonably human-friendly table name from a folder name. + Sanitize a table name. - Lowercase - Replace non [a-z0-9_] with underscores - Ensure it doesn't start with a digit @@ -17,6 +18,16 @@ def sanitize_table_name(name: str) -> str: n = re.sub(r"_+", "_", n).strip("_") return n or "t_unnamed" +def is_valid_table_name(name: str) -> bool: + """ + Validate a table name. + - Must be alphanumeric + - Must not start with a digit + - Must not contain any special characters + """ + pat = re.compile(r'^[A-Za-z0-9_]+$') + return pat.match(name) is not None + def dbfs_is_dir(f: FileInfo): is_dir_attr = getattr(f, "isDir", None) return is_dir_attr() if callable(is_dir_attr) else f.name.endswith("/") @@ -28,16 +39,27 @@ def list_immediate_subdirs(path: str): if dbfs_is_dir(f): # f.name often ends with '/', drop it for a clean folder name clean_name = f.name[:-1] if f.name.endswith("/") else f.name - out.append((clean_name, f.path.removeprefix('dbfs:'))) + if is_valid_table_name(clean_name): + out.append((clean_name, f.path.removeprefix('dbfs:'))) + else: + print(f"Skipping invalid table name: {clean_name}. It must be alphanumeric connected by underscores and not start with a digit.") return out def make_dlt_table(subdir_name: str, subdir_path: str): """ Defines a DLT table for a given subfolder at import time. - Uses Auto Loader (streaming) if `streaming=True`, else batch reader. + If table does not exist, it will create a read_files kernel and use that to create the table. 
""" - table_name = sanitize_table_name(subdir_name) + kernel_file_name = f"./{{.schema_name}}_{table_name}_readfiles_kernel.sql" + + if not os.path.exists(kernel_file_name): + print(f"Initialize table {table_name}") + with open(f"./{{.schema_name}}_readfiles_kernel.sql", "r") as f: + kernel_query_fmt = f.read() + with open(kernel_file_name, "w") as f: + table_kernel_query = kernel_query_fmt % subdir_path + f.write(table_kernel_query) if len(dbutils.fs.ls(subdir_path)) > 0: @dlt.table( @@ -48,7 +70,13 @@ def make_dlt_table(subdir_name: str, subdir_path: str): } ) def _auto_loader_table(): - return spark.readStream.format("cloudFiles").option("cloudFiles.format","csv").load(subdir_path) - -for subdir_name, subdir_path in list_immediate_subdirs('{{template `volume_path` .}}'): + with open(kernel_file_name, "r") as f: + table_kernel_query = f.read() + print(table_kernel_query.replace("read_files(", "STREAM read_files(")) + return spark.sql(table_kernel_query.replace("read_files(", "STREAM read_files(")) + else: + print(f"Waiting for files to land in {subdir_path}") + +volume_path_root = spark.conf.get("volume_path") +for subdir_name, subdir_path in list_immediate_subdirs(volume_path_root): make_dlt_table(subdir_name, subdir_path) \ No newline at end of file diff --git a/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl b/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl index a04774eb..bf4b68b5 100644 --- a/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl @@ -1,6 +1,13 @@ +-- Kernel template for read_files SELECT * FROM read_files( - '{{template `volume_path` .}}' + '%s' + , + -- Do not change anything above + -- Add any additional options below + -- Example: + -- header => 'true', + -- escape => '"' ) diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl index 2206e20f..e279f57d 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl @@ -1,2 +1,3 @@ #!/usr/bin/env bash +. $(dirname $0)/env.sh databricks bundle run {{.schema_name}}_pipeline From 6eb2c6336a1077e3e3befd527889756f75b79da3 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 12 Aug 2025 15:06:27 -0700 Subject: [PATCH 15/60] Fix comma and trigger job instead --- filepush/filepush-template/databricks_template_schema.json | 2 +- .../src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl | 2 +- .../template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/filepush/filepush-template/databricks_template_schema.json b/filepush/filepush-template/databricks_template_schema.json index 256f185a..66159bdc 100644 --- a/filepush/filepush-template/databricks_template_schema.json +++ b/filepush/filepush-template/databricks_template_schema.json @@ -13,5 +13,5 @@ "order": 3 } }, - "success_message": "\nYour file push bundle in {{.catalog_name}}.{{.schema_name}} has been created." + "success_message": "\nYour file push bundle under catalog and schema {{.catalog_name}}.{{.schema_name}} has been created." 
} diff --git a/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl b/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl index bf4b68b5..2fdcdc5e 100644 --- a/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl @@ -4,10 +4,10 @@ SELECT FROM read_files( '%s' - , -- Do not change anything above -- Add any additional options below -- Example: + -- , -- header => 'true', -- escape => '"' ) diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl index e279f57d..70652af9 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl @@ -1,3 +1,3 @@ #!/usr/bin/env bash . $(dirname $0)/env.sh -databricks bundle run {{.schema_name}}_pipeline +databricks bundle run {{.schema_name}}_job From 69436f691d6d235b1eb334ccb9a5e4ed1e01e28e Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Wed, 13 Aug 2025 12:34:41 -0700 Subject: [PATCH 16/60] Update property to all resource --- .../template/{{.schema_name}}/resources/pipeline.yml.tmpl | 4 ++-- .../template/{{.schema_name}}/resources/schema.yml.tmpl | 4 +++- ...{.schema_name}}_ingestion.py.tmpl => ingestion.py.tmpl} | 4 ++-- .../template/{{.schema_name}}/tools/env.sh.tmpl | 7 +++++++ .../tools/get_volume_path_from_pipeline_config.sh | 3 +++ .../tools/get_volume_path_from_schema_dbproperty.sh | 3 +++ ...erty.sh.tmpl => get_volume_path_from_table_property.sh} | 2 +- .../tools/set_volume_path_to_schema_dbproperty.sh | 3 +++ .../tools/{trigger_refresh.sh.tmpl => trigger_refresh.sh} | 2 +- .../{upload_to_volume.sh.tmpl => upload_to_volume.sh} | 4 ++++ 10 files changed, 29 insertions(+), 7 deletions(-) rename filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{{.schema_name}}_ingestion.py.tmpl => ingestion.py.tmpl} (96%) create mode 100755 filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh create mode 100755 filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh rename filepush/filepush-template/template/{{.schema_name}}/tools/{get_volume_path_from_table_property.sh.tmpl => get_volume_path_from_table_property.sh} (61%) create mode 100755 filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh rename filepush/filepush-template/template/{{.schema_name}}/tools/{trigger_refresh.sh.tmpl => trigger_refresh.sh} (50%) rename filepush/filepush-template/template/{{.schema_name}}/tools/{upload_to_volume.sh.tmpl => upload_to_volume.sh} (60%) diff --git a/filepush/filepush-template/template/{{.schema_name}}/resources/pipeline.yml.tmpl b/filepush/filepush-template/template/{{.schema_name}}/resources/pipeline.yml.tmpl index 7c609a38..21b124be 100644 --- a/filepush/filepush-template/template/{{.schema_name}}/resources/pipeline.yml.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/resources/pipeline.yml.tmpl @@ -9,6 +9,6 @@ resources: serverless: true libraries: - file: - path: ../src/pipelines/{{.schema_name}}_ingestion.py + path: ../src/pipelines/ingestion.py configuration: - volume_path: 
/Volumes/{{.catalog_name}}/${resources.schemas.{{.schema_name}}.name}/{{.schema_name}}_volume/ + filepush.volume_path: /Volumes/{{.catalog_name}}/${resources.schemas.{{.schema_name}}.name}/{{.schema_name}}_volume/ diff --git a/filepush/filepush-template/template/{{.schema_name}}/resources/schema.yml.tmpl b/filepush/filepush-template/template/{{.schema_name}}/resources/schema.yml.tmpl index abfc1ba1..032b7b9d 100644 --- a/filepush/filepush-template/template/{{.schema_name}}/resources/schema.yml.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/resources/schema.yml.tmpl @@ -4,4 +4,6 @@ resources: schemas: {{.schema_name}}: name: {{.schema_name}} - catalog_name: {{.catalog_name}} \ No newline at end of file + catalog_name: {{.catalog_name}} + properties: + filepush.volume_path: /Volumes/{{.catalog_name}}/{{.schema_name}}/{{.schema_name}}_volume/ \ No newline at end of file diff --git a/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_ingestion.py.tmpl b/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/ingestion.py.tmpl similarity index 96% rename from filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_ingestion.py.tmpl rename to filepush/filepush-template/template/{{.schema_name}}/src/pipelines/ingestion.py.tmpl index ff22d361..9c837652 100644 --- a/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_ingestion.py.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/ingestion.py.tmpl @@ -66,7 +66,7 @@ def make_dlt_table(subdir_name: str, subdir_path: str): name=table_name, comment=f"Auto-created from subfolder: {subdir_path} (streaming via Auto Loader)", table_properties={ - "volume_path": f"{subdir_path}" + "filepush.volume_path": f"{subdir_path}" } ) def _auto_loader_table(): @@ -77,6 +77,6 @@ def make_dlt_table(subdir_name: str, subdir_path: str): else: print(f"Waiting for files to land in {subdir_path}") -volume_path_root = spark.conf.get("volume_path") +volume_path_root = spark.conf.get("filepush.volume_path") for subdir_name, subdir_path in list_immediate_subdirs(volume_path_root): make_dlt_table(subdir_name, subdir_path) \ No newline at end of file diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl index 619cef3c..e9898794 100644 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl @@ -1,8 +1,13 @@ #!/usr/bin/env bash +# This file is used to set the environment variables for the filepush bundle. +# It is sourced by the other scripts in the tools directory. +# This should be deployed **after** the bundle is deployed. + # Prevent running directly; this file must be *sourced* (return 0 2>/dev/null) || { echo "Source this file: . 
$(basename "$0")"; exit 1; } # Idempotent guard +# Check if the environment is already set and non-empty if [[ -n "${_FILEPUSH_ENV_LOADED:-}" ]]; then return 0 fi @@ -12,3 +17,5 @@ summary=$(databricks bundle summary --output json) export FILEPUSH_CATALOG_NAME={{.catalog_name}} export FILEPUSH_SCHEMA_NAME=$(echo $summary | jq -r '.resources.schemas.{{.schema_name}}.name') export FILEPUSH_VOLUME_PATH=/Volumes/{{.catalog_name}}/${FILEPUSH_SCHEMA_NAME}/{{.schema_name}}_volume/ +export FILEPUSH_PIPELINE_ID=$(echo $summary | jq -r '.resources.pipelines.{{.schema_name}}_pipeline.id') +export FILEPUSH_JOB_NAME={{.schema_name}}_job diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh new file mode 100755 index 00000000..9f618af4 --- /dev/null +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +. $(dirname $0)/env.sh +databricks pipelines get $FILEPUSH_PIPELINE_ID --output json | jq '.spec.configuration["filepush.volume_path"]' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh new file mode 100755 index 00000000..952e9098 --- /dev/null +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +. $(dirname $0)/env.sh +databricks schemas get ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME} --output json | jq '.properties["filepush.volume_path"]' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh similarity index 61% rename from filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh.tmpl rename to filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh index 744a80b0..df4b46c6 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh @@ -1,3 +1,3 @@ #!/usr/bin/env bash . $(dirname $0)/env.sh -databricks tables get ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME}.$1 --output json | jq '.properties.volume_path' +databricks tables get ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME}.$1 --output json | jq '.properties["filepush.volume_path"]' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh new file mode 100755 index 00000000..0996e96f --- /dev/null +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +. 
$(dirname $0)/env.sh +databricks schemas update ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME} --json '{ "properties": { "filepush.volume_path": "'${FILEPUSH_VOLUME_PATH}'" } }' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh similarity index 50% rename from filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl rename to filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh index 70652af9..bb724b10 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh @@ -1,3 +1,3 @@ #!/usr/bin/env bash . $(dirname $0)/env.sh -databricks bundle run {{.schema_name}}_job +databricks bundle run $FILEPUSH_JOB_NAME diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh similarity index 60% rename from filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh.tmpl rename to filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh index 1ebd46d4..ee926ddc 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh @@ -1,4 +1,8 @@ #!/usr/bin/env bash . $(dirname $0)/env.sh +if [ -z "$1" ] || [ -z "$2" ]; then + echo "Usage: $0 " + exit 1 +fi databricks fs mkdir dbfs:${FILEPUSH_VOLUME_PATH}$1/ databricks fs cp $2 dbfs:${FILEPUSH_VOLUME_PATH}$1/ From 28a28917085cf0377d2d1d1c88eb0405220bc7be Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Wed, 13 Aug 2025 15:21:21 -0700 Subject: [PATCH 17/60] Fix order number --- filepush/filepush-template/databricks_template_schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/filepush/filepush-template/databricks_template_schema.json b/filepush/filepush-template/databricks_template_schema.json index 66159bdc..f150630b 100644 --- a/filepush/filepush-template/databricks_template_schema.json +++ b/filepush/filepush-template/databricks_template_schema.json @@ -4,13 +4,13 @@ "type": "string", "default": "{{default_catalog}}", "description": "Name of the catalog where tables and pipelines will be created.", - "order": 2 + "order": 1 }, "schema_name": { "type": "string", "default": "default", "description": "Name of the schema where tables and pipelines will be created.", - "order": 3 + "order": 2 } }, "success_message": "\nYour file push bundle under catalog and schema {{.catalog_name}}.{{.schema_name}} has been created." 
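For reference, the same "filepush.volume_path" lookups that the helper scripts above perform with the databricks CLI and jq can be expressed with the Databricks Python SDK. This is an editor's sketch, not part of any patch: the catalog, schema, and table names are placeholders, and the pipeline id would come from `databricks bundle summary` just as in the scripts.

from databricks.sdk import WorkspaceClient

ws = WorkspaceClient()

# Analogue of get_volume_path_from_schema_dbproperty.sh
schema = ws.schemas.get(full_name="main.filepushschema")
print((schema.properties or {}).get("filepush.volume_path"))

# Analogue of get_volume_path_from_table_property.sh
table = ws.tables.get(full_name="main.filepushschema.my_table")
print((table.properties or {}).get("filepush.volume_path"))

# Analogue of get_volume_path_from_pipeline_config.sh
pipeline = ws.pipelines.get(pipeline_id="<pipeline-id>")
print((pipeline.spec.configuration or {}).get("filepush.volume_path"))

# Analogue of set_volume_path_to_schema_dbproperty.sh (path shown is a placeholder)
ws.schemas.update(
    full_name="main.filepushschema",
    properties={"filepush.volume_path": "/Volumes/main/filepushschema/filepushschema_volume/"},
)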
From c8e5d0eb165a443b342ec49680741741096dfe49 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Wed, 13 Aug 2025 16:50:13 -0700 Subject: [PATCH 18/60] Enable bundle target in tool scripts --- .../{{.schema_name}}/tools/env.sh.tmpl | 22 ++++++++++++++++++- .../get_volume_path_from_pipeline_config.sh | 6 ++++- .../get_volume_path_from_schema_dbproperty.sh | 6 ++++- .../get_volume_path_from_table_property.sh | 6 ++++- .../set_volume_path_to_schema_dbproperty.sh | 6 ++++- .../{{.schema_name}}/tools/trigger_refresh.sh | 6 ++++- .../tools/upload_to_volume.sh | 8 +++++-- 7 files changed, 52 insertions(+), 8 deletions(-) diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl index e9898794..74bea68a 100644 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl @@ -13,7 +13,27 @@ if [[ -n "${_FILEPUSH_ENV_LOADED:-}" ]]; then fi export _FILEPUSH_ENV_LOADED=1 -summary=$(databricks bundle summary --output json) +# Sets the target for the bundle +ARG_TARGET="dev" +ARG_POSITIONAL=() + +while [[ $# -gt 0 ]]; do +case "$1" in + --target) [[ $# -ge 2 ]] || { echo "Error: --target needs a value"; return 2; } + ARG_TARGET="$2"; shift 2 ;; + --target=*) ARG_TARGET="${1#*=}"; shift ;; + -t) [[ $# -ge 2 ]] || { echo "Error: -t needs a value"; return 2; } + ARG_TARGET="$2"; shift 2 ;; + --) shift; ARG_POSITIONAL+=("$@"); break ;; + -h|--help) usage; return 1 ;; + -*) echo "Unknown option: $1"; usage; return 2 ;; + *) ARG_POSITIONAL+=("$1"); shift ;; +esac +done + +export BUNDLE_TARGET=$ARG_TARGET + +summary=$(databricks bundle summary -t $BUNDLE_TARGET --output json) export FILEPUSH_CATALOG_NAME={{.catalog_name}} export FILEPUSH_SCHEMA_NAME=$(echo $summary | jq -r '.resources.schemas.{{.schema_name}}.name') export FILEPUSH_VOLUME_PATH=/Volumes/{{.catalog_name}}/${FILEPUSH_SCHEMA_NAME}/{{.schema_name}}_volume/ diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh index 9f618af4..f163b33f 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh @@ -1,3 +1,7 @@ #!/usr/bin/env bash -. $(dirname $0)/env.sh +usage() { + echo "Usage: $(basename $0) [--target=dev|prod]" +} +export -f usage +. $(dirname $0)/env.sh $@ databricks pipelines get $FILEPUSH_PIPELINE_ID --output json | jq '.spec.configuration["filepush.volume_path"]' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh index 952e9098..b81776b7 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh @@ -1,3 +1,7 @@ #!/usr/bin/env bash -. $(dirname $0)/env.sh +usage() { + echo "Usage: $(basename $0) [--target=dev|prod]" +} +export -f usage +. 
$(dirname $0)/env.sh $@ databricks schemas get ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME} --output json | jq '.properties["filepush.volume_path"]' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh index df4b46c6..704471be 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh @@ -1,3 +1,7 @@ #!/usr/bin/env bash -. $(dirname $0)/env.sh +usage() { + echo "Usage: $(basename $0) [--target=dev|prod]" +} +export -f usage +. $(dirname $0)/env.sh $@ databricks tables get ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME}.$1 --output json | jq '.properties["filepush.volume_path"]' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh index 0996e96f..69ffbc41 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh @@ -1,3 +1,7 @@ #!/usr/bin/env bash -. $(dirname $0)/env.sh +usage() { + echo "Usage: $(basename $0) [--target=dev|prod]" +} +export -f usage +. $(dirname $0)/env.sh $@ databricks schemas update ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME} --json '{ "properties": { "filepush.volume_path": "'${FILEPUSH_VOLUME_PATH}'" } }' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh index bb724b10..e2536dd8 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh @@ -1,3 +1,7 @@ #!/usr/bin/env bash -. $(dirname $0)/env.sh +usage() { + echo "Usage: $(basename $0) [--target=dev|prod]" +} +export -f usage +. $(dirname $0)/env.sh $@ databricks bundle run $FILEPUSH_JOB_NAME diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh index ee926ddc..2d7dc8ce 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh @@ -1,7 +1,11 @@ #!/usr/bin/env bash -. $(dirname $0)/env.sh +usage() { + echo "Usage: $(basename $0) [--target=dev|prod]" +} +export -f usage +. 
$(dirname $0)/env.sh $@ if [ -z "$1" ] || [ -z "$2" ]; then - echo "Usage: $0 " + usage exit 1 fi databricks fs mkdir dbfs:${FILEPUSH_VOLUME_PATH}$1/ From 54faae01073aae107a70b549e15ec4c71f9a3326 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Wed, 13 Aug 2025 17:06:10 -0700 Subject: [PATCH 19/60] Embed bundle target to all scripts --- .../tools/get_volume_path_from_pipeline_config.sh | 2 +- .../tools/get_volume_path_from_schema_dbproperty.sh | 2 +- .../tools/get_volume_path_from_table_property.sh | 2 +- .../tools/set_volume_path_to_schema_dbproperty.sh | 2 +- .../template/{{.schema_name}}/tools/trigger_refresh.sh | 2 +- .../template/{{.schema_name}}/tools/upload_to_volume.sh | 4 ++-- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh index f163b33f..4fef63c6 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh @@ -4,4 +4,4 @@ usage() { } export -f usage . $(dirname $0)/env.sh $@ -databricks pipelines get $FILEPUSH_PIPELINE_ID --output json | jq '.spec.configuration["filepush.volume_path"]' +databricks pipelines get $FILEPUSH_PIPELINE_ID -t $BUNDLE_TARGET --output json | jq '.spec.configuration["filepush.volume_path"]' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh index b81776b7..14da35c1 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh @@ -4,4 +4,4 @@ usage() { } export -f usage . $(dirname $0)/env.sh $@ -databricks schemas get ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME} --output json | jq '.properties["filepush.volume_path"]' +databricks schemas get ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME} -t $BUNDLE_TARGET --output json | jq '.properties["filepush.volume_path"]' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh index 704471be..fb93f468 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh @@ -4,4 +4,4 @@ usage() { } export -f usage . 
$(dirname $0)/env.sh $@ -databricks tables get ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME}.$1 --output json | jq '.properties["filepush.volume_path"]' +databricks tables get ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME}.$1 -t $BUNDLE_TARGET --output json | jq '.properties["filepush.volume_path"]' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh index 69ffbc41..3169f1c0 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh @@ -4,4 +4,4 @@ usage() { } export -f usage . $(dirname $0)/env.sh $@ -databricks schemas update ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME} --json '{ "properties": { "filepush.volume_path": "'${FILEPUSH_VOLUME_PATH}'" } }' +databricks schemas update ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME} -t $BUNDLE_TARGET --json '{ "properties": { "filepush.volume_path": "'${FILEPUSH_VOLUME_PATH}'" } }' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh index e2536dd8..f85c34e1 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh @@ -4,4 +4,4 @@ usage() { } export -f usage . $(dirname $0)/env.sh $@ -databricks bundle run $FILEPUSH_JOB_NAME +databricks bundle run $FILEPUSH_JOB_NAME -t $BUNDLE_TARGET diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh index 2d7dc8ce..31e5fbde 100755 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh @@ -8,5 +8,5 @@ if [ -z "$1" ] || [ -z "$2" ]; then usage exit 1 fi -databricks fs mkdir dbfs:${FILEPUSH_VOLUME_PATH}$1/ -databricks fs cp $2 dbfs:${FILEPUSH_VOLUME_PATH}$1/ +databricks fs mkdir dbfs:${FILEPUSH_VOLUME_PATH}$1/ -t $BUNDLE_TARGET +databricks fs cp $2 dbfs:${FILEPUSH_VOLUME_PATH}$1/ -t $BUNDLE_TARGET From 96811b80a301310d220ba2b24fd0887dd8241cd1 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Thu, 14 Aug 2025 13:26:45 -0700 Subject: [PATCH 20/60] Added CRUD --- filepush/create_filepush_schema.sh | 18 +++++++++++++++ filepush/drop_filepush_schema.sh | 22 +++++++++++++++++++ .../{{.schema_name}}/databricks.yml.tmpl | 3 +++ filepush/push_file_to_table.sh | 15 +++++++++++++ 4 files changed, 58 insertions(+) create mode 100755 filepush/create_filepush_schema.sh create mode 100755 filepush/drop_filepush_schema.sh create mode 100755 filepush/push_file_to_table.sh diff --git a/filepush/create_filepush_schema.sh b/filepush/create_filepush_schema.sh new file mode 100755 index 00000000..3815470d --- /dev/null +++ b/filepush/create_filepush_schema.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +usage() { + echo "Usage: $(basename $0) " +} +if [ -z "$1" ] || [ -z "$2" ]; then + usage + exit 1 +fi +if ! 
databricks catalogs get "$1" >/dev/null 2>&1; then + echo "Catalog \`$1\` not found (or no permission)" + exit 1 +fi +databricks bundle init filepush-template --config-file <(echo "{\"catalog_name\": \"$1\", \"schema_name\": \"$2\"}") +working_dir=$(pwd) +schema_name=$2 +cd $schema_name +databricks bundle deploy --force-lock --auto-approve -t prod +cd $working_dir \ No newline at end of file diff --git a/filepush/drop_filepush_schema.sh b/filepush/drop_filepush_schema.sh new file mode 100755 index 00000000..adbd2521 --- /dev/null +++ b/filepush/drop_filepush_schema.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +usage() { + echo "Usage: $(basename $0) " +} +if [ -z "$1" ] || [ -z "$2" ]; then + usage + exit 1 +fi +if ! databricks catalogs get "$1" >/dev/null 2>&1; then + echo "Catalog \`$1\` not found (or no permission)" + exit 1 +fi +volume_path=$(databricks schemas get $1.$2 --output json | jq -r '.properties["filepush.volume_path"]') +if [ -z "$volume_path" ] || [ "$volume_path" == "null" ]; then + echo "Schema \`$1.$2\` is not a filepush schema. Did you run create_filepush_schema.sh to create it?" + exit 1 +fi +working_dir=$(pwd) +schema_name=$2 +cd $schema_name +databricks bundle destroy --force-lock -t prod +cd $working_dir \ No newline at end of file diff --git a/filepush/filepush-template/template/{{.schema_name}}/databricks.yml.tmpl b/filepush/filepush-template/template/{{.schema_name}}/databricks.yml.tmpl index e1fe600b..d32c1802 100644 --- a/filepush/filepush-template/template/{{.schema_name}}/databricks.yml.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/databricks.yml.tmpl @@ -7,6 +7,9 @@ bundle: include: - resources/*.yml +experimental: + skip_name_prefix_for_schema: true + targets: # The deployment targets. See https://docs.databricks.com/en/dev-tools/bundles/deployment-modes.html dev: diff --git a/filepush/push_file_to_table.sh b/filepush/push_file_to_table.sh new file mode 100755 index 00000000..e652b6b8 --- /dev/null +++ b/filepush/push_file_to_table.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +usage() { + echo "Usage: $(basename $0) " +} +if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ] || [ -z "$4" ]; then + usage + exit 1 +fi +volume_path=$(databricks schemas get $1.$2 --output json | jq -r '.properties["filepush.volume_path"]') +if [ -z "$volume_path" ] || [ "$volume_path" == "null" ]; then + echo "Schema \`$1.$2\` is not a filepush schema. Did you run create_filepush_schema.sh to create it?" 
+ exit 1 +fi +databricks fs mkdir dbfs:${volume_path}$3/ +databricks fs cp $4 dbfs:${volume_path}$3/ \ No newline at end of file From fd73830b458989868c547463d31647ead23ce08e Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Fri, 15 Aug 2025 17:06:17 -0700 Subject: [PATCH 21/60] Add new default options to kernel --- .../src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl b/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl index 2fdcdc5e..55666737 100644 --- a/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl @@ -3,7 +3,9 @@ SELECT * FROM read_files( - '%s' + '%s', + ignoreCorruptFiles => 'true', -- If a different malformed file pushed and RocksDB has it enlisted, this will ignore the error when the file is deleted. + ignoreMissingFiles => 'true' -- If a different file format is accidentally pushed and RocksDB has it enlisted, this will ignore the error when the file is deleted. -- Do not change anything above -- Add any additional options below -- Example: From 0409893e1479ff6b8f59b3d6283bedc81df69a4d Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 19 Aug 2025 17:47:32 -0700 Subject: [PATCH 22/60] Add helper script to open all resource --- .../template/{{.schema_name}}/tools/env.sh.tmpl | 1 + .../{{.schema_name}}/tools/open_all_resources.sh | 10 ++++++++++ 2 files changed, 11 insertions(+) create mode 100755 filepush/filepush-template/template/{{.schema_name}}/tools/open_all_resources.sh diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl index 74bea68a..a5a64d7c 100644 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl @@ -34,6 +34,7 @@ done export BUNDLE_TARGET=$ARG_TARGET summary=$(databricks bundle summary -t $BUNDLE_TARGET --output json) +export FILEPUSH_BUNDLE_NAME={{.schema_name}} export FILEPUSH_CATALOG_NAME={{.catalog_name}} export FILEPUSH_SCHEMA_NAME=$(echo $summary | jq -r '.resources.schemas.{{.schema_name}}.name') export FILEPUSH_VOLUME_PATH=/Volumes/{{.catalog_name}}/${FILEPUSH_SCHEMA_NAME}/{{.schema_name}}_volume/ diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/open_all_resources.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/open_all_resources.sh new file mode 100755 index 00000000..6abf8173 --- /dev/null +++ b/filepush/filepush-template/template/{{.schema_name}}/tools/open_all_resources.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +usage() { + echo "Usage: $(basename $0) [--target=dev|prod]" +} +export -f usage +. 
$(dirname $0)/env.sh $@ +databricks bundle open ${FILEPUSH_BUNDLE_NAME} -t $BUNDLE_TARGET +databricks bundle open ${FILEPUSH_BUNDLE_NAME}_job -t $BUNDLE_TARGET +databricks bundle open ${FILEPUSH_BUNDLE_NAME}_pipeline -t $BUNDLE_TARGET +databricks bundle open ${FILEPUSH_BUNDLE_NAME}_volume -t $BUNDLE_TARGET \ No newline at end of file From 644345e1209199f00dfd2b7625dfd7ffca335eca Mon Sep 17 00:00:00 2001 From: chi-yang-db <117940157+chi-yang-db@users.noreply.github.com> Date: Thu, 11 Sep 2025 15:10:54 -0700 Subject: [PATCH 23/60] Before migrating to new CUJ From b0896381ec78f4bd79e97333d6147a71c716f150 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Thu, 11 Sep 2025 17:22:40 -0700 Subject: [PATCH 24/60] Successful deployment after dab conversion --- filepush/dab/databricks.yml | 36 +++++++++++++++++++ filepush/dab/resources/job.yml | 25 +++++++++++++ filepush/dab/resources/pipeline.yml | 12 +++++++ filepush/dab/resources/schema.yml | 7 ++++ filepush/dab/resources/volume.yml | 8 +++++ filepush/dab/src/initialization.py | 0 .../src/pipelines/dab_readfiles_kernel.sql | 15 ++++++++ filepush/dab/src/pipelines/ingestion.py | 0 8 files changed, 103 insertions(+) create mode 100644 filepush/dab/databricks.yml create mode 100644 filepush/dab/resources/job.yml create mode 100644 filepush/dab/resources/pipeline.yml create mode 100644 filepush/dab/resources/schema.yml create mode 100644 filepush/dab/resources/volume.yml create mode 100644 filepush/dab/src/initialization.py create mode 100644 filepush/dab/src/pipelines/dab_readfiles_kernel.sql create mode 100644 filepush/dab/src/pipelines/ingestion.py diff --git a/filepush/dab/databricks.yml b/filepush/dab/databricks.yml new file mode 100644 index 00000000..28003832 --- /dev/null +++ b/filepush/dab/databricks.yml @@ -0,0 +1,36 @@ +# databricks.yml +# This is the configuration for the file push DAB dab. + +bundle: + name: dab + +include: + - resources/*.yml + +# experimental: +# skip_name_prefix_for_schema: true + +targets: + # The deployment targets. See https://docs.databricks.com/en/dev-tools/bundles/deployment-modes.html + dev: + mode: development + default: true + workspace: + host: https://e2-dogfood.staging.cloud.databricks.com + + prod: + mode: production + workspace: + host: https://e2-dogfood.staging.cloud.databricks.com + # root_path: /Workspace/Users/chi.yang@databricks.com/.bundle/${bundle.name}/${bundle.target} + permissions: + - user_name: chi.yang@databricks.com + level: CAN_MANAGE + +variables: + catalog_name: + description: The existing catalog where the schema will be created. + default: main + schema_name: + description: The name of the schema where the tables and ingestion pipeline will be created. 
+ default: filepushschema diff --git a/filepush/dab/resources/job.yml b/filepush/dab/resources/job.yml new file mode 100644 index 00000000..08b13bb6 --- /dev/null +++ b/filepush/dab/resources/job.yml @@ -0,0 +1,25 @@ +# The main job for schema dab +# This job will trigger in the schema pipeline + +resources: + jobs: + filetrigger_job: + name: ${var.schema_name}_filetrigger_job + tasks: + - task_key: pipeline_refresh + pipeline_task: + pipeline_id: ${resources.pipelines.refresh_pipeline.id} + configuration_job: + name: ${var.schema_name}_configuration_job + tasks: + - task_key: initialization + spark_python_task: + python_file: ../src/initialization.py + environment_key: serverless + - task_key: trigger_refresh + run_job_task: + job_id: ${resources.jobs.filetrigger_job.id} + environments: + - environment_key: serverless + spec: + client: "3" diff --git a/filepush/dab/resources/pipeline.yml b/filepush/dab/resources/pipeline.yml new file mode 100644 index 00000000..f1dee3b1 --- /dev/null +++ b/filepush/dab/resources/pipeline.yml @@ -0,0 +1,12 @@ +# The table refresh pipeline for schema dab + +resources: + pipelines: + refresh_pipeline: + name: ${var.schema_name}_refresh_pipeline + catalog: ${var.catalog_name} + schema: ${var.schema_name} + serverless: true + libraries: + - file: + path: ../src/pipelines/ingestion.py diff --git a/filepush/dab/resources/schema.yml b/filepush/dab/resources/schema.yml new file mode 100644 index 00000000..28eae88a --- /dev/null +++ b/filepush/dab/resources/schema.yml @@ -0,0 +1,7 @@ +# The schema dab + +resources: + schemas: + main_schema: + name: ${var.schema_name} + catalog_name: ${var.catalog_name} \ No newline at end of file diff --git a/filepush/dab/resources/volume.yml b/filepush/dab/resources/volume.yml new file mode 100644 index 00000000..b479d607 --- /dev/null +++ b/filepush/dab/resources/volume.yml @@ -0,0 +1,8 @@ +# The file staging volume for schema dab + +resources: + volumes: + filepush_volume: + name: ${var.schema_name}_filepush_volume + catalog_name: ${var.catalog_name} + schema_name: ${var.schema_name} diff --git a/filepush/dab/src/initialization.py b/filepush/dab/src/initialization.py new file mode 100644 index 00000000..e69de29b diff --git a/filepush/dab/src/pipelines/dab_readfiles_kernel.sql b/filepush/dab/src/pipelines/dab_readfiles_kernel.sql new file mode 100644 index 00000000..55666737 --- /dev/null +++ b/filepush/dab/src/pipelines/dab_readfiles_kernel.sql @@ -0,0 +1,15 @@ +-- Kernel template for read_files +SELECT + * +FROM + read_files( + '%s', + ignoreCorruptFiles => 'true', -- If a different malformed file pushed and RocksDB has it enlisted, this will ignore the error when the file is deleted. + ignoreMissingFiles => 'true' -- If a different file format is accidentally pushed and RocksDB has it enlisted, this will ignore the error when the file is deleted. 
+ -- Do not change anything above + -- Add any additional options below + -- Example: + -- , + -- header => 'true', + -- escape => '"' + ) diff --git a/filepush/dab/src/pipelines/ingestion.py b/filepush/dab/src/pipelines/ingestion.py new file mode 100644 index 00000000..e69de29b From 7b5c85f6e95bd74b99f627c965edc100610bfb77 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Fri, 12 Sep 2025 16:52:53 -0700 Subject: [PATCH 25/60] Add path property if possible --- filepush/dab/databricks.yml | 4 ++++ filepush/dab/resources/job.yml | 7 +++++-- filepush/dab/resources/pipeline.yml | 4 +++- filepush/dab/resources/schema.yml | 2 +- filepush/dab/resources/volume.yml | 2 +- 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/filepush/dab/databricks.yml b/filepush/dab/databricks.yml index 28003832..411ab284 100644 --- a/filepush/dab/databricks.yml +++ b/filepush/dab/databricks.yml @@ -34,3 +34,7 @@ variables: schema_name: description: The name of the schema where the tables and ingestion pipeline will be created. default: filepushschema + resource_name_prefix: + description: The prefix for the resource names. + default: ${var.catalog_name}_${var.schema_name}_ + \ No newline at end of file diff --git a/filepush/dab/resources/job.yml b/filepush/dab/resources/job.yml index 08b13bb6..1648ed6a 100644 --- a/filepush/dab/resources/job.yml +++ b/filepush/dab/resources/job.yml @@ -4,13 +4,16 @@ resources: jobs: filetrigger_job: - name: ${var.schema_name}_filetrigger_job + name: ${var.resource_name_prefix}filetrigger_job tasks: - task_key: pipeline_refresh pipeline_task: pipeline_id: ${resources.pipelines.refresh_pipeline.id} + trigger: + file_arrival: + url: ${resources.volumes.filepush_volume.volume_path}/data/ configuration_job: - name: ${var.schema_name}_configuration_job + name: ${var.resource_name_prefix}configuration_job tasks: - task_key: initialization spark_python_task: diff --git a/filepush/dab/resources/pipeline.yml b/filepush/dab/resources/pipeline.yml index f1dee3b1..84a74f67 100644 --- a/filepush/dab/resources/pipeline.yml +++ b/filepush/dab/resources/pipeline.yml @@ -3,10 +3,12 @@ resources: pipelines: refresh_pipeline: - name: ${var.schema_name}_refresh_pipeline + name: ${var.resource_name_prefix}refresh_pipeline catalog: ${var.catalog_name} schema: ${var.schema_name} serverless: true libraries: - file: path: ../src/pipelines/ingestion.py + configuration: + filepush.volume_path: ${resources.volumes.filepush_volume.volume_path} diff --git a/filepush/dab/resources/schema.yml b/filepush/dab/resources/schema.yml index 28eae88a..72500a02 100644 --- a/filepush/dab/resources/schema.yml +++ b/filepush/dab/resources/schema.yml @@ -4,4 +4,4 @@ resources: schemas: main_schema: name: ${var.schema_name} - catalog_name: ${var.catalog_name} \ No newline at end of file + catalog_name: ${var.catalog_name} diff --git a/filepush/dab/resources/volume.yml b/filepush/dab/resources/volume.yml index b479d607..ac8929c8 100644 --- a/filepush/dab/resources/volume.yml +++ b/filepush/dab/resources/volume.yml @@ -3,6 +3,6 @@ resources: volumes: filepush_volume: - name: ${var.schema_name}_filepush_volume + name: ${var.resource_name_prefix}filepush_volume catalog_name: ${var.catalog_name} schema_name: ${var.schema_name} From 14657340a3f66813feec8b284937819d8223bfe5 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Mon, 15 Sep 2025 10:08:17 -0700 Subject: [PATCH 26/60] Successfully pass in parameters --- filepush/dab/resources/job.yml | 12 ++++++++++++ filepush/dab/src/initialization.py | 9 +++++++++ 2 files changed, 
21 insertions(+) diff --git a/filepush/dab/resources/job.yml b/filepush/dab/resources/job.yml index 1648ed6a..744ba835 100644 --- a/filepush/dab/resources/job.yml +++ b/filepush/dab/resources/job.yml @@ -18,11 +18,23 @@ resources: - task_key: initialization spark_python_task: python_file: ../src/initialization.py + parameters: + - "--catalog_name" + - "{{job.parameters.catalog_name}}" + - "--schema_name" + - "{{job.parameters.schema_name}}" environment_key: serverless - task_key: trigger_refresh run_job_task: job_id: ${resources.jobs.filetrigger_job.id} + depends_on: + - task_key: initialization environments: - environment_key: serverless spec: client: "3" + parameters: + - name: catalog_name + default: ${var.catalog_name} + - name: schema_name + default: ${resources.schemas.main_schema.name} diff --git a/filepush/dab/src/initialization.py b/filepush/dab/src/initialization.py index e69de29b..46b10aaa 100644 --- a/filepush/dab/src/initialization.py +++ b/filepush/dab/src/initialization.py @@ -0,0 +1,9 @@ +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--catalog_name", type=str, required=True) +parser.add_argument("--schema_name", type=str, required=True) +args = parser.parse_args() + +print(f"Catalog: {args.catalog_name}") +print(f"Schema: {args.schema_name}") \ No newline at end of file From 9eb37b7764dd66095d5dfdb530dc17377c55aff0 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Mon, 15 Sep 2025 13:44:20 -0700 Subject: [PATCH 27/60] Create basic folder structure in volume --- filepush/dab/resources/job.yml | 10 ++++++++ filepush/dab/resources/pipeline.yml | 3 ++- filepush/dab/src/initialization.py | 39 +++++++++++++++++++++++++++-- 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/filepush/dab/resources/job.yml b/filepush/dab/resources/job.yml index 744ba835..ad6d1568 100644 --- a/filepush/dab/resources/job.yml +++ b/filepush/dab/resources/job.yml @@ -23,6 +23,12 @@ resources: - "{{job.parameters.catalog_name}}" - "--schema_name" - "{{job.parameters.schema_name}}" + - "--volume_path_root" + - "{{job.parameters.volume_path_root}}" + - "--volume_path_data" + - "{{job.parameters.volume_path_data}}" + - "--logging_level" + - "${bundle.target}" environment_key: serverless - task_key: trigger_refresh run_job_task: @@ -38,3 +44,7 @@ resources: default: ${var.catalog_name} - name: schema_name default: ${resources.schemas.main_schema.name} + - name: volume_path_root + default: ${resources.volumes.filepush_volume.volume_path} + - name: volume_path_data + default: ${resources.volumes.filepush_volume.volume_path}/data diff --git a/filepush/dab/resources/pipeline.yml b/filepush/dab/resources/pipeline.yml index 84a74f67..bb0d1757 100644 --- a/filepush/dab/resources/pipeline.yml +++ b/filepush/dab/resources/pipeline.yml @@ -11,4 +11,5 @@ resources: - file: path: ../src/pipelines/ingestion.py configuration: - filepush.volume_path: ${resources.volumes.filepush_volume.volume_path} + filepush.volume_path_root: ${resources.volumes.filepush_volume.volume_path} + filepush.volume_path_data: ${resources.volumes.filepush_volume.volume_path}/data diff --git a/filepush/dab/src/initialization.py b/filepush/dab/src/initialization.py index 46b10aaa..eaeea593 100644 --- a/filepush/dab/src/initialization.py +++ b/filepush/dab/src/initialization.py @@ -1,9 +1,44 @@ import argparse +import logging +from databricks.sdk import WorkspaceClient +# Parse arguments parser = argparse.ArgumentParser() parser.add_argument("--catalog_name", type=str, required=True) 
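# (Editor's note, illustration only) These flags are not meant to be typed by hand:
# the configuration_job task above forwards them as spark_python_task parameters,
# e.g. "--catalog_name {{job.parameters.catalog_name}}", and the job-level
# parameters default to ${var.catalog_name} and the deployed schema name, so
# parse_args() below effectively receives something like
#   ["--catalog_name", "main", "--schema_name", "filepushschema"].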
parser.add_argument("--schema_name", type=str, required=True) +parser.add_argument("--volume_path_root", type=str, required=True) +parser.add_argument("--volume_path_data", type=str, required=True) +parser.add_argument("--logging_level", type=str, required=False, default="dev") args = parser.parse_args() -print(f"Catalog: {args.catalog_name}") -print(f"Schema: {args.schema_name}") \ No newline at end of file +catalog_name = args.catalog_name +schema_name = args.schema_name +volume_path_root = args.volume_path_root +volume_path_data = args.volume_path_data +logging_level = logging.DEBUG if args.logging_level == "dev" else logging.INFO + +# Logging +logging.basicConfig( + level=logging_level, + format="%(asctime)s [%(levelname)s] %(module)s - %(message)s" +) +logger = logging.getLogger(__name__) # per-module logger + +# Initialize workspace client +ws = WorkspaceClient() + +# Set property to schema +logger.info(f"Setting property to schema {catalog_name}.{schema_name}") +logger.debug(f"Volume path root: {volume_path_root}") +logger.debug(f"Volume path data: {volume_path_data}") +ws.schemas.update(full_name=f"{catalog_name}.{schema_name}", properties={ + "filepush.volume_path_root": volume_path_root, + "filepush.volume_path_data": volume_path_data +}) +logger.info(f"Schema {catalog_name}.{schema_name} configured") + +# Initialize volume folder structure +logger.info(f"Initializing volume folder structure {volume_path_root}") +logger.debug(f"Creating volume directory {volume_path_data}") +ws.files.create_directory(volume_path_data) +logger.info(f"Volume {volume_path_root} configured") From 53788238fed4bce057117f763ebde6b66a12ea1f Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Mon, 15 Sep 2025 15:33:04 -0700 Subject: [PATCH 28/60] Infer data path --- filepush/dab/resources/job.yml | 4 ---- filepush/dab/resources/pipeline.yml | 3 +-- filepush/dab/src/initialization.py | 5 ++--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/filepush/dab/resources/job.yml b/filepush/dab/resources/job.yml index ad6d1568..3fc21603 100644 --- a/filepush/dab/resources/job.yml +++ b/filepush/dab/resources/job.yml @@ -25,8 +25,6 @@ resources: - "{{job.parameters.schema_name}}" - "--volume_path_root" - "{{job.parameters.volume_path_root}}" - - "--volume_path_data" - - "{{job.parameters.volume_path_data}}" - "--logging_level" - "${bundle.target}" environment_key: serverless @@ -46,5 +44,3 @@ resources: default: ${resources.schemas.main_schema.name} - name: volume_path_root default: ${resources.volumes.filepush_volume.volume_path} - - name: volume_path_data - default: ${resources.volumes.filepush_volume.volume_path}/data diff --git a/filepush/dab/resources/pipeline.yml b/filepush/dab/resources/pipeline.yml index bb0d1757..037b8061 100644 --- a/filepush/dab/resources/pipeline.yml +++ b/filepush/dab/resources/pipeline.yml @@ -5,11 +5,10 @@ resources: refresh_pipeline: name: ${var.resource_name_prefix}refresh_pipeline catalog: ${var.catalog_name} - schema: ${var.schema_name} + schema: ${resources.schemas.main_schema.name} serverless: true libraries: - file: path: ../src/pipelines/ingestion.py configuration: filepush.volume_path_root: ${resources.volumes.filepush_volume.volume_path} - filepush.volume_path_data: ${resources.volumes.filepush_volume.volume_path}/data diff --git a/filepush/dab/src/initialization.py b/filepush/dab/src/initialization.py index eaeea593..351927fd 100644 --- a/filepush/dab/src/initialization.py +++ b/filepush/dab/src/initialization.py @@ -7,14 +7,13 @@ 
parser.add_argument("--catalog_name", type=str, required=True) parser.add_argument("--schema_name", type=str, required=True) parser.add_argument("--volume_path_root", type=str, required=True) -parser.add_argument("--volume_path_data", type=str, required=True) parser.add_argument("--logging_level", type=str, required=False, default="dev") args = parser.parse_args() catalog_name = args.catalog_name schema_name = args.schema_name volume_path_root = args.volume_path_root -volume_path_data = args.volume_path_data +volume_path_data = args.volume_path_root + "/data" logging_level = logging.DEBUG if args.logging_level == "dev" else logging.INFO # Logging @@ -39,6 +38,6 @@ # Initialize volume folder structure logger.info(f"Initializing volume folder structure {volume_path_root}") -logger.debug(f"Creating volume directory {volume_path_data}") +logger.debug(f"Creating data directory {volume_path_data}") ws.files.create_directory(volume_path_data) logger.info(f"Volume {volume_path_root} configured") From ebff04a5a9975ee947ad9928c3161c02d2658d9f Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Mon, 15 Sep 2025 16:18:14 -0700 Subject: [PATCH 29/60] tidy up directory --- filepush/dab/resources/pipeline.yml | 3 ++- filepush/dab/src/{pipelines => }/ingestion.py | 0 .../dab/src/pipelines/dab_readfiles_kernel.sql | 15 --------------- 3 files changed, 2 insertions(+), 16 deletions(-) rename filepush/dab/src/{pipelines => }/ingestion.py (100%) delete mode 100644 filepush/dab/src/pipelines/dab_readfiles_kernel.sql diff --git a/filepush/dab/resources/pipeline.yml b/filepush/dab/resources/pipeline.yml index 037b8061..e30c4ae6 100644 --- a/filepush/dab/resources/pipeline.yml +++ b/filepush/dab/resources/pipeline.yml @@ -9,6 +9,7 @@ resources: serverless: true libraries: - file: - path: ../src/pipelines/ingestion.py + path: ../src/ingestion.py + root_path: ../src configuration: filepush.volume_path_root: ${resources.volumes.filepush_volume.volume_path} diff --git a/filepush/dab/src/pipelines/ingestion.py b/filepush/dab/src/ingestion.py similarity index 100% rename from filepush/dab/src/pipelines/ingestion.py rename to filepush/dab/src/ingestion.py diff --git a/filepush/dab/src/pipelines/dab_readfiles_kernel.sql b/filepush/dab/src/pipelines/dab_readfiles_kernel.sql deleted file mode 100644 index 55666737..00000000 --- a/filepush/dab/src/pipelines/dab_readfiles_kernel.sql +++ /dev/null @@ -1,15 +0,0 @@ --- Kernel template for read_files -SELECT - * -FROM - read_files( - '%s', - ignoreCorruptFiles => 'true', -- If a different malformed file pushed and RocksDB has it enlisted, this will ignore the error when the file is deleted. - ignoreMissingFiles => 'true' -- If a different file format is accidentally pushed and RocksDB has it enlisted, this will ignore the error when the file is deleted. 
- -- Do not change anything above - -- Add any additional options below - -- Example: - -- , - -- header => 'true', - -- escape => '"' - ) From 802549cbda2729fa66b21f604be4a18c7cbf4aa5 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Mon, 15 Sep 2025 20:53:34 -0700 Subject: [PATCH 30/60] Migrate filed and add initialization script --- filepush/dab/resources/job.yml | 2 +- filepush/dab/src/configs/tables.json | 11 +++++++ filepush/dab/src/debug_table.py | 31 +++++++++++++++++++ .../dab/src/{ => utils}/initialization.py | 12 ++++++- 4 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 filepush/dab/src/configs/tables.json create mode 100644 filepush/dab/src/debug_table.py rename filepush/dab/src/{ => utils}/initialization.py (84%) diff --git a/filepush/dab/resources/job.yml b/filepush/dab/resources/job.yml index 3fc21603..f8fdaac9 100644 --- a/filepush/dab/resources/job.yml +++ b/filepush/dab/resources/job.yml @@ -17,7 +17,7 @@ resources: tasks: - task_key: initialization spark_python_task: - python_file: ../src/initialization.py + python_file: ../src/utils/initialization.py parameters: - "--catalog_name" - "{{job.parameters.catalog_name}}" diff --git a/filepush/dab/src/configs/tables.json b/filepush/dab/src/configs/tables.json new file mode 100644 index 00000000..81cb1560 --- /dev/null +++ b/filepush/dab/src/configs/tables.json @@ -0,0 +1,11 @@ +[ + { + "name": "dummy", + "format": "csv", + "format_options": { + "header": "true", + "escape": "\"" + }, + "schema_hints": "id int, name string" + } +] diff --git a/filepush/dab/src/debug_table.py b/filepush/dab/src/debug_table.py new file mode 100644 index 00000000..ef8559ec --- /dev/null +++ b/filepush/dab/src/debug_table.py @@ -0,0 +1,31 @@ +# Databricks notebook source +import json +import os + +# Widget +dbutils.widgets.text("table_name", "", "Table Name") + +# Load configs to environment json +environment_path = "./configs/environment.json" +table_configs_path = "./configs/tables.json" + +assert os.path.exists(environment_path), f"Missing environment file: {environment_path}. Have you run `databricks bundle run configuration_job`?" +assert os.path.exists(table_configs_path), f"Missing table configs file: {table_configs_path}" + +with open(environment_path, "r") as f: + configs = json.load(f) +with open(table_configs_path, "r") as f: + table_configs = json.load(f) + +catalog_name = configs["catalog_name"] +schema_name = configs["schema_name"] +table_name = dbutils.widgets.get("table_name") +table_volume_path_data = configs["volume_path_data"] + f"/{table_name}" + +# Locate table config +matches = [table_config for table_config in table_configs if table_config.get("name") == "dummy"] +assert len(matches) == 1, f"Expect exactly 1 config for table `{table_name}`. Found {len(matches)}. Please fix the config file." 
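# --- Editor's illustration (not part of the notebook) --------------------------
# configs/tables.json is expected to hold a list with one object per table, keyed
# by "name"; the lookup above relies on exactly one entry matching. A second
# (hypothetical) entry would follow the same shape as the bundled "dummy" example:
#
#   {
#     "name": "all_employees",
#     "format": "csv",
#     "format_options": {"header": "true", "escape": "\""},
#     "schema_hints": "id int, name string"
#   }
# --------------------------------------------------------------------------------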
+table_config = matches[0] + +print(f"Table Volume Path: {table_volume_path_data}") +print(f"Table Config: {table_config}") diff --git a/filepush/dab/src/initialization.py b/filepush/dab/src/utils/initialization.py similarity index 84% rename from filepush/dab/src/initialization.py rename to filepush/dab/src/utils/initialization.py index 351927fd..28b7fe64 100644 --- a/filepush/dab/src/initialization.py +++ b/filepush/dab/src/utils/initialization.py @@ -1,6 +1,7 @@ +from databricks.sdk import WorkspaceClient import argparse +import json import logging -from databricks.sdk import WorkspaceClient # Parse arguments parser = argparse.ArgumentParser() @@ -41,3 +42,12 @@ logger.debug(f"Creating data directory {volume_path_data}") ws.files.create_directory(volume_path_data) logger.info(f"Volume {volume_path_root} configured") + +# Dump configs to environment json +with open("./configs/environment.json", "w") as f: + json.dump({ + "catalog_name": catalog_name, + "schema_name": schema_name, + "volume_path_root": volume_path_root, + "volume_path_data": volume_path_data + }, f) From 5ef1e055afeb9200fe4cbdcad4374070c25847c2 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Mon, 15 Sep 2025 20:56:14 -0700 Subject: [PATCH 31/60] Fix relative path issue --- filepush/dab/src/utils/initialization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filepush/dab/src/utils/initialization.py b/filepush/dab/src/utils/initialization.py index 28b7fe64..2d7459f4 100644 --- a/filepush/dab/src/utils/initialization.py +++ b/filepush/dab/src/utils/initialization.py @@ -44,7 +44,7 @@ logger.info(f"Volume {volume_path_root} configured") # Dump configs to environment json -with open("./configs/environment.json", "w") as f: +with open("../configs/environment.json", "w") as f: json.dump({ "catalog_name": catalog_name, "schema_name": schema_name, From b9546d06098b2793c2612153e0246ca1c1cac7c1 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Mon, 15 Sep 2025 21:51:53 -0700 Subject: [PATCH 32/60] Add config manager and a working debug notebook --- filepush/dab/src/debug_table.py | 14 +++++++++++++- filepush/dab/src/utils/configmanager.py | 0 filepush/dab/src/utils/initialization.py | 5 +++++ 3 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 filepush/dab/src/utils/configmanager.py diff --git a/filepush/dab/src/debug_table.py b/filepush/dab/src/debug_table.py index ef8559ec..a7343fe3 100644 --- a/filepush/dab/src/debug_table.py +++ b/filepush/dab/src/debug_table.py @@ -20,12 +20,24 @@ catalog_name = configs["catalog_name"] schema_name = configs["schema_name"] table_name = dbutils.widgets.get("table_name") +assert table_name, "Please provide a table name" table_volume_path_data = configs["volume_path_data"] + f"/{table_name}" # Locate table config -matches = [table_config for table_config in table_configs if table_config.get("name") == "dummy"] +matches = [table_config for table_config in table_configs if table_config.get("name") == table_name] assert len(matches) == 1, f"Expect exactly 1 config for table `{table_name}`. Found {len(matches)}. Please fix the config file." 
table_config = matches[0] print(f"Table Volume Path: {table_volume_path_data}") print(f"Table Config: {table_config}") + +# COMMAND ---------- + +import tempfile +from utils import configmanager + +with tempfile.TemporaryDirectory() as tmpdir: + reader = spark.readStream.format("cloudFiles") + reader = configmanager.apply_table_config(reader, table_config) + reader.option("cloudFiles.schemaLocation", tmpdir) + display(reader.load(table_volume_path_data)) \ No newline at end of file diff --git a/filepush/dab/src/utils/configmanager.py b/filepush/dab/src/utils/configmanager.py new file mode 100644 index 00000000..e69de29b diff --git a/filepush/dab/src/utils/initialization.py b/filepush/dab/src/utils/initialization.py index 2d7459f4..a2fdd6d6 100644 --- a/filepush/dab/src/utils/initialization.py +++ b/filepush/dab/src/utils/initialization.py @@ -41,6 +41,11 @@ logger.info(f"Initializing volume folder structure {volume_path_root}") logger.debug(f"Creating data directory {volume_path_data}") ws.files.create_directory(volume_path_data) +with open("../configs/tables.json", "r") as f: + for table in json.load(f): + table_volume_path_data = {volume_path_data}/{table['name']} + logger.debug(f"Creating table directory {table_volume_path_data}") + ws.files.create_directory(table_volume_path_data) logger.info(f"Volume {volume_path_root} configured") # Dump configs to environment json From 299997ee685d88ed5bca23c3912431fabef42c71 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 16 Sep 2025 11:25:59 -0700 Subject: [PATCH 33/60] Working debug notebook --- filepush/dab/src/debug_table.py | 43 ------------ filepush/dab/src/debug_table_config.py | 85 ++++++++++++++++++++++++ filepush/dab/src/utils/configmanager.py | 18 +++++ filepush/dab/src/utils/initialization.py | 28 ++++---- 4 files changed, 117 insertions(+), 57 deletions(-) delete mode 100644 filepush/dab/src/debug_table.py create mode 100644 filepush/dab/src/debug_table_config.py diff --git a/filepush/dab/src/debug_table.py b/filepush/dab/src/debug_table.py deleted file mode 100644 index a7343fe3..00000000 --- a/filepush/dab/src/debug_table.py +++ /dev/null @@ -1,43 +0,0 @@ -# Databricks notebook source -import json -import os - -# Widget -dbutils.widgets.text("table_name", "", "Table Name") - -# Load configs to environment json -environment_path = "./configs/environment.json" -table_configs_path = "./configs/tables.json" - -assert os.path.exists(environment_path), f"Missing environment file: {environment_path}. Have you run `databricks bundle run configuration_job`?" -assert os.path.exists(table_configs_path), f"Missing table configs file: {table_configs_path}" - -with open(environment_path, "r") as f: - configs = json.load(f) -with open(table_configs_path, "r") as f: - table_configs = json.load(f) - -catalog_name = configs["catalog_name"] -schema_name = configs["schema_name"] -table_name = dbutils.widgets.get("table_name") -assert table_name, "Please provide a table name" -table_volume_path_data = configs["volume_path_data"] + f"/{table_name}" - -# Locate table config -matches = [table_config for table_config in table_configs if table_config.get("name") == table_name] -assert len(matches) == 1, f"Expect exactly 1 config for table `{table_name}`. Found {len(matches)}. Please fix the config file." 
-table_config = matches[0] - -print(f"Table Volume Path: {table_volume_path_data}") -print(f"Table Config: {table_config}") - -# COMMAND ---------- - -import tempfile -from utils import configmanager - -with tempfile.TemporaryDirectory() as tmpdir: - reader = spark.readStream.format("cloudFiles") - reader = configmanager.apply_table_config(reader, table_config) - reader.option("cloudFiles.schemaLocation", tmpdir) - display(reader.load(table_volume_path_data)) \ No newline at end of file diff --git a/filepush/dab/src/debug_table_config.py b/filepush/dab/src/debug_table_config.py new file mode 100644 index 00000000..08259d2f --- /dev/null +++ b/filepush/dab/src/debug_table_config.py @@ -0,0 +1,85 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # Assign the table config JSON you would like to debug to variable `table_config` +# MAGIC For example, +# MAGIC ``` +# MAGIC table_config = r''' +# MAGIC { +# MAGIC "name": "all_employees", +# MAGIC "format": "csv", +# MAGIC "format_options": { +# MAGIC "header": "true", +# MAGIC "escape": "\"" +# MAGIC } +# MAGIC "schema_hints": "id int, name string" +# MAGIC } +# MAGIC ''' +# MAGIC ``` + +# COMMAND ---------- + +table_config = r''' +{ + "name": "all_employees", + "format": "csv", + "format_options": { + "header": "true", + "escape": "\"" + }, + "schema_hints": "id int, name string" +} +''' + +# COMMAND ---------- + +import json +import os +from databricks.sdk import WorkspaceClient +from databricks.sdk.errors.platform import NotFound + +# Initialize workspace client +ws = WorkspaceClient() + +# Load configs from environment json +environment_path = "./configs/environment.json" +assert os.path.exists(environment_path), f"Missing environment file: {environment_path}. Have you run `databricks bundle run configuration_job`?" +with open(environment_path, "r") as f: + configs = json.load(f) + +catalog_name = configs["catalog_name"] +schema_name = configs["schema_name"] +table_config_json = json.loads(table_config) +table_name = table_config_json["name"] +assert table_name, "Please provide a table name in the table_config" + +# Load table configs +table_configs_path = "./configs/tables.json" +assert os.path.exists(table_configs_path), f"Missing table configs file: {table_configs_path}. Please following README.md to create one, deploy and run configuration_job." +with open(table_configs_path, "r") as f: + table_configs = json.load(f) +matches = [table_config for table_config in table_configs if table_config.get("name") == table_name] +assert len(matches) == 1, f"Expect exactly 1 config for table `{table_name}`. Found {len(matches)}. Please fix the config file and run configuration_job" + +table_volume_path_data = configs["volume_path_data"] + f"/{table_name}" +try: + ws.files.get_directory_metadata(table_volume_path_data) + iter = ws.files.list_directory_contents(table_volume_path_data) + next(iter) +except NotFound: + assert False, f"Table data path not found for table `{table_name}`. Have you run `databricks bundle run configuration_job`?" +except StopIteration: + assert False, f"No data file found in {table_volume_path_data}. Please upload at least 1 file." 
+ +print(f"Table Volume Path: {table_volume_path_data}") +print(f"Table Config:\n{table_config}") + +# COMMAND ---------- + +import tempfile +from utils import configmanager + +with tempfile.TemporaryDirectory() as tmpdir: + reader = spark.readStream.format("cloudFiles") + reader = configmanager.apply_table_config(reader, table_config_json) + reader.option("cloudFiles.schemaLocation", tmpdir) + display(reader.load(table_volume_path_data)) diff --git a/filepush/dab/src/utils/configmanager.py b/filepush/dab/src/utils/configmanager.py index e69de29b..4817ba5e 100644 --- a/filepush/dab/src/utils/configmanager.py +++ b/filepush/dab/src/utils/configmanager.py @@ -0,0 +1,18 @@ +from pyspark.sql.streaming import DataStreamReader + +def apply_table_config(reader: DataStreamReader, table_config: dict) -> DataStreamReader: + fmt = table_config.get("format") + assert fmt is not None, f"format is required for table {table_config.get('name')}" + reader = reader.option("cloudFiles.format", fmt) + + # format-specific options + fmt_opts = table_config.get("format_options", {}) + for k, v in fmt_opts.items(): + reader = reader.option(k, v) + + # schema hints + schema_hints = table_config.get("schema_hints") + if schema_hints: + reader = reader.option("cloudFiles.schemaHints", schema_hints) + + return reader \ No newline at end of file diff --git a/filepush/dab/src/utils/initialization.py b/filepush/dab/src/utils/initialization.py index a2fdd6d6..09f79947 100644 --- a/filepush/dab/src/utils/initialization.py +++ b/filepush/dab/src/utils/initialization.py @@ -19,8 +19,8 @@ # Logging logging.basicConfig( - level=logging_level, - format="%(asctime)s [%(levelname)s] %(module)s - %(message)s" + level=logging_level, + format="%(asctime)s [%(levelname)s] %(module)s - %(message)s" ) logger = logging.getLogger(__name__) # per-module logger @@ -32,8 +32,8 @@ logger.debug(f"Volume path root: {volume_path_root}") logger.debug(f"Volume path data: {volume_path_data}") ws.schemas.update(full_name=f"{catalog_name}.{schema_name}", properties={ - "filepush.volume_path_root": volume_path_root, - "filepush.volume_path_data": volume_path_data + "filepush.volume_path_root": volume_path_root, + "filepush.volume_path_data": volume_path_data }) logger.info(f"Schema {catalog_name}.{schema_name} configured") @@ -42,17 +42,17 @@ logger.debug(f"Creating data directory {volume_path_data}") ws.files.create_directory(volume_path_data) with open("../configs/tables.json", "r") as f: - for table in json.load(f): - table_volume_path_data = {volume_path_data}/{table['name']} - logger.debug(f"Creating table directory {table_volume_path_data}") - ws.files.create_directory(table_volume_path_data) + for table in json.load(f): + table_volume_path_data = f"{volume_path_data}/{table['name']}" + logger.debug(f"Creating table directory {table_volume_path_data}") + ws.files.create_directory(table_volume_path_data) logger.info(f"Volume {volume_path_root} configured") # Dump configs to environment json with open("../configs/environment.json", "w") as f: - json.dump({ - "catalog_name": catalog_name, - "schema_name": schema_name, - "volume_path_root": volume_path_root, - "volume_path_data": volume_path_data - }, f) + json.dump({ + "catalog_name": catalog_name, + "schema_name": schema_name, + "volume_path_root": volume_path_root, + "volume_path_data": volume_path_data + }, f) From e7eb8efaaac7502aa93b88ec4a85c4ff1907d8f1 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 16 Sep 2025 16:33:07 -0700 Subject: [PATCH 34/60] Refactor managers --- 
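This refactor splits the old configmanager in two: envmanager loads configs/environment.json (the file written by configuration_job) and returns it as a dict, while tablemanager, the renamed configmanager, turns a per-table config dict into Auto Loader reader options. The sketch below only illustrates how the two helpers are meant to be combined from a Databricks notebook; it assumes the notebook globals `spark` and `display`, that the notebook sits next to the utils package, and a hypothetical `employees` CSV table config. It is not code added by this patch.

    import tempfile
    from utils import envmanager, tablemanager

    env = envmanager.get_env_config()            # contents of configs/environment.json
    table_cfg = {                                # hypothetical table config
        "name": "employees",
        "format": "csv",
        "format_options": {"header": "true", "escape": "\""},
        "schema_hints": "id int, name string",
    }
    data_path = f"{env['volume_path_data']}/{table_cfg['name']}"

    with tempfile.TemporaryDirectory() as schema_dir:
        reader = spark.readStream.format("cloudFiles")               # notebook-provided SparkSession
        reader = tablemanager.apply_table_config(reader, table_cfg)  # format, options, schema hints
        reader = reader.option("cloudFiles.schemaLocation", schema_dir)
        display(reader.load(data_path))                              # Databricks display()

Keeping environment discovery separate from reader construction lets the debug notebook and, later in the series, the DLT ingestion pipeline share the same table-config handling.
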
filepush/dab/src/utils/envmanager.py | 9 +++++++++ .../dab/src/utils/{configmanager.py => tablemanager.py} | 5 +++++ 2 files changed, 14 insertions(+) create mode 100644 filepush/dab/src/utils/envmanager.py rename filepush/dab/src/utils/{configmanager.py => tablemanager.py} (77%) diff --git a/filepush/dab/src/utils/envmanager.py b/filepush/dab/src/utils/envmanager.py new file mode 100644 index 00000000..7a92547c --- /dev/null +++ b/filepush/dab/src/utils/envmanager.py @@ -0,0 +1,9 @@ +import os +import json + +def get_env_config() -> dict: + environment_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "configs", "environment.json") + assert os.path.exists(environment_path), f"Missing environment file: {environment_path}. Have you run `databricks bundle run configuration_job`?" + with open(environment_path, "r") as f: + configs = json.load(f) + return configs diff --git a/filepush/dab/src/utils/configmanager.py b/filepush/dab/src/utils/tablemanager.py similarity index 77% rename from filepush/dab/src/utils/configmanager.py rename to filepush/dab/src/utils/tablemanager.py index 4817ba5e..d25ce408 100644 --- a/filepush/dab/src/utils/configmanager.py +++ b/filepush/dab/src/utils/tablemanager.py @@ -1,5 +1,10 @@ + from pyspark.sql.streaming import DataStreamReader +def get_table_configs() -> dict: + config_path = json_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "configs", "config.json") + return load_json(config_path) + def apply_table_config(reader: DataStreamReader, table_config: dict) -> DataStreamReader: fmt = table_config.get("format") assert fmt is not None, f"format is required for table {table_config.get('name')}" From a4cbc520d4492eeae7793259f6be42da25569964 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 16 Sep 2025 17:42:50 -0700 Subject: [PATCH 35/60] Successfully create placeholder table --- filepush/dab/src/debug_table_config.py | 44 +++++++------------------- filepush/dab/src/ingestion.py | 18 +++++++++++ filepush/dab/src/utils/envmanager.py | 8 ++--- filepush/dab/src/utils/tablemanager.py | 44 +++++++++++++++++++++++--- 4 files changed, 73 insertions(+), 41 deletions(-) diff --git a/filepush/dab/src/debug_table_config.py b/filepush/dab/src/debug_table_config.py index 08259d2f..7e003c3f 100644 --- a/filepush/dab/src/debug_table_config.py +++ b/filepush/dab/src/debug_table_config.py @@ -1,6 +1,6 @@ # Databricks notebook source # MAGIC %md -# MAGIC # Assign the table config JSON you would like to debug to variable `table_config` +# MAGIC # Paste the table config JSON you would like to debug and assign to variable `table_config` # MAGIC For example, # MAGIC ``` # MAGIC table_config = r''' @@ -34,52 +34,32 @@ import json import os -from databricks.sdk import WorkspaceClient -from databricks.sdk.errors.platform import NotFound - -# Initialize workspace client -ws = WorkspaceClient() +from utils import envmanager +from utils import tablemanager # Load configs from environment json -environment_path = "./configs/environment.json" -assert os.path.exists(environment_path), f"Missing environment file: {environment_path}. Have you run `databricks bundle run configuration_job`?" 
-with open(environment_path, "r") as f: - configs = json.load(f) +config = envmanager.get_config() +catalog_name = config["catalog_name"] +schema_name = config["schema_name"] -catalog_name = configs["catalog_name"] -schema_name = configs["schema_name"] +# Load table configs table_config_json = json.loads(table_config) table_name = table_config_json["name"] -assert table_name, "Please provide a table name in the table_config" - -# Load table configs -table_configs_path = "./configs/tables.json" -assert os.path.exists(table_configs_path), f"Missing table configs file: {table_configs_path}. Please following README.md to create one, deploy and run configuration_job." -with open(table_configs_path, "r") as f: - table_configs = json.load(f) +assert table_name, "Please provide a table name in the table_config json" +table_configs = tablemanager.get_configs() matches = [table_config for table_config in table_configs if table_config.get("name") == table_name] assert len(matches) == 1, f"Expect exactly 1 config for table `{table_name}`. Found {len(matches)}. Please fix the config file and run configuration_job" - -table_volume_path_data = configs["volume_path_data"] + f"/{table_name}" -try: - ws.files.get_directory_metadata(table_volume_path_data) - iter = ws.files.list_directory_contents(table_volume_path_data) - next(iter) -except NotFound: - assert False, f"Table data path not found for table `{table_name}`. Have you run `databricks bundle run configuration_job`?" -except StopIteration: - assert False, f"No data file found in {table_volume_path_data}. Please upload at least 1 file." +table_volume_path_data = tablemanager.get_table_volume_path(table_name) +assert tablemanager.has_data_file(table_name), f"No data file found in {table_volume_path_data}. Please upload at least 1 file." 
print(f"Table Volume Path: {table_volume_path_data}") -print(f"Table Config:\n{table_config}") # COMMAND ---------- import tempfile -from utils import configmanager with tempfile.TemporaryDirectory() as tmpdir: reader = spark.readStream.format("cloudFiles") - reader = configmanager.apply_table_config(reader, table_config_json) + reader = tablemanager.apply_table_config(reader, table_config_json) reader.option("cloudFiles.schemaLocation", tmpdir) display(reader.load(table_volume_path_data)) diff --git a/filepush/dab/src/ingestion.py b/filepush/dab/src/ingestion.py index e69de29b..72f11b7d 100644 --- a/filepush/dab/src/ingestion.py +++ b/filepush/dab/src/ingestion.py @@ -0,0 +1,18 @@ +import dlt +from utils import tablemanager + +table_configs = tablemanager.get_configs() + +for table_config in table_configs: + tablemanager.validate_config(table_config) + table_name = table_config['name'] + table_volume_path = tablemanager.get_table_volume_path(table_name) + @dlt.table( + name = table_name, + comment = "File push created table", + table_properties = {"filepush.table_volume_path_data": tablemanager.get_table_volume_path(table_name)} + ) + def create_table(): + reader = spark.readStream.format("cloudFiles") + reader = tablemanager.apply_table_config(reader, table_config) + return reader.load(table_volume_path) \ No newline at end of file diff --git a/filepush/dab/src/utils/envmanager.py b/filepush/dab/src/utils/envmanager.py index 7a92547c..0eee314b 100644 --- a/filepush/dab/src/utils/envmanager.py +++ b/filepush/dab/src/utils/envmanager.py @@ -1,9 +1,9 @@ import os import json -def get_env_config() -> dict: - environment_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "configs", "environment.json") - assert os.path.exists(environment_path), f"Missing environment file: {environment_path}. Have you run `databricks bundle run configuration_job`?" - with open(environment_path, "r") as f: +def get_config() -> dict: + json_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "configs", "environment.json") + assert os.path.exists(json_path), f"Missing environment file: {json_path}. Have you run `databricks bundle run configuration_job`?" + with open(json_path, "r") as f: configs = json.load(f) return configs diff --git a/filepush/dab/src/utils/tablemanager.py b/filepush/dab/src/utils/tablemanager.py index d25ce408..fae57a6e 100644 --- a/filepush/dab/src/utils/tablemanager.py +++ b/filepush/dab/src/utils/tablemanager.py @@ -1,9 +1,40 @@ - +import os +import json +from . import envmanager from pyspark.sql.streaming import DataStreamReader +from databricks.sdk import WorkspaceClient +from databricks.sdk.errors.platform import NotFound + +def validate_config(config: dict): + pass + +def get_table_volume_path(table_name: str) -> str: + # Initialize workspace client + ws = WorkspaceClient() + table_volume_path_data = os.path.join(envmanager.get_config()["volume_path_data"], table_name) + try: + ws.files.get_directory_metadata(table_volume_path_data) + except NotFound: + assert False, f"Table data path not found for table `{table_name}`. Have you run `databricks bundle run configuration_job`?" 
+ return table_volume_path_data + +def has_data_file(table_name: str) -> bool: + # Initialize workspace client + ws = WorkspaceClient() + table_volume_path_data = get_table_volume_path(table_name) + try: + iter = ws.files.list_directory_contents(table_volume_path_data) + next(iter) + except StopIteration: + return False + return True -def get_table_configs() -> dict: - config_path = json_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "configs", "config.json") - return load_json(config_path) +def get_configs() -> list: + json_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "configs", "tables.json") + assert os.path.exists(json_path), f"Missing table configs file: {json_path}. Please following README.md to create one, deploy and run configuration_job." + with open(json_path, "r") as f: + configs = json.load(f) + return configs def apply_table_config(reader: DataStreamReader, table_config: dict) -> DataStreamReader: fmt = table_config.get("format") @@ -16,8 +47,11 @@ def apply_table_config(reader: DataStreamReader, table_config: dict) -> DataStre reader = reader.option(k, v) # schema hints + # always have _rescued_data + reader = reader.schema("_rescued_data STRING") schema_hints = table_config.get("schema_hints") if schema_hints: reader = reader.option("cloudFiles.schemaHints", schema_hints) - return reader \ No newline at end of file + return reader + \ No newline at end of file From 322bc4827f80cf6681b6ec6c9188ca7eab608912 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Wed, 17 Sep 2025 14:29:18 -0700 Subject: [PATCH 36/60] Solve the empty table DLT resolve issue --- filepush/dab/src/ingestion.py | 13 ++++++++++--- filepush/dab/src/utils/tablemanager.py | 15 +++++++++++++-- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/filepush/dab/src/ingestion.py b/filepush/dab/src/ingestion.py index 72f11b7d..99f05c1a 100644 --- a/filepush/dab/src/ingestion.py +++ b/filepush/dab/src/ingestion.py @@ -7,12 +7,19 @@ tablemanager.validate_config(table_config) table_name = table_config['name'] table_volume_path = tablemanager.get_table_volume_path(table_name) - @dlt.table( + + dlt.create_streaming_table( name = table_name, comment = "File push created table", table_properties = {"filepush.table_volume_path_data": tablemanager.get_table_volume_path(table_name)} ) - def create_table(): + if not tablemanager.has_data_file(table_name) and not tablemanager.is_table_created(table_name): + @dlt.append_flow(target = table_name) + def noop_to_table(): + return tablemanager.get_placeholder_stream(spark.readStream) + else: + @dlt.append_flow(target = table_name) + def append_to_table(): reader = spark.readStream.format("cloudFiles") reader = tablemanager.apply_table_config(reader, table_config) - return reader.load(table_volume_path) \ No newline at end of file + return reader.load(table_volume_path) diff --git a/filepush/dab/src/utils/tablemanager.py b/filepush/dab/src/utils/tablemanager.py index fae57a6e..d5790cac 100644 --- a/filepush/dab/src/utils/tablemanager.py +++ b/filepush/dab/src/utils/tablemanager.py @@ -29,6 +29,19 @@ def has_data_file(table_name: str) -> bool: return False return True +def is_table_created(table_name: str) -> bool: + # Initialize workspace client + ws = WorkspaceClient() + return ws.tables.exists(full_name=f"{envmanager.get_config()["catalog_name"]}.{envmanager.get_config()["schema_name"]}.{table_name}").table_exists + +def get_placeholder_stream(reader: DataStreamReader) -> DataStreamReader: + # Streaming source that produces empty 
micro-batches (but is STILL streaming) + return ( + reader.format("rate").option("rowsPerSecond", 1).load() + .selectExpr("CAST(NULL AS STRING) AS _rescued_data") + .where("1=0") # no rows, just preserves streaming lineage + ) + def get_configs() -> list: json_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "configs", "tables.json") assert os.path.exists(json_path), f"Missing table configs file: {json_path}. Please following README.md to create one, deploy and run configuration_job." @@ -47,8 +60,6 @@ def apply_table_config(reader: DataStreamReader, table_config: dict) -> DataStre reader = reader.option(k, v) # schema hints - # always have _rescued_data - reader = reader.schema("_rescued_data STRING") schema_hints = table_config.get("schema_hints") if schema_hints: reader = reader.option("cloudFiles.schemaHints", schema_hints) From 04b1662fa3682cdef321247847839b18fec7c8b5 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Wed, 17 Sep 2025 15:28:57 -0700 Subject: [PATCH 37/60] Better way to solve the empty DLT resolve --- filepush/dab/src/debug_table_config.py | 22 +++++++++++----------- filepush/dab/src/ingestion.py | 17 +++++++---------- filepush/dab/src/utils/tablemanager.py | 10 +--------- 3 files changed, 19 insertions(+), 30 deletions(-) diff --git a/filepush/dab/src/debug_table_config.py b/filepush/dab/src/debug_table_config.py index 7e003c3f..2e697a94 100644 --- a/filepush/dab/src/debug_table_config.py +++ b/filepush/dab/src/debug_table_config.py @@ -19,15 +19,15 @@ # COMMAND ---------- table_config = r''' -{ - "name": "all_employees", - "format": "csv", - "format_options": { - "header": "true", - "escape": "\"" - }, - "schema_hints": "id int, name string" -} + { + "name": "dummy", + "format": "csv", + "format_options": { + "header": "true", + "escape": "\"" + }, + "schema_hints": "id int, name string" + } ''' # COMMAND ---------- @@ -50,7 +50,7 @@ matches = [table_config for table_config in table_configs if table_config.get("name") == table_name] assert len(matches) == 1, f"Expect exactly 1 config for table `{table_name}`. Found {len(matches)}. Please fix the config file and run configuration_job" table_volume_path_data = tablemanager.get_table_volume_path(table_name) -assert tablemanager.has_data_file(table_name), f"No data file found in {table_volume_path_data}. Please upload at least 1 file." +assert tablemanager.has_data_file(table_name), f"No data file found in {table_volume_path_data}. 
Please upload at least 1 file to {table_volume_path_data}" print(f"Table Volume Path: {table_volume_path_data}") @@ -62,4 +62,4 @@ reader = spark.readStream.format("cloudFiles") reader = tablemanager.apply_table_config(reader, table_config_json) reader.option("cloudFiles.schemaLocation", tmpdir) - display(reader.load(table_volume_path_data)) + display(reader.load(table_volume_path_data)) \ No newline at end of file diff --git a/filepush/dab/src/ingestion.py b/filepush/dab/src/ingestion.py index 99f05c1a..622cc3d6 100644 --- a/filepush/dab/src/ingestion.py +++ b/filepush/dab/src/ingestion.py @@ -13,13 +13,10 @@ comment = "File push created table", table_properties = {"filepush.table_volume_path_data": tablemanager.get_table_volume_path(table_name)} ) - if not tablemanager.has_data_file(table_name) and not tablemanager.is_table_created(table_name): - @dlt.append_flow(target = table_name) - def noop_to_table(): - return tablemanager.get_placeholder_stream(spark.readStream) - else: - @dlt.append_flow(target = table_name) - def append_to_table(): - reader = spark.readStream.format("cloudFiles") - reader = tablemanager.apply_table_config(reader, table_config) - return reader.load(table_volume_path) + @dlt.append_flow(target = table_name) + def append_to_table(): + reader = spark.readStream.format("cloudFiles") + reader = tablemanager.apply_table_config(reader, table_config) + if not tablemanager.has_data_file(table_name) and not tablemanager.is_table_created(table_name): + reader.schema("_rescued_data STRING") # Use _rescued_data as placeholder + return reader.load(table_volume_path) diff --git a/filepush/dab/src/utils/tablemanager.py b/filepush/dab/src/utils/tablemanager.py index d5790cac..7cf85a14 100644 --- a/filepush/dab/src/utils/tablemanager.py +++ b/filepush/dab/src/utils/tablemanager.py @@ -32,15 +32,7 @@ def has_data_file(table_name: str) -> bool: def is_table_created(table_name: str) -> bool: # Initialize workspace client ws = WorkspaceClient() - return ws.tables.exists(full_name=f"{envmanager.get_config()["catalog_name"]}.{envmanager.get_config()["schema_name"]}.{table_name}").table_exists - -def get_placeholder_stream(reader: DataStreamReader) -> DataStreamReader: - # Streaming source that produces empty micro-batches (but is STILL streaming) - return ( - reader.format("rate").option("rowsPerSecond", 1).load() - .selectExpr("CAST(NULL AS STRING) AS _rescued_data") - .where("1=0") # no rows, just preserves streaming lineage - ) + return ws.tables.exists(full_name=f"{envmanager.get_config()['catalog_name']}.{envmanager.get_config()['schema_name']}.{table_name}").table_exists def get_configs() -> list: json_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "configs", "tables.json") From d1f83736bf42977c7a4d06d05fc6f931f72ee554 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Wed, 17 Sep 2025 16:33:14 -0700 Subject: [PATCH 38/60] Fix a flow name conflict issue --- filepush/dab/src/configs/tables.json | 14 +++++++++++++- filepush/dab/src/ingestion.py | 5 ++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/filepush/dab/src/configs/tables.json b/filepush/dab/src/configs/tables.json index 81cb1560..aba5d80d 100644 --- a/filepush/dab/src/configs/tables.json +++ b/filepush/dab/src/configs/tables.json @@ -1,6 +1,18 @@ [ { - "name": "dummy", + "name": "dummy1", + "format": "csv" + }, + { + "name": "dummy2", + "format": "csv", + "format_options": { + "header": "true", + "escape": "\"" + } + }, + { + "name": "dummy3", "format": "csv", "format_options": { "header": 
"true", diff --git a/filepush/dab/src/ingestion.py b/filepush/dab/src/ingestion.py index 622cc3d6..291a0ea0 100644 --- a/filepush/dab/src/ingestion.py +++ b/filepush/dab/src/ingestion.py @@ -13,7 +13,10 @@ comment = "File push created table", table_properties = {"filepush.table_volume_path_data": tablemanager.get_table_volume_path(table_name)} ) - @dlt.append_flow(target = table_name) + @dlt.append_flow( + target = table_name, + name = table_name + ) def append_to_table(): reader = spark.readStream.format("cloudFiles") reader = tablemanager.apply_table_config(reader, table_config) From 29b8609706890d89f689c75721f51e7a09594eae Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Thu, 18 Sep 2025 11:14:00 -0700 Subject: [PATCH 39/60] Working format manager --- filepush/dab/src/configs/tables.json | 2 - filepush/dab/src/debug_table_config.py | 10 ++-- filepush/dab/src/ingestion.py | 5 +- filepush/dab/src/utils/envmanager.py | 3 +- filepush/dab/src/utils/formatmanager.py | 71 +++++++++++++++++++++++++ filepush/dab/src/utils/tablemanager.py | 26 ++++----- 6 files changed, 92 insertions(+), 25 deletions(-) create mode 100644 filepush/dab/src/utils/formatmanager.py diff --git a/filepush/dab/src/configs/tables.json b/filepush/dab/src/configs/tables.json index aba5d80d..8bb40fa4 100644 --- a/filepush/dab/src/configs/tables.json +++ b/filepush/dab/src/configs/tables.json @@ -7,7 +7,6 @@ "name": "dummy2", "format": "csv", "format_options": { - "header": "true", "escape": "\"" } }, @@ -15,7 +14,6 @@ "name": "dummy3", "format": "csv", "format_options": { - "header": "true", "escape": "\"" }, "schema_hints": "id int, name string" diff --git a/filepush/dab/src/debug_table_config.py b/filepush/dab/src/debug_table_config.py index 2e697a94..537b17ff 100644 --- a/filepush/dab/src/debug_table_config.py +++ b/filepush/dab/src/debug_table_config.py @@ -8,8 +8,8 @@ # MAGIC "name": "all_employees", # MAGIC "format": "csv", # MAGIC "format_options": { -# MAGIC "header": "true", -# MAGIC "escape": "\"" +# MAGIC "escape": "\"", +# MAGIC "multiLine": "false" # MAGIC } # MAGIC "schema_hints": "id int, name string" # MAGIC } @@ -20,10 +20,9 @@ table_config = r''' { - "name": "dummy", + "name": "dummy1", "format": "csv", "format_options": { - "header": "true", "escape": "\"" }, "schema_hints": "id int, name string" @@ -59,7 +58,6 @@ import tempfile with tempfile.TemporaryDirectory() as tmpdir: - reader = spark.readStream.format("cloudFiles") - reader = tablemanager.apply_table_config(reader, table_config_json) + reader = tablemanager.apply_table_config(spark.readStream, table_config_json) reader.option("cloudFiles.schemaLocation", tmpdir) display(reader.load(table_volume_path_data)) \ No newline at end of file diff --git a/filepush/dab/src/ingestion.py b/filepush/dab/src/ingestion.py index 291a0ea0..bbc51abc 100644 --- a/filepush/dab/src/ingestion.py +++ b/filepush/dab/src/ingestion.py @@ -4,14 +4,13 @@ table_configs = tablemanager.get_configs() for table_config in table_configs: - tablemanager.validate_config(table_config) table_name = table_config['name'] table_volume_path = tablemanager.get_table_volume_path(table_name) dlt.create_streaming_table( name = table_name, comment = "File push created table", - table_properties = {"filepush.table_volume_path_data": tablemanager.get_table_volume_path(table_name)} + table_properties = {"filepush.table_volume_path_data": table_volume_path} ) @dlt.append_flow( target = table_name, @@ -20,6 +19,6 @@ def append_to_table(): reader = spark.readStream.format("cloudFiles") reader = 
tablemanager.apply_table_config(reader, table_config) - if not tablemanager.has_data_file(table_name) and not tablemanager.is_table_created(table_name): + if not tablemanager.has_data_file(table_name): reader.schema("_rescued_data STRING") # Use _rescued_data as placeholder return reader.load(table_volume_path) diff --git a/filepush/dab/src/utils/envmanager.py b/filepush/dab/src/utils/envmanager.py index 0eee314b..3f215c77 100644 --- a/filepush/dab/src/utils/envmanager.py +++ b/filepush/dab/src/utils/envmanager.py @@ -3,7 +3,8 @@ def get_config() -> dict: json_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "configs", "environment.json") - assert os.path.exists(json_path), f"Missing environment file: {json_path}. Have you run `databricks bundle run configuration_job`?" + if not os.path.exists(json_path): + raise RuntimeError(f"Missing environment file: {json_path}. Have you run `databricks bundle run configuration_job`?") with open(json_path, "r") as f: configs = json.load(f) return configs diff --git a/filepush/dab/src/utils/formatmanager.py b/filepush/dab/src/utils/formatmanager.py new file mode 100644 index 00000000..acf944e9 --- /dev/null +++ b/filepush/dab/src/utils/formatmanager.py @@ -0,0 +1,71 @@ +from dataclasses import dataclass + +@dataclass(frozen=True, slots=True) +class AutoLoaderOption: + key: str + value: str + hidden: bool = False + def __iter__(self): + yield (self.key, self) + +class AutoLoaderFormat: + def __init__(self): + self.name = None + self.options: set[AutoLoaderOption] = { + AutoLoaderOption("cloudFiles.inferColumnTypes", "true", True), + AutoLoaderOption("cloudFiles.schemaEvolutionMode", "addNewColumns", True), + } + + def __iter__(self): + yield (self.name, self) + + def get_userfacing_options(self) -> dict[str, str]: + return {opt.key: opt.value for opt in self.options if not opt.hidden} + + def validate_user_options(self, options: dict[str, str]) -> None: + allowed = set(self.get_userfacing_options()) + illegal = set(options) - allowed + if illegal: + raise ValueError( + f"Unsupported or protected options: {sorted(illegal)}. " + f"Allowed user options: {sorted(allowed)}" + ) + + def get_modified_options(self, options: dict[str, str]) -> dict[str, str]: + self.validate_user_options(options) + defaults = self.get_userfacing_options() + return {k: v for k, v in options.items() if k in defaults and v != defaults[k]} + +class CSV(AutoLoaderFormat): + def __init__(self): + super().__init__() + self.name = "CSV" + self.options |= { + AutoLoaderOption("header", "true", True), + AutoLoaderOption("mergeSchema", "true", True), + AutoLoaderOption("delimiter", ","), + AutoLoaderOption("escape", "\""), + AutoLoaderOption("multiLine", "false"), + } + +class JSON(AutoLoaderFormat): + def __init__(self): + super().__init__() + self.name = "JSON" + self.options |= { + AutoLoaderOption("mergeSchema", "true", True), + AutoLoaderOption("allowComments", "true"), + AutoLoaderOption("allowSingleQuotes", "true"), + AutoLoaderOption("inferTimestamp", "true"), + AutoLoaderOption("multiLine", "true"), + } + +_supported_formats: dict[str, AutoLoaderFormat] = {f.name: f for f in (CSV(), JSON())} + +def get_format_manager(fmt: str) -> dict[str, str]: + key = fmt.strip().upper() + try: + return _supported_formats[key] + except KeyError: + supported = ", ".join(sorted(_supported_formats)) + raise ValueError(f"{fmt!r} is not a supported format. 
Supported formats: {supported}") diff --git a/filepush/dab/src/utils/tablemanager.py b/filepush/dab/src/utils/tablemanager.py index 7cf85a14..6b5baa25 100644 --- a/filepush/dab/src/utils/tablemanager.py +++ b/filepush/dab/src/utils/tablemanager.py @@ -1,25 +1,21 @@ import os import json from . import envmanager +from . import formatmanager from pyspark.sql.streaming import DataStreamReader from databricks.sdk import WorkspaceClient from databricks.sdk.errors.platform import NotFound -def validate_config(config: dict): - pass - def get_table_volume_path(table_name: str) -> str: - # Initialize workspace client ws = WorkspaceClient() table_volume_path_data = os.path.join(envmanager.get_config()["volume_path_data"], table_name) try: ws.files.get_directory_metadata(table_volume_path_data) except NotFound: - assert False, f"Table data path not found for table `{table_name}`. Have you run `databricks bundle run configuration_job`?" + raise RuntimeError(f"Table data path not found for table `{table_name}`. Have you run `databricks bundle run configuration_job`?") return table_volume_path_data def has_data_file(table_name: str) -> bool: - # Initialize workspace client ws = WorkspaceClient() table_volume_path_data = get_table_volume_path(table_name) try: @@ -30,25 +26,29 @@ def has_data_file(table_name: str) -> bool: return True def is_table_created(table_name: str) -> bool: - # Initialize workspace client ws = WorkspaceClient() return ws.tables.exists(full_name=f"{envmanager.get_config()['catalog_name']}.{envmanager.get_config()['schema_name']}.{table_name}").table_exists def get_configs() -> list: json_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "configs", "tables.json") - assert os.path.exists(json_path), f"Missing table configs file: {json_path}. Please following README.md to create one, deploy and run configuration_job." + if not os.path.exists(json_path): + raise RuntimeError(f"Missing table configs file: {json_path}. 
Please following README.md to create one, deploy and run configuration_job.") with open(json_path, "r") as f: configs = json.load(f) return configs def apply_table_config(reader: DataStreamReader, table_config: dict) -> DataStreamReader: fmt = table_config.get("format") - assert fmt is not None, f"format is required for table {table_config.get('name')}" - reader = reader.option("cloudFiles.format", fmt) + if fmt is None: + raise ValueError(f"format is required for table {table_config.get('name')}") + + # format-specific options from user input + user_fmt_opts = table_config.get("format_options", {}) + # validate and get the final modified options + final_fmt_opts = formatmanager.get_format_manager(fmt).get_modified_options(user_fmt_opts) - # format-specific options - fmt_opts = table_config.get("format_options", {}) - for k, v in fmt_opts.items(): + reader = reader.format("cloudFiles").option("cloudFiles.format", fmt) + for k, v in final_fmt_opts.items(): reader = reader.option(k, v) # schema hints From 1caa333ef863a84d48d7bdf8c6f8480b2e8220c8 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Thu, 18 Sep 2025 11:38:51 -0700 Subject: [PATCH 40/60] Fix multi table bug --- filepush/dab/src/ingestion.py | 36 +++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/filepush/dab/src/ingestion.py b/filepush/dab/src/ingestion.py index bbc51abc..69f2975a 100644 --- a/filepush/dab/src/ingestion.py +++ b/filepush/dab/src/ingestion.py @@ -1,24 +1,28 @@ import dlt from utils import tablemanager +def _make_append_flow(table_name, table_config, table_volume_path): + def _body(): + reader = tablemanager.apply_table_config(spark.readStream, table_config) + if not tablemanager.has_data_file(table_name): + reader = reader.schema("_rescued_data STRING") + return reader.load(table_volume_path) + + # give the function a unique name (nice for logs / debug) + _body.__name__ = f"append_{table_name.lower()}" + + # apply the decorator programmatically + return dlt.append_flow(target=table_name, name=table_name)(_body) + table_configs = tablemanager.get_configs() -for table_config in table_configs: - table_name = table_config['name'] - table_volume_path = tablemanager.get_table_volume_path(table_name) +for cfg in table_configs: + tbl = cfg["name"] + path = tablemanager.get_table_volume_path(tbl) dlt.create_streaming_table( - name = table_name, - comment = "File push created table", - table_properties = {"filepush.table_volume_path_data": table_volume_path} - ) - @dlt.append_flow( - target = table_name, - name = table_name + name=tbl, + comment="File push created table", + table_properties={"filepush.table_volume_path_data": path}, ) - def append_to_table(): - reader = spark.readStream.format("cloudFiles") - reader = tablemanager.apply_table_config(reader, table_config) - if not tablemanager.has_data_file(table_name): - reader.schema("_rescued_data STRING") # Use _rescued_data as placeholder - return reader.load(table_volume_path) + _make_append_flow(tbl, cfg, path) From 3e906e6b9382cca1c779c9a285a15a95bf0db305 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Thu, 18 Sep 2025 16:16:39 -0700 Subject: [PATCH 41/60] Fix option merge issue --- filepush/dab/src/configs/tables.json | 13 +------------ filepush/dab/src/debug_table_config.py | 2 +- filepush/dab/src/utils/formatmanager.py | 8 ++++++++ filepush/dab/src/utils/tablemanager.py | 2 +- 4 files changed, 11 insertions(+), 14 deletions(-) diff --git a/filepush/dab/src/configs/tables.json b/filepush/dab/src/configs/tables.json 
index 8bb40fa4..98c4591f 100644 --- a/filepush/dab/src/configs/tables.json +++ b/filepush/dab/src/configs/tables.json @@ -1,17 +1,6 @@ [ { - "name": "dummy1", - "format": "csv" - }, - { - "name": "dummy2", - "format": "csv", - "format_options": { - "escape": "\"" - } - }, - { - "name": "dummy3", + "name": "employees", "format": "csv", "format_options": { "escape": "\"" diff --git a/filepush/dab/src/debug_table_config.py b/filepush/dab/src/debug_table_config.py index 537b17ff..a134b2af 100644 --- a/filepush/dab/src/debug_table_config.py +++ b/filepush/dab/src/debug_table_config.py @@ -20,7 +20,7 @@ table_config = r''' { - "name": "dummy1", + "name": "employees", "format": "csv", "format_options": { "escape": "\"" diff --git a/filepush/dab/src/utils/formatmanager.py b/filepush/dab/src/utils/formatmanager.py index acf944e9..302cb547 100644 --- a/filepush/dab/src/utils/formatmanager.py +++ b/filepush/dab/src/utils/formatmanager.py @@ -35,6 +35,14 @@ def get_modified_options(self, options: dict[str, str]) -> dict[str, str]: self.validate_user_options(options) defaults = self.get_userfacing_options() return {k: v for k, v in options.items() if k in defaults and v != defaults[k]} + + def get_merged_options(self, options: dict[str, str]) -> dict[str, str]: + self.validate_user_options(options) + defaults = self.get_userfacing_options() + + merged = defaults.copy() + merged.update({k: v for k, v in options.items() if k in defaults}) + return merged class CSV(AutoLoaderFormat): def __init__(self): diff --git a/filepush/dab/src/utils/tablemanager.py b/filepush/dab/src/utils/tablemanager.py index 6b5baa25..82106fea 100644 --- a/filepush/dab/src/utils/tablemanager.py +++ b/filepush/dab/src/utils/tablemanager.py @@ -45,7 +45,7 @@ def apply_table_config(reader: DataStreamReader, table_config: dict) -> DataStre # format-specific options from user input user_fmt_opts = table_config.get("format_options", {}) # validate and get the final modified options - final_fmt_opts = formatmanager.get_format_manager(fmt).get_modified_options(user_fmt_opts) + final_fmt_opts = formatmanager.get_format_manager(fmt).get_merged_options(user_fmt_opts) reader = reader.format("cloudFiles").option("cloudFiles.format", fmt) for k, v in final_fmt_opts.items(): From 1c5ba0f35ef79bfdc00e2726dc5ebe9d8afa4a7e Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Thu, 18 Sep 2025 16:28:14 -0700 Subject: [PATCH 42/60] Add corrupted record columm to CSV and JSON option --- filepush/dab/src/utils/formatmanager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/filepush/dab/src/utils/formatmanager.py b/filepush/dab/src/utils/formatmanager.py index 302cb547..29aa0a68 100644 --- a/filepush/dab/src/utils/formatmanager.py +++ b/filepush/dab/src/utils/formatmanager.py @@ -51,6 +51,7 @@ def __init__(self): self.options |= { AutoLoaderOption("header", "true", True), AutoLoaderOption("mergeSchema", "true", True), + AutoLoaderOption("columnNameOfCorruptRecord", "_corrupt_record", True), AutoLoaderOption("delimiter", ","), AutoLoaderOption("escape", "\""), AutoLoaderOption("multiLine", "false"), @@ -62,6 +63,7 @@ def __init__(self): self.name = "JSON" self.options |= { AutoLoaderOption("mergeSchema", "true", True), + AutoLoaderOption("columnNameOfCorruptRecord", "_corrupt_record", True), AutoLoaderOption("allowComments", "true"), AutoLoaderOption("allowSingleQuotes", "true"), AutoLoaderOption("inferTimestamp", "true"), From 11540ce9216fe5cb45e6866e77e36f14d58d93a2 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Fri, 19 Sep 2025 16:23:32 -0700 
Subject: [PATCH 43/60] Refactor and add expectation --- filepush/dab/src/debug_table_config.py | 29 ++++++------------- filepush/dab/src/ingestion.py | 5 ++++ filepush/dab/src/utils/formatmanager.py | 9 ++++++ filepush/dab/src/utils/tablemanager.py | 38 ++++++++++++++++--------- 4 files changed, 47 insertions(+), 34 deletions(-) diff --git a/filepush/dab/src/debug_table_config.py b/filepush/dab/src/debug_table_config.py index a134b2af..f0ff40bb 100644 --- a/filepush/dab/src/debug_table_config.py +++ b/filepush/dab/src/debug_table_config.py @@ -32,32 +32,19 @@ # COMMAND ---------- import json -import os -from utils import envmanager +import tempfile from utils import tablemanager -# Load configs from environment json -config = envmanager.get_config() -catalog_name = config["catalog_name"] -schema_name = config["schema_name"] - -# Load table configs +# Load table config table_config_json = json.loads(table_config) +tablemanager.validate_config(table_config_json) table_name = table_config_json["name"] -assert table_name, "Please provide a table name in the table_config json" -table_configs = tablemanager.get_configs() -matches = [table_config for table_config in table_configs if table_config.get("name") == table_name] -assert len(matches) == 1, f"Expect exactly 1 config for table `{table_name}`. Found {len(matches)}. Please fix the config file and run configuration_job" table_volume_path_data = tablemanager.get_table_volume_path(table_name) -assert tablemanager.has_data_file(table_name), f"No data file found in {table_volume_path_data}. Please upload at least 1 file to {table_volume_path_data}" - -print(f"Table Volume Path: {table_volume_path_data}") +table_reader = tablemanager.apply_table_config(spark.readStream, table_config_json) -# COMMAND ---------- - -import tempfile +assert tablemanager.has_data_file(table_name), f"No data file found in {table_volume_path_data}. 
Please upload at least 1 file to {table_volume_path_data}" +# Put schema location in temp directory with tempfile.TemporaryDirectory() as tmpdir: - reader = tablemanager.apply_table_config(spark.readStream, table_config_json) - reader.option("cloudFiles.schemaLocation", tmpdir) - display(reader.load(table_volume_path_data)) \ No newline at end of file + table_reader.option("cloudFiles.schemaLocation", tmpdir) + display(table_reader.load(table_volume_path_data)) \ No newline at end of file diff --git a/filepush/dab/src/ingestion.py b/filepush/dab/src/ingestion.py index 69f2975a..264484a2 100644 --- a/filepush/dab/src/ingestion.py +++ b/filepush/dab/src/ingestion.py @@ -1,9 +1,11 @@ import dlt from utils import tablemanager +from utils import formatmanager def _make_append_flow(table_name, table_config, table_volume_path): def _body(): reader = tablemanager.apply_table_config(spark.readStream, table_config) + # use _rescued_data as placeholder when no data file is present if not tablemanager.has_data_file(table_name): reader = reader.schema("_rescued_data STRING") return reader.load(table_volume_path) @@ -17,12 +19,15 @@ def _body(): table_configs = tablemanager.get_configs() for cfg in table_configs: + tablemanager.validate_config(cfg) tbl = cfg["name"] path = tablemanager.get_table_volume_path(tbl) + expts = formatmanager.get_format_manager(cfg["format"]).expectations dlt.create_streaming_table( name=tbl, comment="File push created table", table_properties={"filepush.table_volume_path_data": path}, ) + dlt.expect_all(expts) _make_append_flow(tbl, cfg, path) diff --git a/filepush/dab/src/utils/formatmanager.py b/filepush/dab/src/utils/formatmanager.py index 29aa0a68..ccc9d462 100644 --- a/filepush/dab/src/utils/formatmanager.py +++ b/filepush/dab/src/utils/formatmanager.py @@ -15,6 +15,9 @@ def __init__(self): AutoLoaderOption("cloudFiles.inferColumnTypes", "true", True), AutoLoaderOption("cloudFiles.schemaEvolutionMode", "addNewColumns", True), } + self.expectations: dict[str, str] = { + "Rescued data should be null": "_rescued_data IS NULL" + } def __iter__(self): yield (self.name, self) @@ -56,6 +59,9 @@ def __init__(self): AutoLoaderOption("escape", "\""), AutoLoaderOption("multiLine", "false"), } + self.expectations |= { + "Corrupted record should be null": "_corrupt_record IS NULL" + } class JSON(AutoLoaderFormat): def __init__(self): @@ -69,6 +75,9 @@ def __init__(self): AutoLoaderOption("inferTimestamp", "true"), AutoLoaderOption("multiLine", "true"), } + self.expectations |= { + "Corrupted record should be null": "_corrupt_record IS NULL" + } _supported_formats: dict[str, AutoLoaderFormat] = {f.name: f for f in (CSV(), JSON())} diff --git a/filepush/dab/src/utils/tablemanager.py b/filepush/dab/src/utils/tablemanager.py index 82106fea..6845e39a 100644 --- a/filepush/dab/src/utils/tablemanager.py +++ b/filepush/dab/src/utils/tablemanager.py @@ -6,6 +6,29 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.errors.platform import NotFound +def validate_config(table_config: dict): + if not table_config.get("name"): + raise ValueError("name is required for table config") + if not table_config.get("format"): + raise ValueError("format is required for table config") + +def validate_configs(table_configs: list): + names = [cfg.get("name") for cfg in table_configs] + duplicates = set([name for name in names if names.count(name) > 1 and name is not None]) + if duplicates: + raise ValueError(f"Duplicate table names found in table configs: {sorted(duplicates)}") + for table_config 
in table_configs: + validate_config(table_config) + +def get_configs() -> list: + json_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "configs", "tables.json") + if not os.path.exists(json_path): + raise RuntimeError(f"Missing table configs file: {json_path}. Please following README.md to create one, deploy and run configuration_job.") + with open(json_path, "r") as f: + configs = json.load(f) + validate_configs(configs) + return configs + def get_table_volume_path(table_name: str) -> str: ws = WorkspaceClient() table_volume_path_data = os.path.join(envmanager.get_config()["volume_path_data"], table_name) @@ -29,24 +52,13 @@ def is_table_created(table_name: str) -> bool: ws = WorkspaceClient() return ws.tables.exists(full_name=f"{envmanager.get_config()['catalog_name']}.{envmanager.get_config()['schema_name']}.{table_name}").table_exists -def get_configs() -> list: - json_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "configs", "tables.json") - if not os.path.exists(json_path): - raise RuntimeError(f"Missing table configs file: {json_path}. Please following README.md to create one, deploy and run configuration_job.") - with open(json_path, "r") as f: - configs = json.load(f) - return configs - def apply_table_config(reader: DataStreamReader, table_config: dict) -> DataStreamReader: + validate_config(table_config) fmt = table_config.get("format") - if fmt is None: - raise ValueError(f"format is required for table {table_config.get('name')}") - # format-specific options from user input + # format options user_fmt_opts = table_config.get("format_options", {}) - # validate and get the final modified options final_fmt_opts = formatmanager.get_format_manager(fmt).get_merged_options(user_fmt_opts) - reader = reader.format("cloudFiles").option("cloudFiles.format", fmt) for k, v in final_fmt_opts.items(): reader = reader.option(k, v) From b0d27c8e085c49c3f9e5254a0e201d82f84c2352 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Fri, 19 Sep 2025 16:43:28 -0700 Subject: [PATCH 44/60] Add warning for default storage --- filepush/dab/src/debug_table_config.py | 4 ++++ filepush/dab/src/utils/envmanager.py | 27 ++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/filepush/dab/src/debug_table_config.py b/filepush/dab/src/debug_table_config.py index f0ff40bb..31e64754 100644 --- a/filepush/dab/src/debug_table_config.py +++ b/filepush/dab/src/debug_table_config.py @@ -34,6 +34,10 @@ import json import tempfile from utils import tablemanager +from utils import envmanager + +if not envmanager.has_default_storage(): + print("WARNING: Current catalog is not using default storage, some file push feature may not be available") # Load table config table_config_json = json.loads(table_config) diff --git a/filepush/dab/src/utils/envmanager.py b/filepush/dab/src/utils/envmanager.py index 3f215c77..82c17f45 100644 --- a/filepush/dab/src/utils/envmanager.py +++ b/filepush/dab/src/utils/envmanager.py @@ -1,5 +1,6 @@ import os import json +from databricks.sdk import WorkspaceClient def get_config() -> dict: json_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "configs", "environment.json") @@ -8,3 +9,29 @@ def get_config() -> dict: with open(json_path, "r") as f: configs = json.load(f) return configs + +def has_default_storage() -> bool: + catalog = get_config()["catalog_name"] + + w = WorkspaceClient() + + # Try SDK model first + info = w.catalogs.get(catalog) + storage_root = getattr(info, "storage_root", None) + storage_location = getattr(info, 
"storage_location", None) + props = getattr(info, "properties", {}) or {} + + # Some workspaces expose fields only via raw JSON; fall back if all empty + if not (storage_root or storage_location or props): + j = w.api_client.do("GET", f"/api/2.1/unity-catalog/catalogs/{catalog}") + storage_root = j.get("storage_root") or j.get("storageLocation") + storage_location = j.get("storage_location") or j.get("storageLocation") + props = j.get("properties", {}) or {} + + # Heuristics: any of these indicates “default storage” is set + return bool( + storage_root or + storage_location or + props.get("defaultManagedLocation") or + props.get("delta.defaultLocation") + ) \ No newline at end of file From b0beb24988dd763e57604d40e9a537247bd1b253 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Fri, 19 Sep 2025 16:47:56 -0700 Subject: [PATCH 45/60] Clean up workspace --- filepush/.gitignore | 3 - filepush/create_filepush_schema.sh | 18 ---- filepush/drop_filepush_schema.sh | 22 ----- .../databricks_template_schema.json | 17 ---- .../filepush-template/library/variables.tmpl | 23 ------ .../template/__preamble.tmpl | 5 -- .../{{.schema_name}}/databricks.yml.tmpl | 28 ------- .../{{.schema_name}}/resources/job.yml.tmpl | 15 ---- .../resources/pipeline.yml.tmpl | 14 ---- .../resources/schema.yml.tmpl | 9 -- .../resources/volume.yml.tmpl | 8 -- .../src/pipelines/ingestion.py.tmpl | 82 ------------------- ...{{.schema_name}}_readfiles_kernel.sql.tmpl | 15 ---- .../{{.schema_name}}/tools/env.sh.tmpl | 42 ---------- .../get_volume_path_from_pipeline_config.sh | 7 -- .../get_volume_path_from_schema_dbproperty.sh | 7 -- .../get_volume_path_from_table_property.sh | 7 -- .../tools/open_all_resources.sh | 10 --- .../set_volume_path_to_schema_dbproperty.sh | 7 -- .../{{.schema_name}}/tools/trigger_refresh.sh | 7 -- .../tools/upload_to_volume.sh | 12 --- filepush/push_file_to_table.sh | 15 ---- 22 files changed, 373 deletions(-) delete mode 100755 filepush/create_filepush_schema.sh delete mode 100755 filepush/drop_filepush_schema.sh delete mode 100644 filepush/filepush-template/databricks_template_schema.json delete mode 100644 filepush/filepush-template/library/variables.tmpl delete mode 100644 filepush/filepush-template/template/__preamble.tmpl delete mode 100644 filepush/filepush-template/template/{{.schema_name}}/databricks.yml.tmpl delete mode 100644 filepush/filepush-template/template/{{.schema_name}}/resources/job.yml.tmpl delete mode 100644 filepush/filepush-template/template/{{.schema_name}}/resources/pipeline.yml.tmpl delete mode 100644 filepush/filepush-template/template/{{.schema_name}}/resources/schema.yml.tmpl delete mode 100644 filepush/filepush-template/template/{{.schema_name}}/resources/volume.yml.tmpl delete mode 100644 filepush/filepush-template/template/{{.schema_name}}/src/pipelines/ingestion.py.tmpl delete mode 100644 filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl delete mode 100644 filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl delete mode 100755 filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh delete mode 100755 filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh delete mode 100755 filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh delete mode 100755 filepush/filepush-template/template/{{.schema_name}}/tools/open_all_resources.sh delete mode 100755 
filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh delete mode 100755 filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh delete mode 100755 filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh delete mode 100755 filepush/push_file_to_table.sh diff --git a/filepush/.gitignore b/filepush/.gitignore index 0e53a123..722d5e71 100644 --- a/filepush/.gitignore +++ b/filepush/.gitignore @@ -1,4 +1 @@ .vscode -up.sh -down.sh -conf.json diff --git a/filepush/create_filepush_schema.sh b/filepush/create_filepush_schema.sh deleted file mode 100755 index 3815470d..00000000 --- a/filepush/create_filepush_schema.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env bash -usage() { - echo "Usage: $(basename $0) " -} -if [ -z "$1" ] || [ -z "$2" ]; then - usage - exit 1 -fi -if ! databricks catalogs get "$1" >/dev/null 2>&1; then - echo "Catalog \`$1\` not found (or no permission)" - exit 1 -fi -databricks bundle init filepush-template --config-file <(echo "{\"catalog_name\": \"$1\", \"schema_name\": \"$2\"}") -working_dir=$(pwd) -schema_name=$2 -cd $schema_name -databricks bundle deploy --force-lock --auto-approve -t prod -cd $working_dir \ No newline at end of file diff --git a/filepush/drop_filepush_schema.sh b/filepush/drop_filepush_schema.sh deleted file mode 100755 index adbd2521..00000000 --- a/filepush/drop_filepush_schema.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash -usage() { - echo "Usage: $(basename $0) " -} -if [ -z "$1" ] || [ -z "$2" ]; then - usage - exit 1 -fi -if ! databricks catalogs get "$1" >/dev/null 2>&1; then - echo "Catalog \`$1\` not found (or no permission)" - exit 1 -fi -volume_path=$(databricks schemas get $1.$2 --output json | jq -r '.properties["filepush.volume_path"]') -if [ -z "$volume_path" ] || [ "$volume_path" == "null" ]; then - echo "Schema \`$1.$2\` is not a filepush schema. Did you run create_filepush_schema.sh to create it?" - exit 1 -fi -working_dir=$(pwd) -schema_name=$2 -cd $schema_name -databricks bundle destroy --force-lock -t prod -cd $working_dir \ No newline at end of file diff --git a/filepush/filepush-template/databricks_template_schema.json b/filepush/filepush-template/databricks_template_schema.json deleted file mode 100644 index f150630b..00000000 --- a/filepush/filepush-template/databricks_template_schema.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "properties": { - "catalog_name": { - "type": "string", - "default": "{{default_catalog}}", - "description": "Name of the catalog where tables and pipelines will be created.", - "order": 1 - }, - "schema_name": { - "type": "string", - "default": "default", - "description": "Name of the schema where tables and pipelines will be created.", - "order": 2 - } - }, - "success_message": "\nYour file push bundle under catalog and schema {{.catalog_name}}.{{.schema_name}} has been created." 
-} diff --git a/filepush/filepush-template/library/variables.tmpl b/filepush/filepush-template/library/variables.tmpl deleted file mode 100644 index 34b8f79e..00000000 --- a/filepush/filepush-template/library/variables.tmpl +++ /dev/null @@ -1,23 +0,0 @@ -{{ define `volume_path` -}} - /Volumes/{{.catalog_name}}/{{.schema_name}}/{{.schema_name}}_volume/ -{{- end }} - -{{ define `volume_data_path` -}} - dbfs:{{template `volume_path` .}}data/ -{{- end }} - -{{ define `volume_baddata_path` -}} - dbfs:{{template `volume_path` .}}baddata/ -{{- end }} - -{{ define `volume_archive_path` -}} - dbfs:{{template `volume_path` .}}archive/ -{{- end }} - -{{ define `volume_path_url` -}} - dbfs:{{template `volume_path` .}} -{{- end }} - -{{ define `raw_table_name_format` -}} - {{.connector_name}}_raw -{{- end}} diff --git a/filepush/filepush-template/template/__preamble.tmpl b/filepush/filepush-template/template/__preamble.tmpl deleted file mode 100644 index b538c75a..00000000 --- a/filepush/filepush-template/template/__preamble.tmpl +++ /dev/null @@ -1,5 +0,0 @@ -# Preamble - -This file only template directives; it is skipped for the actual output. - -{{skip "__preamble"}} diff --git a/filepush/filepush-template/template/{{.schema_name}}/databricks.yml.tmpl b/filepush/filepush-template/template/{{.schema_name}}/databricks.yml.tmpl deleted file mode 100644 index d32c1802..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/databricks.yml.tmpl +++ /dev/null @@ -1,28 +0,0 @@ -# databricks.yml -# This is the configuration for the file push DAB {{.schema_name}}. - -bundle: - name: {{.schema_name}} - -include: - - resources/*.yml - -experimental: - skip_name_prefix_for_schema: true - -targets: - # The deployment targets. See https://docs.databricks.com/en/dev-tools/bundles/deployment-modes.html - dev: - mode: development - default: true - workspace: - host: {{workspace_host}} - - prod: - mode: production - workspace: - host: {{workspace_host}} - root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target} - permissions: - - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}} - level: CAN_MANAGE diff --git a/filepush/filepush-template/template/{{.schema_name}}/resources/job.yml.tmpl b/filepush/filepush-template/template/{{.schema_name}}/resources/job.yml.tmpl deleted file mode 100644 index 97d09b12..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/resources/job.yml.tmpl +++ /dev/null @@ -1,15 +0,0 @@ -# The main job for schema {{.schema_name}} -# This job will trigger in the schema pipeline - -resources: - jobs: - {{.schema_name}}_job: - name: {{.schema_name}}_job - tasks: - - task_key: {{.schema_name}}_pipeline_refresh - pipeline_task: - pipeline_id: ${resources.pipelines.{{.schema_name}}_pipeline.id} - trigger: - file_arrival: - url: /Volumes/{{.catalog_name}}/${resources.schemas.{{.schema_name}}.name}/{{.schema_name}}_volume/ - pause_status: UNPAUSED diff --git a/filepush/filepush-template/template/{{.schema_name}}/resources/pipeline.yml.tmpl b/filepush/filepush-template/template/{{.schema_name}}/resources/pipeline.yml.tmpl deleted file mode 100644 index 21b124be..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/resources/pipeline.yml.tmpl +++ /dev/null @@ -1,14 +0,0 @@ -# The table refresh pipeline for schema {{.schema_name}} - -resources: - pipelines: - {{.schema_name}}_pipeline: - name: {{.schema_name}}_pipeline - catalog: {{.catalog_name}} - schema: {{.schema_name}} - serverless: true - libraries: - 
- file: - path: ../src/pipelines/ingestion.py - configuration: - filepush.volume_path: /Volumes/{{.catalog_name}}/${resources.schemas.{{.schema_name}}.name}/{{.schema_name}}_volume/ diff --git a/filepush/filepush-template/template/{{.schema_name}}/resources/schema.yml.tmpl b/filepush/filepush-template/template/{{.schema_name}}/resources/schema.yml.tmpl deleted file mode 100644 index 032b7b9d..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/resources/schema.yml.tmpl +++ /dev/null @@ -1,9 +0,0 @@ -# The schema {{.schema_name}} - -resources: - schemas: - {{.schema_name}}: - name: {{.schema_name}} - catalog_name: {{.catalog_name}} - properties: - filepush.volume_path: /Volumes/{{.catalog_name}}/{{.schema_name}}/{{.schema_name}}_volume/ \ No newline at end of file diff --git a/filepush/filepush-template/template/{{.schema_name}}/resources/volume.yml.tmpl b/filepush/filepush-template/template/{{.schema_name}}/resources/volume.yml.tmpl deleted file mode 100644 index 95904249..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/resources/volume.yml.tmpl +++ /dev/null @@ -1,8 +0,0 @@ -# The file staging volume for schema {{.schema_name}} - -resources: - volumes: - {{.schema_name}}_volume: - name: {{.schema_name}}_volume - catalog_name: {{.catalog_name}} - schema_name: ${resources.schemas.{{.schema_name}}.name} diff --git a/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/ingestion.py.tmpl b/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/ingestion.py.tmpl deleted file mode 100644 index 9c837652..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/ingestion.py.tmpl +++ /dev/null @@ -1,82 +0,0 @@ -import dlt -from dbruntime.dbutils import FileInfo -import re -import os - -# Dynamic Tables -def sanitize_table_name(name: str) -> str: - """ - Sanitize a table name. - - Lowercase - - Replace non [a-z0-9_] with underscores - - Ensure it doesn't start with a digit - """ - n = name.strip().lower() - n = re.sub(r"[^a-z0-9_]", "_", n) - if re.match(r"^[0-9]", n): - n = f"t_{n}" - n = re.sub(r"_+", "_", n).strip("_") - return n or "t_unnamed" - -def is_valid_table_name(name: str) -> bool: - """ - Validate a table name. - - Must be alphanumeric - - Must not start with a digit - - Must not contain any special characters - """ - pat = re.compile(r'^[A-Za-z0-9_]+$') - return pat.match(name) is not None - -def dbfs_is_dir(f: FileInfo): - is_dir_attr = getattr(f, "isDir", None) - return is_dir_attr() if callable(is_dir_attr) else f.name.endswith("/") - -def list_immediate_subdirs(path: str): - items = dbutils.fs.ls(path) - out = [] - for f in items: - if dbfs_is_dir(f): - # f.name often ends with '/', drop it for a clean folder name - clean_name = f.name[:-1] if f.name.endswith("/") else f.name - if is_valid_table_name(clean_name): - out.append((clean_name, f.path.removeprefix('dbfs:'))) - else: - print(f"Skipping invalid table name: {clean_name}. It must be alphanumeric connected by underscores and not start with a digit.") - return out - -def make_dlt_table(subdir_name: str, subdir_path: str): - """ - Defines a DLT table for a given subfolder at import time. - If table does not exist, it will create a read_files kernel and use that to create the table. 
- """ - table_name = sanitize_table_name(subdir_name) - kernel_file_name = f"./{{.schema_name}}_{table_name}_readfiles_kernel.sql" - - if not os.path.exists(kernel_file_name): - print(f"Initialize table {table_name}") - with open(f"./{{.schema_name}}_readfiles_kernel.sql", "r") as f: - kernel_query_fmt = f.read() - with open(kernel_file_name, "w") as f: - table_kernel_query = kernel_query_fmt % subdir_path - f.write(table_kernel_query) - - if len(dbutils.fs.ls(subdir_path)) > 0: - @dlt.table( - name=table_name, - comment=f"Auto-created from subfolder: {subdir_path} (streaming via Auto Loader)", - table_properties={ - "filepush.volume_path": f"{subdir_path}" - } - ) - def _auto_loader_table(): - with open(kernel_file_name, "r") as f: - table_kernel_query = f.read() - print(table_kernel_query.replace("read_files(", "STREAM read_files(")) - return spark.sql(table_kernel_query.replace("read_files(", "STREAM read_files(")) - else: - print(f"Waiting for files to land in {subdir_path}") - -volume_path_root = spark.conf.get("filepush.volume_path") -for subdir_name, subdir_path in list_immediate_subdirs(volume_path_root): - make_dlt_table(subdir_name, subdir_path) \ No newline at end of file diff --git a/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl b/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl deleted file mode 100644 index 55666737..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/src/pipelines/{{.schema_name}}_readfiles_kernel.sql.tmpl +++ /dev/null @@ -1,15 +0,0 @@ --- Kernel template for read_files -SELECT - * -FROM - read_files( - '%s', - ignoreCorruptFiles => 'true', -- If a different malformed file pushed and RocksDB has it enlisted, this will ignore the error when the file is deleted. - ignoreMissingFiles => 'true' -- If a different file format is accidentally pushed and RocksDB has it enlisted, this will ignore the error when the file is deleted. - -- Do not change anything above - -- Add any additional options below - -- Example: - -- , - -- header => 'true', - -- escape => '"' - ) diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl b/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl deleted file mode 100644 index a5a64d7c..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/env.sh.tmpl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env bash -# This file is used to set the environment variables for the filepush bundle. -# It is sourced by the other scripts in the tools directory. -# This should be deployed **after** the bundle is deployed. - -# Prevent running directly; this file must be *sourced* -(return 0 2>/dev/null) || { echo "Source this file: . 
$(basename "$0")"; exit 1; } - -# Idempotent guard -# Check if the environment is already set and non-empty -if [[ -n "${_FILEPUSH_ENV_LOADED:-}" ]]; then - return 0 -fi -export _FILEPUSH_ENV_LOADED=1 - -# Sets the target for the bundle -ARG_TARGET="dev" -ARG_POSITIONAL=() - -while [[ $# -gt 0 ]]; do -case "$1" in - --target) [[ $# -ge 2 ]] || { echo "Error: --target needs a value"; return 2; } - ARG_TARGET="$2"; shift 2 ;; - --target=*) ARG_TARGET="${1#*=}"; shift ;; - -t) [[ $# -ge 2 ]] || { echo "Error: -t needs a value"; return 2; } - ARG_TARGET="$2"; shift 2 ;; - --) shift; ARG_POSITIONAL+=("$@"); break ;; - -h|--help) usage; return 1 ;; - -*) echo "Unknown option: $1"; usage; return 2 ;; - *) ARG_POSITIONAL+=("$1"); shift ;; -esac -done - -export BUNDLE_TARGET=$ARG_TARGET - -summary=$(databricks bundle summary -t $BUNDLE_TARGET --output json) -export FILEPUSH_BUNDLE_NAME={{.schema_name}} -export FILEPUSH_CATALOG_NAME={{.catalog_name}} -export FILEPUSH_SCHEMA_NAME=$(echo $summary | jq -r '.resources.schemas.{{.schema_name}}.name') -export FILEPUSH_VOLUME_PATH=/Volumes/{{.catalog_name}}/${FILEPUSH_SCHEMA_NAME}/{{.schema_name}}_volume/ -export FILEPUSH_PIPELINE_ID=$(echo $summary | jq -r '.resources.pipelines.{{.schema_name}}_pipeline.id') -export FILEPUSH_JOB_NAME={{.schema_name}}_job diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh deleted file mode 100755 index 4fef63c6..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_pipeline_config.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash -usage() { - echo "Usage: $(basename $0) [--target=dev|prod]" -} -export -f usage -. $(dirname $0)/env.sh $@ -databricks pipelines get $FILEPUSH_PIPELINE_ID -t $BUNDLE_TARGET --output json | jq '.spec.configuration["filepush.volume_path"]' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh deleted file mode 100755 index 14da35c1..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_schema_dbproperty.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash -usage() { - echo "Usage: $(basename $0) [--target=dev|prod]" -} -export -f usage -. $(dirname $0)/env.sh $@ -databricks schemas get ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME} -t $BUNDLE_TARGET --output json | jq '.properties["filepush.volume_path"]' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh deleted file mode 100755 index fb93f468..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/get_volume_path_from_table_property.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash -usage() { - echo "Usage: $(basename $0) [--target=dev|prod]" -} -export -f usage -. 
$(dirname $0)/env.sh $@ -databricks tables get ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME}.$1 -t $BUNDLE_TARGET --output json | jq '.properties["filepush.volume_path"]' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/open_all_resources.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/open_all_resources.sh deleted file mode 100755 index 6abf8173..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/open_all_resources.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash -usage() { - echo "Usage: $(basename $0) [--target=dev|prod]" -} -export -f usage -. $(dirname $0)/env.sh $@ -databricks bundle open ${FILEPUSH_BUNDLE_NAME} -t $BUNDLE_TARGET -databricks bundle open ${FILEPUSH_BUNDLE_NAME}_job -t $BUNDLE_TARGET -databricks bundle open ${FILEPUSH_BUNDLE_NAME}_pipeline -t $BUNDLE_TARGET -databricks bundle open ${FILEPUSH_BUNDLE_NAME}_volume -t $BUNDLE_TARGET \ No newline at end of file diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh deleted file mode 100755 index 3169f1c0..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/set_volume_path_to_schema_dbproperty.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash -usage() { - echo "Usage: $(basename $0) [--target=dev|prod]" -} -export -f usage -. $(dirname $0)/env.sh $@ -databricks schemas update ${FILEPUSH_CATALOG_NAME}.${FILEPUSH_SCHEMA_NAME} -t $BUNDLE_TARGET --json '{ "properties": { "filepush.volume_path": "'${FILEPUSH_VOLUME_PATH}'" } }' diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh deleted file mode 100755 index f85c34e1..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/trigger_refresh.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash -usage() { - echo "Usage: $(basename $0) [--target=dev|prod]" -} -export -f usage -. $(dirname $0)/env.sh $@ -databricks bundle run $FILEPUSH_JOB_NAME -t $BUNDLE_TARGET diff --git a/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh b/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh deleted file mode 100755 index 31e5fbde..00000000 --- a/filepush/filepush-template/template/{{.schema_name}}/tools/upload_to_volume.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash -usage() { - echo "Usage: $(basename $0) [--target=dev|prod]" -} -export -f usage -. $(dirname $0)/env.sh $@ -if [ -z "$1" ] || [ -z "$2" ]; then - usage - exit 1 -fi -databricks fs mkdir dbfs:${FILEPUSH_VOLUME_PATH}$1/ -t $BUNDLE_TARGET -databricks fs cp $2 dbfs:${FILEPUSH_VOLUME_PATH}$1/ -t $BUNDLE_TARGET diff --git a/filepush/push_file_to_table.sh b/filepush/push_file_to_table.sh deleted file mode 100755 index e652b6b8..00000000 --- a/filepush/push_file_to_table.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash -usage() { - echo "Usage: $(basename $0) " -} -if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ] || [ -z "$4" ]; then - usage - exit 1 -fi -volume_path=$(databricks schemas get $1.$2 --output json | jq -r '.properties["filepush.volume_path"]') -if [ -z "$volume_path" ] || [ "$volume_path" == "null" ]; then - echo "Schema \`$1.$2\` is not a filepush schema. Did you run create_filepush_schema.sh to create it?" 
- exit 1 -fi -databricks fs mkdir dbfs:${volume_path}$3/ -databricks fs cp $4 dbfs:${volume_path}$3/ \ No newline at end of file From 36bad25fb30b294ac830ac3725f0b17f4521fffd Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Mon, 22 Sep 2025 11:07:15 -0700 Subject: [PATCH 46/60] Create cleansource move folder --- filepush/dab/src/utils/initialization.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/filepush/dab/src/utils/initialization.py b/filepush/dab/src/utils/initialization.py index 09f79947..211d26d8 100644 --- a/filepush/dab/src/utils/initialization.py +++ b/filepush/dab/src/utils/initialization.py @@ -15,6 +15,7 @@ schema_name = args.schema_name volume_path_root = args.volume_path_root volume_path_data = args.volume_path_root + "/data" +volume_path_archive = args.volume_path_root + "/archive" logging_level = logging.DEBUG if args.logging_level == "dev" else logging.INFO # Logging @@ -33,7 +34,8 @@ logger.debug(f"Volume path data: {volume_path_data}") ws.schemas.update(full_name=f"{catalog_name}.{schema_name}", properties={ "filepush.volume_path_root": volume_path_root, - "filepush.volume_path_data": volume_path_data + "filepush.volume_path_data": volume_path_data, + "filepush.volume_path_archive": volume_path_archive }) logger.info(f"Schema {catalog_name}.{schema_name} configured") @@ -41,11 +43,16 @@ logger.info(f"Initializing volume folder structure {volume_path_root}") logger.debug(f"Creating data directory {volume_path_data}") ws.files.create_directory(volume_path_data) +logger.debug(f"Creating archive directory {volume_path_archive}") +ws.files.create_directory(volume_path_archive) with open("../configs/tables.json", "r") as f: for table in json.load(f): table_volume_path_data = f"{volume_path_data}/{table['name']}" logger.debug(f"Creating table directory {table_volume_path_data}") ws.files.create_directory(table_volume_path_data) + table_volume_path_archive = f"{volume_path_archive}/{table['name']}" + logger.debug(f"Creating table archive directory {table_volume_path_archive}") + ws.files.create_directory(table_volume_path_archive) logger.info(f"Volume {volume_path_root} configured") # Dump configs to environment json @@ -54,5 +61,6 @@ "catalog_name": catalog_name, "schema_name": schema_name, "volume_path_root": volume_path_root, - "volume_path_data": volume_path_data + "volume_path_data": volume_path_data, + "volume_path_archive": volume_path_archive }, f) From b88869be2c776b25b391e58176780a2e9371dc86 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Mon, 22 Sep 2025 11:28:50 -0700 Subject: [PATCH 47/60] Include cleansource move destination --- filepush/dab/src/utils/formatmanager.py | 12 +++++++++++- filepush/dab/src/utils/tablemanager.py | 3 ++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/filepush/dab/src/utils/formatmanager.py b/filepush/dab/src/utils/formatmanager.py index ccc9d462..ce496121 100644 --- a/filepush/dab/src/utils/formatmanager.py +++ b/filepush/dab/src/utils/formatmanager.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +from . 
import envmanager @dataclass(frozen=True, slots=True) class AutoLoaderOption: @@ -14,6 +15,9 @@ def __init__(self): self.options: set[AutoLoaderOption] = { AutoLoaderOption("cloudFiles.inferColumnTypes", "true", True), AutoLoaderOption("cloudFiles.schemaEvolutionMode", "addNewColumns", True), + AutoLoaderOption("cloudFiles.cleanSource", "MOVE", True), + AutoLoaderOption("cloudFiles.cleanSource.retentionDuration", "14 days", True), + AutoLoaderOption("cloudFiles.cleanSource.moveDestination", f"{envmanager.get_config()['volume_path_archive']}/{{table_name}}", True) } self.expectations: dict[str, str] = { "Rescued data should be null": "_rescued_data IS NULL" @@ -39,12 +43,18 @@ def get_modified_options(self, options: dict[str, str]) -> dict[str, str]: defaults = self.get_userfacing_options() return {k: v for k, v in options.items() if k in defaults and v != defaults[k]} - def get_merged_options(self, options: dict[str, str]) -> dict[str, str]: + def get_merged_options(self, options: dict[str, str], table_name: str) -> dict[str, str]: self.validate_user_options(options) defaults = self.get_userfacing_options() merged = defaults.copy() merged.update({k: v for k, v in options.items() if k in defaults}) + + # Format the moveDestination if table_name is supplied + move_dest_key = "cloudFiles.cleanSource.moveDestination" + if table_name is not None and move_dest_key in merged: + merged[move_dest_key] = merged[move_dest_key].format(table_name=table_name) + return merged class CSV(AutoLoaderFormat): diff --git a/filepush/dab/src/utils/tablemanager.py b/filepush/dab/src/utils/tablemanager.py index 6845e39a..4d4095bd 100644 --- a/filepush/dab/src/utils/tablemanager.py +++ b/filepush/dab/src/utils/tablemanager.py @@ -54,11 +54,12 @@ def is_table_created(table_name: str) -> bool: def apply_table_config(reader: DataStreamReader, table_config: dict) -> DataStreamReader: validate_config(table_config) + name = table_config.get("name") fmt = table_config.get("format") # format options user_fmt_opts = table_config.get("format_options", {}) - final_fmt_opts = formatmanager.get_format_manager(fmt).get_merged_options(user_fmt_opts) + final_fmt_opts = formatmanager.get_format_manager(fmt).get_merged_options(user_fmt_opts, name) reader = reader.format("cloudFiles").option("cloudFiles.format", fmt) for k, v in final_fmt_opts.items(): reader = reader.option(k, v) From 7a89c08e9b3531123e7afb53fa86e51a13b51596 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Mon, 22 Sep 2025 15:42:59 -0700 Subject: [PATCH 48/60] Fix issue on _corrupted_record column --- filepush/dab/src/ingestion.py | 13 +++++++------ filepush/dab/src/utils/envmanager.py | 3 ++- filepush/dab/src/utils/formatmanager.py | 5 +++++ filepush/dab/src/utils/tablemanager.py | 7 +++++-- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/filepush/dab/src/ingestion.py b/filepush/dab/src/ingestion.py index 264484a2..d6c3b996 100644 --- a/filepush/dab/src/ingestion.py +++ b/filepush/dab/src/ingestion.py @@ -2,19 +2,19 @@ from utils import tablemanager from utils import formatmanager -def _make_append_flow(table_name, table_config, table_volume_path): +def _make_append_flow(table_name, table_config, table_volume_path, format_mgr): def _body(): reader = tablemanager.apply_table_config(spark.readStream, table_config) # use _rescued_data as placeholder when no data file is present if not tablemanager.has_data_file(table_name): - reader = reader.schema("_rescued_data STRING") + reader = reader.schema(",".join(format_mgr.default_schema)) return 
reader.load(table_volume_path) # give the function a unique name (nice for logs / debug) _body.__name__ = f"append_{table_name.lower()}" # apply the decorator programmatically - return dlt.append_flow(target=table_name, name=table_name)(_body) + dlt.append_flow(target=table_name, name=table_name)(_body) table_configs = tablemanager.get_configs() @@ -22,12 +22,13 @@ def _body(): tablemanager.validate_config(cfg) tbl = cfg["name"] path = tablemanager.get_table_volume_path(tbl) - expts = formatmanager.get_format_manager(cfg["format"]).expectations + fmt = formatmanager.get_format_manager(cfg["format"]) + expts = fmt.expectations dlt.create_streaming_table( name=tbl, comment="File push created table", table_properties={"filepush.table_volume_path_data": path}, + expect_all=expts ) - dlt.expect_all(expts) - _make_append_flow(tbl, cfg, path) + _make_append_flow(tbl, cfg, path, fmt) diff --git a/filepush/dab/src/utils/envmanager.py b/filepush/dab/src/utils/envmanager.py index 82c17f45..ba822a98 100644 --- a/filepush/dab/src/utils/envmanager.py +++ b/filepush/dab/src/utils/envmanager.py @@ -34,4 +34,5 @@ def has_default_storage() -> bool: storage_location or props.get("defaultManagedLocation") or props.get("delta.defaultLocation") - ) \ No newline at end of file + ) + \ No newline at end of file diff --git a/filepush/dab/src/utils/formatmanager.py b/filepush/dab/src/utils/formatmanager.py index ce496121..5f61ca7c 100644 --- a/filepush/dab/src/utils/formatmanager.py +++ b/filepush/dab/src/utils/formatmanager.py @@ -22,6 +22,7 @@ def __init__(self): self.expectations: dict[str, str] = { "Rescued data should be null": "_rescued_data IS NULL" } + self.default_schema: set[str] = {"_rescued_data STRING"} def __iter__(self): yield (self.name, self) @@ -64,6 +65,7 @@ def __init__(self): self.options |= { AutoLoaderOption("header", "true", True), AutoLoaderOption("mergeSchema", "true", True), + AutoLoaderOption("mode", "PERMISSIVE", True), AutoLoaderOption("columnNameOfCorruptRecord", "_corrupt_record", True), AutoLoaderOption("delimiter", ","), AutoLoaderOption("escape", "\""), @@ -72,6 +74,7 @@ def __init__(self): self.expectations |= { "Corrupted record should be null": "_corrupt_record IS NULL" } + self.default_schema |= {"_corrupt_record STRING"} class JSON(AutoLoaderFormat): def __init__(self): @@ -79,6 +82,7 @@ def __init__(self): self.name = "JSON" self.options |= { AutoLoaderOption("mergeSchema", "true", True), + AutoLoaderOption("mode", "PERMISSIVE", True), AutoLoaderOption("columnNameOfCorruptRecord", "_corrupt_record", True), AutoLoaderOption("allowComments", "true"), AutoLoaderOption("allowSingleQuotes", "true"), @@ -88,6 +92,7 @@ def __init__(self): self.expectations |= { "Corrupted record should be null": "_corrupt_record IS NULL" } + self.default_schema |= {"_corrupt_record STRING"} _supported_formats: dict[str, AutoLoaderFormat] = {f.name: f for f in (CSV(), JSON())} diff --git a/filepush/dab/src/utils/tablemanager.py b/filepush/dab/src/utils/tablemanager.py index 4d4095bd..83c8a97c 100644 --- a/filepush/dab/src/utils/tablemanager.py +++ b/filepush/dab/src/utils/tablemanager.py @@ -56,10 +56,11 @@ def apply_table_config(reader: DataStreamReader, table_config: dict) -> DataStre validate_config(table_config) name = table_config.get("name") fmt = table_config.get("format") + fmt_mgr = formatmanager.get_format_manager(fmt) # format options user_fmt_opts = table_config.get("format_options", {}) - final_fmt_opts = formatmanager.get_format_manager(fmt).get_merged_options(user_fmt_opts, name) + 
final_fmt_opts = fmt_mgr.get_merged_options(user_fmt_opts, name) reader = reader.format("cloudFiles").option("cloudFiles.format", fmt) for k, v in final_fmt_opts.items(): reader = reader.option(k, v) @@ -67,7 +68,9 @@ def apply_table_config(reader: DataStreamReader, table_config: dict) -> DataStre # schema hints schema_hints = table_config.get("schema_hints") if schema_hints: - reader = reader.option("cloudFiles.schemaHints", schema_hints) + reader = reader.option("cloudFiles.schemaHints", ",".join({schema_hints} | fmt_mgr.default_schema)) + else: + reader = reader.option("cloudFiles.schemaHints", ",".join(fmt_mgr.default_schema)) return reader \ No newline at end of file From c26bf421a8f54b12a3cf0d8142c80caa2cd1408f Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Mon, 22 Sep 2025 17:28:56 -0700 Subject: [PATCH 49/60] Tidy up --- filepush/dab/src/debug_table_config.py | 4 +-- filepush/dab/src/ingestion.py | 10 +++---- filepush/dab/src/utils/formatmanager.py | 8 +++--- filepush/dab/src/utils/tablemanager.py | 35 ++++++++++++++++++++----- 4 files changed, 38 insertions(+), 19 deletions(-) diff --git a/filepush/dab/src/debug_table_config.py b/filepush/dab/src/debug_table_config.py index 31e64754..22928500 100644 --- a/filepush/dab/src/debug_table_config.py +++ b/filepush/dab/src/debug_table_config.py @@ -44,11 +44,9 @@ tablemanager.validate_config(table_config_json) table_name = table_config_json["name"] table_volume_path_data = tablemanager.get_table_volume_path(table_name) -table_reader = tablemanager.apply_table_config(spark.readStream, table_config_json) assert tablemanager.has_data_file(table_name), f"No data file found in {table_volume_path_data}. Please upload at least 1 file to {table_volume_path_data}" # Put schema location in temp directory with tempfile.TemporaryDirectory() as tmpdir: - table_reader.option("cloudFiles.schemaLocation", tmpdir) - display(table_reader.load(table_volume_path_data)) \ No newline at end of file + display(tablemanager.get_df_with_config(spark, table_config_json, tmpdir)) \ No newline at end of file diff --git a/filepush/dab/src/ingestion.py b/filepush/dab/src/ingestion.py index d6c3b996..1046a140 100644 --- a/filepush/dab/src/ingestion.py +++ b/filepush/dab/src/ingestion.py @@ -2,13 +2,13 @@ from utils import tablemanager from utils import formatmanager -def _make_append_flow(table_name, table_config, table_volume_path, format_mgr): +def _make_append_flow(table_name, table_config, table_volume_path): def _body(): - reader = tablemanager.apply_table_config(spark.readStream, table_config) # use _rescued_data as placeholder when no data file is present if not tablemanager.has_data_file(table_name): - reader = reader.schema(",".join(format_mgr.default_schema)) - return reader.load(table_volume_path) + return tablemanager.get_placeholder_df_with_config(spark, table_config) + else: + return tablemanager.get_df_with_config(spark, table_config) # give the function a unique name (nice for logs / debug) _body.__name__ = f"append_{table_name.lower()}" @@ -31,4 +31,4 @@ def _body(): table_properties={"filepush.table_volume_path_data": path}, expect_all=expts ) - _make_append_flow(tbl, cfg, path, fmt) + _make_append_flow(tbl, cfg, path) diff --git a/filepush/dab/src/utils/formatmanager.py b/filepush/dab/src/utils/formatmanager.py index 5f61ca7c..663b897a 100644 --- a/filepush/dab/src/utils/formatmanager.py +++ b/filepush/dab/src/utils/formatmanager.py @@ -24,8 +24,8 @@ def __init__(self): } self.default_schema: set[str] = {"_rescued_data STRING"} - def 
__iter__(self): - yield (self.name, self) + def get_default_schema(self) -> str: + return ", ".join(self.default_schema) def get_userfacing_options(self) -> dict[str, str]: return {opt.key: opt.value for opt in self.options if not opt.hidden} @@ -51,9 +51,9 @@ def get_merged_options(self, options: dict[str, str], table_name: str) -> dict[s merged = defaults.copy() merged.update({k: v for k, v in options.items() if k in defaults}) - # Format the moveDestination if table_name is supplied + # Format the moveDestination with table_name move_dest_key = "cloudFiles.cleanSource.moveDestination" - if table_name is not None and move_dest_key in merged: + if move_dest_key in merged: merged[move_dest_key] = merged[move_dest_key].format(table_name=table_name) return merged diff --git a/filepush/dab/src/utils/tablemanager.py b/filepush/dab/src/utils/tablemanager.py index 83c8a97c..33376ded 100644 --- a/filepush/dab/src/utils/tablemanager.py +++ b/filepush/dab/src/utils/tablemanager.py @@ -3,6 +3,7 @@ from . import envmanager from . import formatmanager from pyspark.sql.streaming import DataStreamReader +from pyspark.sql import DataFrame, SparkSession from databricks.sdk import WorkspaceClient from databricks.sdk.errors.platform import NotFound @@ -52,25 +53,45 @@ def is_table_created(table_name: str) -> bool: ws = WorkspaceClient() return ws.tables.exists(full_name=f"{envmanager.get_config()['catalog_name']}.{envmanager.get_config()['schema_name']}.{table_name}").table_exists -def apply_table_config(reader: DataStreamReader, table_config: dict) -> DataStreamReader: - validate_config(table_config) +def _apply_table_options(reader: DataStreamReader, table_config: dict, fmt_mgr) -> DataStreamReader: name = table_config.get("name") fmt = table_config.get("format") - fmt_mgr = formatmanager.get_format_manager(fmt) # format options user_fmt_opts = table_config.get("format_options", {}) final_fmt_opts = fmt_mgr.get_merged_options(user_fmt_opts, name) - reader = reader.format("cloudFiles").option("cloudFiles.format", fmt) + reader = reader.option("cloudFiles.format", fmt) for k, v in final_fmt_opts.items(): reader = reader.option(k, v) # schema hints schema_hints = table_config.get("schema_hints") if schema_hints: - reader = reader.option("cloudFiles.schemaHints", ",".join({schema_hints} | fmt_mgr.default_schema)) + reader = reader.option("cloudFiles.schemaHints", ", ".join({schema_hints} | fmt_mgr.default_schema)) else: - reader = reader.option("cloudFiles.schemaHints", ",".join(fmt_mgr.default_schema)) + reader = reader.option("cloudFiles.schemaHints", ", ".join(fmt_mgr.default_schema)) return reader - \ No newline at end of file + +def get_df_with_config(spark: SparkSession, table_config: dict, schema_location: str = None) -> DataFrame: + validate_config(table_config) + fmt = table_config.get("format") + fmt_mgr = formatmanager.get_format_manager(fmt) + + reader = spark.readStream.format("cloudFiles") + reader = _apply_table_options(reader, table_config, fmt_mgr) + if schema_location: + reader = reader.option("cloudFiles.schemaLocation", schema_location) + + # include file metadata + return reader.load(get_table_volume_path(table_config.get("name"))).selectExpr("*", "_metadata") + +def get_placeholder_df_with_config(spark: SparkSession, table_config: dict) -> DataFrame: + validate_config(table_config) + fmt = table_config.get("format") + fmt_mgr = formatmanager.get_format_manager(fmt) + + reader = spark.readStream.format("cloudFiles") + reader = _apply_table_options(reader, table_config, 
fmt_mgr).schema(fmt_mgr.get_default_schema()) + return reader.load(get_table_volume_path(table_config.get("name"))) \ No newline at end of file From 628753e266b6bf422b39db231bf961fde0f29720 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 23 Sep 2025 13:24:25 -0700 Subject: [PATCH 50/60] Fix file name --- filepush/{REDME.md => README.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename filepush/{REDME.md => README.md} (100%) diff --git a/filepush/REDME.md b/filepush/README.md similarity index 100% rename from filepush/REDME.md rename to filepush/README.md From 2fdaea37e3b60f982aab1548d8e286ea1867cbb1 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 23 Sep 2025 14:54:30 -0700 Subject: [PATCH 51/60] Update default target and description --- filepush/dab/databricks.yml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/filepush/dab/databricks.yml b/filepush/dab/databricks.yml index 411ab284..bed0f416 100644 --- a/filepush/dab/databricks.yml +++ b/filepush/dab/databricks.yml @@ -7,24 +7,21 @@ bundle: include: - resources/*.yml -# experimental: -# skip_name_prefix_for_schema: true - targets: # The deployment targets. See https://docs.databricks.com/en/dev-tools/bundles/deployment-modes.html dev: mode: development - default: true workspace: host: https://e2-dogfood.staging.cloud.databricks.com prod: mode: production + default: true workspace: host: https://e2-dogfood.staging.cloud.databricks.com - # root_path: /Workspace/Users/chi.yang@databricks.com/.bundle/${bundle.name}/${bundle.target} + root_path: /Workspace/Users/${workspace.current_user.userName}/.bundle/${bundle.name}/${bundle.target} permissions: - - user_name: chi.yang@databricks.com + - user_name: ${workspace.current_user.userName} level: CAN_MANAGE variables: catalog_name: description: The existing catalog where the schema will be created. default: main schema_name: - description: The name of the schema where the tables and ingestion pipeline will be created. + description: The name of the NEW schema where the tables will be created. default: filepushschema resource_name_prefix: description: The prefix for the resource names. 
default: ${var.catalog_name}_${var.schema_name}_ - \ No newline at end of file From c17b7324f8a8ec268192982ab758005a61b4b21c Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 23 Sep 2025 14:54:54 -0700 Subject: [PATCH 52/60] Print environment to console --- filepush/dab/src/utils/initialization.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/filepush/dab/src/utils/initialization.py b/filepush/dab/src/utils/initialization.py index 211d26d8..aa106072 100644 --- a/filepush/dab/src/utils/initialization.py +++ b/filepush/dab/src/utils/initialization.py @@ -56,11 +56,14 @@ logger.info(f"Volume {volume_path_root} configured") # Dump configs to environment json +all_configs = { + "catalog_name": catalog_name, + "schema_name": schema_name, + "volume_path_root": volume_path_root, + "volume_path_data": volume_path_data, + "volume_path_archive": volume_path_archive +} with open("../configs/environment.json", "w") as f: - json.dump({ - "catalog_name": catalog_name, - "schema_name": schema_name, - "volume_path_root": volume_path_root, - "volume_path_data": volume_path_data, - "volume_path_archive": volume_path_archive - }, f) + json.dump(all_configs, f) + +logger.info(f"==========\n%s\n==========", "\n".join(f"{k}: {v}" for k, v in all_configs.items())) From e3fb6334b7ef02f703eabc87981f1442a939cd1d Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 23 Sep 2025 16:16:15 -0700 Subject: [PATCH 53/60] More doc in debug notebook --- filepush/dab/src/debug_table_config.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/filepush/dab/src/debug_table_config.py b/filepush/dab/src/debug_table_config.py index 22928500..0d697fcd 100644 --- a/filepush/dab/src/debug_table_config.py +++ b/filepush/dab/src/debug_table_config.py @@ -1,6 +1,6 @@ # Databricks notebook source # MAGIC %md -# MAGIC # Paste the table config JSON you would like to debug and assign to variable `table_config` +# MAGIC ## Paste the table config JSON you would like to debug from `./configs/tables.json` and assign to variable `table_config` # MAGIC For example, # MAGIC ``` # MAGIC table_config = r''' @@ -15,6 +15,7 @@ # MAGIC } # MAGIC ''' # MAGIC ``` +# MAGIC Only `name` and `format` are required for a table. # COMMAND ---------- @@ -31,6 +32,11 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC ## Click `Run all` and inspect the parsed result. 
Iterate on the config until the result looks good + +# COMMAND ---------- + import json import tempfile from utils import tablemanager @@ -49,4 +55,9 @@ # Put schema location in temp directory with tempfile.TemporaryDirectory() as tmpdir: - display(tablemanager.get_df_with_config(spark, table_config_json, tmpdir)) \ No newline at end of file + display(tablemanager.get_df_with_config(spark, table_config_json, tmpdir)) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Copy and paste the modified config back to the `./configs/tables.json` in the DAB folder \ No newline at end of file From 7dc7dcfd4d760bb7d1315a0ea166c642783cd117 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 23 Sep 2025 16:18:44 -0700 Subject: [PATCH 54/60] Add README --- filepush/README.md | 117 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/filepush/README.md b/filepush/README.md index 195287c9..a4321c40 100644 --- a/filepush/README.md +++ b/filepush/README.md @@ -11,3 +11,120 @@ tags: --- # Managed File Push +## Table of Contents +- [Quick Start](#quick-start) +- [Debug Table Issues](#debug-table-issues) + +## Quick Start +### Step 1. Configure tables +Define the catalog and a NEW schema name where the tables will land in `./dab/databricks.yml` +``` +variables: + catalog_name: + description: The existing catalog where the NEW schema will be created. + default: main + schema_name: + description: The name of the NEW schema where the tables will be created. + default: filepushschema + +``` +Edit the table configs in `./dab/src/configs/tables.json`. Only `name` and `format` are required for a table. + +For possible `format_options` checkout [Auto Loader Options article](https://docs.databricks.com/aws/en/ingestion/cloud-object-storage/auto-loader/options). Not all options are supported. If you are not sure, feel free to specify only the `name` and `format`, or follow steps in [Debug Table Issues](#debug-table-issues) section to help come up with the proper options. +``` +[ + { + "name": "table1", + "format": "csv", + "format_options": { + "escape": "\"" + }, + "schema_hints": "id int, name string" + }, + { + "name": "table2", + "format": "json" + } + , + ... +] + +``` + +### Step 2. Deploy & setup +``` +$ cd dab +$ databricks bundle deploy +$ databricks bundle run configuration_job +``` +Wait for the configuration job to finish before moving to the next step. + +### Step 3. Retrieve endpoint & push files +Get the volume path for uploading the files +``` +$ databricks tables get main.filepushschema.table1 --output json | jq '.properties["filepush.table_volume_path_data"]' +``` +Example output: +``` +"/Volumes/main/filepushschema/main_filepushschema_filepush_volume/data/table1" +``` +Upload files to the path above using the [UC Volume APIs of your choice](https://docs.databricks.com/aws/en/volumes/volume-files#methods-for-managing-files-in-volumes). Here is an example using the **REST API**: +``` +$ curl --request PUT https:///api/2.0/fs/files/Volumes/main/filepushschema/main_filepushschema_filepush_volume/data/table1/datafile1.csv" \ + --header "Authorization: Bearer " \ + --header "Content-Type: application/octet-stream \ + --data-binary "@/local/file/path/datafile1.csv" +``` +Here is another example using the **Databricks CLI**. This way you do not need to specify the file name at destination. 
Pay attention to the `dbfs:` URL scheme for the destination path: +``` +$ databricks fs cp /local/file/path/datafile1.csv dbfs:/Volumes/main/filepushschema/main_filepushschema_filepush_volume/data/table1 +``` + +After maximum 1 minute, the data should land the corresponding table e.g. `main.filepushschema.table1` + +## Debug Table Issues +In case the data is not parsed correctly in the destination table, follow the steps below to fix the table configs. +### Step 1. Configure tables to debug +Configure tables just like [Step 1 in Quick Start](#step-1-configure-tables). + +### Step 2. Deploy & Setup in ***dev mode*** +``` +$ cd dab +$ databricks bundle deploy -t dev +$ databricks bundle run configuration_job -t dev +``` +Wait for the configuration job to finish before moving to the next step. Example output: +``` +2025-09-23 22:03:04,938 [INFO] initialization - ========== +catalog_name: main +schema_name: dev_chi_yang_filepushschema +volume_path_root: /Volumes/main/dev_chi_yang_filepushschema/main_filepushschema_filepush_volume +volume_path_data: /Volumes/main/dev_chi_yang_filepushschema/main_filepushschema_filepush_volume/data +volume_path_archive: /Volumes/main/dev_chi_yang_filepushschema/main_filepushschema_filepush_volume/archive +========== +``` +Pay attention that, ***dev mode put a prefix to the schema name***, and you should use the name output by the initialization job for the remaining steps. + +### Step 3. Retrieve endpoint & push files to debug +Get the volume path for uploading the files, pay attention to the ***prefix*** name of the schema: +``` +$ databricks tables get main.dev_chi_yang_filepushschema.table1 --output json | jq '.properties["filepush.table_volume_path_data"]' +``` +Example output: +``` +"/Volumes/main/dev_chi_yang_filepushschema/main_filepushschema_filepush_volume/data/table1" +``` +Follow the remaining steps of [Step 3 in Quick Start](#step-3-retrieve-endpoint--push-files) to push files for debug. + +### Step 4. Debug table configs +Open the `refresh_pipeline` in the workspace: +``` +$ databricks bundle open refresh_pipeline -t dev +``` +Then click `Edit pipeline` to launch the development UI. Open the notebook `debug_table_config` and follow the instruction there to fix the table configs. Remember to copy over the config to the table configs in `./dab/src/configs/tables.json`. + +### Step 5. Fix the table configs in production +Go though [Step 2 in Quick Start](#step-2-deploy--setup) to deploy the updated config, then issue a full-refresh to fix the problematic data in the table: +``` +$ databricks bundle run refresh_pipeline --full-refresh table1 +``` From e5ff8f4831d9c70e0fd97578df2ba3d40c68305c Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 23 Sep 2025 16:28:42 -0700 Subject: [PATCH 55/60] Beautify README --- filepush/README.md | 160 +++++++++++++++++++++++++++++---------------- 1 file changed, 105 insertions(+), 55 deletions(-) diff --git a/filepush/README.md b/filepush/README.md index a4321c40..769121f8 100644 --- a/filepush/README.md +++ b/filepush/README.md @@ -4,21 +4,36 @@ language: python author: "Chi Yang" date: 2025-08-07 -tags: +tags: - ingestion - file - nocode --- # Managed File Push + +A lightweight, no‑code file ingestion workflow. Configure a set of tables, get a volume path for each, and drop files into those paths—your data lands in Unity Catalog tables via Auto Loader. + ## Table of Contents - [Quick Start](#quick-start) + - [Step 1. Configure tables](#step-1-configure-tables) + - [Step 2. 
Deploy & set up](#step-2-deploy--set-up) + - [Step 3. Retrieve endpoint & push files](#step-3-retrieve-endpoint--push-files) - [Debug Table Issues](#debug-table-issues) + - [Step 1. Configure tables to debug](#step-1-configure-tables-to-debug) + - [Step 2. Deploy & set up in dev mode](#step-2-deploy--set-up-in-dev-mode) + - [Step 3. Retrieve endpoint & push files to debug](#step-3-retrieve-endpoint--push-files-to-debug) + - [Step 4. Debug table configs](#step-4-debug-table-configs) + - [Step 5. Fix the table configs in production](#step-5-fix-the-table-configs-in-production) + +--- ## Quick Start + ### Step 1. Configure tables -Define the catalog and a NEW schema name where the tables will land in `./dab/databricks.yml` -``` +Define the catalog and a **new** schema name where the tables will land in `./dab/databricks.yml`: + +```yaml variables: catalog_name: description: The existing catalog where the NEW schema will be created. @@ -26,75 +41,94 @@ variables: schema_name: description: The name of the NEW schema where the tables will be created. default: filepushschema - ``` -Edit the table configs in `./dab/src/configs/tables.json`. Only `name` and `format` are required for a table. -For possible `format_options` checkout [Auto Loader Options article](https://docs.databricks.com/aws/en/ingestion/cloud-object-storage/auto-loader/options). Not all options are supported. If you are not sure, feel free to specify only the `name` and `format`, or follow steps in [Debug Table Issues](#debug-table-issues) section to help come up with the proper options. -``` +Edit table configs in `./dab/src/configs/tables.json`. Only `name` and `format` are required. + +For supported `format_options`, see the [Auto Loader options](https://docs.databricks.com/aws/en/ingestion/cloud-object-storage/auto-loader/options). Not all options are supported here. If unsure, specify only `name` and `format`, or follow [Debug Table Issues](#debug-table-issues) to discover the correct options. + +```json [ { "name": "table1", "format": "csv", - "format_options": { - "escape": "\"" - }, + "format_options": { "escape": "\\"" }, "schema_hints": "id int, name string" }, { "name": "table2", "format": "json" } - , - ... + // ... ] - ``` -### Step 2. Deploy & setup -``` -$ cd dab -$ databricks bundle deploy -$ databricks bundle run configuration_job +> **Tip:** Keep `schema_hints` minimal; Auto Loader can evolve the schema as new columns appear. + +### Step 2. Deploy & set up + +```bash +cd dab +_databricks bundle deploy +_databricks bundle run configuration_job ``` -Wait for the configuration job to finish before moving to the next step. + +Wait for the configuration job to finish before moving on. ### Step 3. Retrieve endpoint & push files -Get the volume path for uploading the files -``` -$ databricks tables get main.filepushschema.table1 --output json | jq '.properties["filepush.table_volume_path_data"]' +Fetch the volume path for uploading files to a specific table (example: `table1`): + +```bash +databricks tables get main.filepushschema.table1 --output json \ + | jq -r '.properties["filepush.table_volume_path_data"]' ``` + Example output: -``` + +```text "/Volumes/main/filepushschema/main_filepushschema_filepush_volume/data/table1" ``` -Upload files to the path above using the [UC Volume APIs of your choice](https://docs.databricks.com/aws/en/volumes/volume-files#methods-for-managing-files-in-volumes). 
Here is an example using the **REST API**: -``` -$ curl --request PUT https:///api/2.0/fs/files/Volumes/main/filepushschema/main_filepushschema_filepush_volume/data/table1/datafile1.csv" \ - --header "Authorization: Bearer " \ - --header "Content-Type: application/octet-stream \ - --data-binary "@/local/file/path/datafile1.csv" -``` -Here is another example using the **Databricks CLI**. This way you do not need to specify the file name at destination. Pay attention to the `dbfs:` URL scheme for the destination path: + +Upload files to the path above using any of the [Volumes file APIs](https://docs.databricks.com/aws/en/volumes/volume-files#methods-for-managing-files-in-volumes). + +**REST API example**: + +```bash +# prerequisites: export DATABRICKS_HOST and DATABRICKS_TOKEN +curl -X PUT "$DATABRICKS_HOST/api/2.0/fs/files/Volumes/main/filepushschema/main_filepushschema_filepush_volume/data/table1/datafile1.csv" \ + -H "Authorization: Bearer $DATABRICKS_TOKEN" \ + -H "Content-Type: application/octet-stream" \ + --data-binary @"/local/file/path/datafile1.csv" ``` -$ databricks fs cp /local/file/path/datafile1.csv dbfs:/Volumes/main/filepushschema/main_filepushschema_filepush_volume/data/table1 + +**Databricks CLI example** (destination uses the `dbfs:` scheme): + +```bash +databricks fs cp /local/file/path/datafile1.csv \ + dbfs:/Volumes/main/filepushschema/main_filepushschema_filepush_volume/data/table1 ``` -After maximum 1 minute, the data should land the corresponding table e.g. `main.filepushschema.table1` +Within about a minute, the data should appear in the table `main.filepushschema.table1`. + +--- ## Debug Table Issues -In case the data is not parsed correctly in the destination table, follow the steps below to fix the table configs. +If data isn’t parsed as expected, use **dev mode** to iterate on table options safely. + ### Step 1. Configure tables to debug -Configure tables just like [Step 1 in Quick Start](#step-1-configure-tables). +Configure tables as in [Step 1 of Quick Start](#step-1-configure-tables). -### Step 2. Deploy & Setup in ***dev mode*** -``` -$ cd dab -$ databricks bundle deploy -t dev -$ databricks bundle run configuration_job -t dev -``` -Wait for the configuration job to finish before moving to the next step. Example output: +### Step 2. Deploy & set up in **dev mode** + +```bash +cd dab +databricks bundle deploy -t dev +databricks bundle run configuration_job -t dev ``` + +Wait for the configuration job to finish. Example output: + +```text 2025-09-23 22:03:04,938 [INFO] initialization - ========== catalog_name: main schema_name: dev_chi_yang_filepushschema @@ -103,28 +137,44 @@ volume_path_data: /Volumes/main/dev_chi_yang_filepushschema/main_filepushschema_ volume_path_archive: /Volumes/main/dev_chi_yang_filepushschema/main_filepushschema_filepush_volume/archive ========== ``` -Pay attention that, ***dev mode put a prefix to the schema name***, and you should use the name output by the initialization job for the remaining steps. + +> **Note:** In **dev mode**, the schema name is **prefixed**. Use the printed schema name for the remaining steps. ### Step 3. 
Retrieve endpoint & push files to debug -Get the volume path for uploading the files, pay attention to the ***prefix*** name of the schema: -``` -$ databricks tables get main.dev_chi_yang_filepushschema.table1 --output json | jq '.properties["filepush.table_volume_path_data"]' +Get the dev volume path (note the prefixed schema): + +```bash +databricks tables get main.dev_chi_yang_filepushschema.table1 --output json \ + | jq -r '.properties["filepush.table_volume_path_data"]' ``` + Example output: -``` + +```text "/Volumes/main/dev_chi_yang_filepushschema/main_filepushschema_filepush_volume/data/table1" ``` -Follow the remaining steps of [Step 3 in Quick Start](#step-3-retrieve-endpoint--push-files) to push files for debug. + +Then follow the upload instructions from [Quick Start → Step 3](#step-3-retrieve-endpoint--push-files) to send test files. ### Step 4. Debug table configs -Open the `refresh_pipeline` in the workspace: -``` -$ databricks bundle open refresh_pipeline -t dev +Open the pipeline in the workspace: + +```bash +databricks bundle open refresh_pipeline -t dev ``` -Then click `Edit pipeline` to launch the development UI. Open the notebook `debug_table_config` and follow the instruction there to fix the table configs. Remember to copy over the config to the table configs in `./dab/src/configs/tables.json`. + +Click **Edit pipeline** to launch the development UI. Open the `debug_table_config` notebook and follow its guidance to refine the table options. When satisfied, copy the final config back to `./dab/src/configs/tables.json`. ### Step 5. Fix the table configs in production -Go though [Step 2 in Quick Start](#step-2-deploy--setup) to deploy the updated config, then issue a full-refresh to fix the problematic data in the table: -``` -$ databricks bundle run refresh_pipeline --full-refresh table1 +Redeploy the updated config and run a full refresh to correct existing data for an affected table: + +```bash +cd dab +databricks bundle deploy +databricks bundle run refresh_pipeline --full-refresh table1 ``` + +--- + +**That’s it!** You now have a managed file‑push workflow with debuggable table configs and repeatable deployments. + From 94b394af0733f799fece36bb5e26665b10c2fd72 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 23 Sep 2025 16:32:12 -0700 Subject: [PATCH 56/60] Fix a display issue in README --- filepush/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/filepush/README.md b/filepush/README.md index 769121f8..9ab7e2e5 100644 --- a/filepush/README.md +++ b/filepush/README.md @@ -52,14 +52,13 @@ For supported `format_options`, see the [Auto Loader options](https://docs.datab { "name": "table1", "format": "csv", - "format_options": { "escape": "\\"" }, + "format_options": { "escape": "\"" }, "schema_hints": "id int, name string" }, { "name": "table2", "format": "json" } - // ... 
] ``` From 9003a6c585b3564ca03b39e660567d45ce88e981 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 23 Sep 2025 16:36:38 -0700 Subject: [PATCH 57/60] newline --- filepush/dab/src/utils/tablemanager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/filepush/dab/src/utils/tablemanager.py b/filepush/dab/src/utils/tablemanager.py index 33376ded..5aaf9a55 100644 --- a/filepush/dab/src/utils/tablemanager.py +++ b/filepush/dab/src/utils/tablemanager.py @@ -94,4 +94,5 @@ def get_placeholder_df_with_config(spark: SparkSession, table_config: dict) -> D reader = spark.readStream.format("cloudFiles") reader = _apply_table_options(reader, table_config, fmt_mgr).schema(fmt_mgr.get_default_schema()) - return reader.load(get_table_volume_path(table_config.get("name"))) \ No newline at end of file + return reader.load(get_table_volume_path(table_config.get("name"))) + \ No newline at end of file From 4df63801f150f388a3ed8b0d662d4cc60b96bdb0 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Tue, 23 Sep 2025 16:41:33 -0700 Subject: [PATCH 58/60] Update example name --- filepush/dab/src/configs/tables.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filepush/dab/src/configs/tables.json b/filepush/dab/src/configs/tables.json index 98c4591f..3926a1bc 100644 --- a/filepush/dab/src/configs/tables.json +++ b/filepush/dab/src/configs/tables.json @@ -1,6 +1,6 @@ [ { - "name": "employees", + "name": "example_table", "format": "csv", "format_options": { "escape": "\"" From 2f55c3d276a69190ac43e47be42741ce2ef13bed Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Wed, 24 Sep 2025 09:40:46 -0700 Subject: [PATCH 59/60] Fix typo in doc and enrich instructions --- filepush/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/filepush/README.md b/filepush/README.md index 9ab7e2e5..96cf9ec6 100644 --- a/filepush/README.md +++ b/filepush/README.md @@ -68,8 +68,8 @@ For supported `format_options`, see the [Auto Loader options](https://docs.datab ```bash cd dab -_databricks bundle deploy -_databricks bundle run configuration_job +databricks bundle deploy +databricks bundle run configuration_job ``` Wait for the configuration job to finish before moving on. 
@@ -93,7 +93,7 @@ Upload files to the path above using any of the [Volumes file APIs](https://docs **REST API example**: ```bash -# prerequisites: export DATABRICKS_HOST and DATABRICKS_TOKEN +# prerequisites: export DATABRICKS_HOST and DATABRICKS_TOKEN (PAT token) curl -X PUT "$DATABRICKS_HOST/api/2.0/fs/files/Volumes/main/filepushschema/main_filepushschema_filepush_volume/data/table1/datafile1.csv" \ -H "Authorization: Bearer $DATABRICKS_TOKEN" \ -H "Content-Type: application/octet-stream" \ From ce62a8b5328d1424fad2a756782897cce8fb4355 Mon Sep 17 00:00:00 2001 From: chi-yang-db Date: Mon, 29 Sep 2025 14:44:33 -0700 Subject: [PATCH 60/60] Add more comments --- filepush/README.md | 4 ++-- filepush/dab/databricks.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/filepush/README.md b/filepush/README.md index 96cf9ec6..64750f30 100644 --- a/filepush/README.md +++ b/filepush/README.md @@ -85,7 +85,7 @@ databricks tables get main.filepushschema.table1 --output json \ Example output: ```text -"/Volumes/main/filepushschema/main_filepushschema_filepush_volume/data/table1" +/Volumes/main/filepushschema/main_filepushschema_filepush_volume/data/table1 ``` Upload files to the path above using any of the [Volumes file APIs](https://docs.databricks.com/aws/en/volumes/volume-files#methods-for-managing-files-in-volumes). @@ -150,7 +150,7 @@ databricks tables get main.dev_chi_yang_filepushschema.table1 --output json \ Example output: ```text -"/Volumes/main/dev_chi_yang_filepushschema/main_filepushschema_filepush_volume/data/table1" +/Volumes/main/dev_chi_yang_filepushschema/main_filepushschema_filepush_volume/data/table1 ``` Then follow the upload instructions from [Quick Start → Step 3](#step-3-retrieve-endpoint--push-files) to send test files. diff --git a/filepush/dab/databricks.yml b/filepush/dab/databricks.yml index bed0f416..c9c1729b 100644 --- a/filepush/dab/databricks.yml +++ b/filepush/dab/databricks.yml @@ -26,8 +26,8 @@ targets: variables: catalog_name: - description: The existing catalog where the schema will be created. - default: main + description: The existing catalog where the NEW schema will be created. + default: chi_catalog schema_name: description: The name of the NEW schema where the tables will be created. default: filepushschema
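For reference, the upload flow that the README walks through with the REST API and the Databricks CLI can also be scripted with the Databricks Python SDK, which the bundle's own utilities already use. The sketch below is a minimal, illustrative client, not part of the bundle: it assumes the bundle has been deployed, the configuration job has stamped `filepush.table_volume_path_data` onto the target table, and it reuses the `main.filepushschema.example_table` names from the examples above; the helper name `push_file` is hypothetical.

```python
# Minimal sketch (assumptions noted above): resolve a filepush table's upload path
# from its table property and push a local file there with the Databricks Python SDK.
from pathlib import Path

from databricks.sdk import WorkspaceClient


def push_file(full_table_name: str, local_file: str) -> str:
    # Credentials come from the environment or ~/.databrickscfg, as for the CLI examples.
    w = WorkspaceClient()

    # Read the per-table upload path set by the configuration job (initialization.py).
    table = w.tables.get(full_name=full_table_name)
    volume_path_data = (table.properties or {}).get("filepush.table_volume_path_data")
    if not volume_path_data:
        raise ValueError(f"{full_table_name} does not look like a filepush table")

    # Upload into the table's data folder; Auto Loader picks the file up on the next refresh.
    destination = f"{volume_path_data}/{Path(local_file).name}"
    with open(local_file, "rb") as f:
        w.files.upload(destination, f, overwrite=True)
    return destination


if __name__ == "__main__":
    # Hypothetical example using the catalog/schema/table names from the README.
    print(push_file("main.filepushschema.example_table", "/local/file/path/datafile1.csv"))
```

The destination it builds mirrors the `databricks fs cp` example in the README; only the transport differs.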