diff --git a/.vscode/settings.json b/.vscode/settings.json index 236981053..878eab807 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -3,7 +3,7 @@ "editor.cursorBlinking": "solid", "editor.fontFamily": "ui-monospace, Menlo, Monaco, 'Cascadia Mono', 'Segoe UI Mono', 'Roboto Mono', 'Oxygen Mono', 'Ubuntu Monospace', 'Source Code Pro', 'Fira Mono', 'Droid Sans Mono', 'Courier New', monospace", "editor.fontLigatures": false, - "editor.fontSize": 22, + "editor.fontSize": 12, "editor.formatOnPaste": true, "editor.formatOnSave": true, "editor.lineNumbers": "on", @@ -16,8 +16,7 @@ "explorer.openEditors.visible": 0, "files.autoSave": "afterDelay", "screencastMode.onlyKeyboardShortcuts": true, - "terminal.integrated.fontSize": 18, - "workbench.activityBar.visible": true, + "terminal.integrated.fontSize": 12, "workbench.colorTheme": "Visual Studio Dark", "workbench.fontAliasing": "antialiased", "workbench.statusBar.visible": true diff --git a/assets/tutorial_files/models/docs/schema.yml b/assets/tutorial_files/models/docs/schema.yml index c1dfcb9fc..1d0f8b2d9 100644 --- a/assets/tutorial_files/models/docs/schema.yml +++ b/assets/tutorial_files/models/docs/schema.yml @@ -11,15 +11,11 @@ models: - name: all_other_areas description: '{{ doc("all_other_areas") }}' - - name: bronze_parking_violations + - name: bronze_parking_violations description: Raw data related to parking violations in 2023, encompassing various details about each violation. columns: - name: summons_number description: '{{ doc("summons_number") }}' - tests: - - unique - - not_null - - generic_not_null - name: registration_state description: '{{ doc("registration_state") }}' - name: plate_type diff --git a/data/nyc_parking_violations.db b/data/nyc_parking_violations.db index 19e587528..81f7cd1f2 100644 Binary files a/data/nyc_parking_violations.db and b/data/nyc_parking_violations.db differ diff --git a/data/prod_nyc_parking_violations.db b/data/prod_nyc_parking_violations.db index 9a2bd4ad7..008479de5 100644 Binary files a/data/prod_nyc_parking_violations.db and b/data/prod_nyc_parking_violations.db differ diff --git a/my_database.duckdb b/my_database.duckdb new file mode 100644 index 000000000..4965164bb Binary files /dev/null and b/my_database.duckdb differ diff --git a/nyc_parking_violations/.gitignore b/nyc_parking_violations/.gitignore index 9a7943d2d..49f147cb9 100644 --- a/nyc_parking_violations/.gitignore +++ b/nyc_parking_violations/.gitignore @@ -1,4 +1,4 @@ -.user.yml + target/ dbt_packages/ logs/ diff --git a/nyc_parking_violations/.user.yml b/nyc_parking_violations/.user.yml new file mode 100644 index 000000000..665b03e87 --- /dev/null +++ b/nyc_parking_violations/.user.yml @@ -0,0 +1 @@ +id: 57e9178d-61d6-4af0-b0a1-75dab50afcea diff --git a/nyc_parking_violations/dbt_project.yml b/nyc_parking_violations/dbt_project.yml index ec48605d3..abab4f061 100644 --- a/nyc_parking_violations/dbt_project.yml +++ b/nyc_parking_violations/dbt_project.yml @@ -1,13 +1,12 @@ - # Name your project! Project names should contain only lowercase characters # and underscores. A good package name should reflect your organization's # name or the intended use of these models -name: 'nyc_parking_violations' -version: '1.0.0' +name: "nyc_parking_violations" +version: "1.0.0" config-version: 2 # This setting configures which "profile" dbt uses for this project. -profile: 'nyc_parking_violations' +profile: "nyc_parking_violations" # These configurations specify where dbt should look for different types of files. # The `model-paths` config, for example, states that models in this project can be @@ -19,17 +18,22 @@ seed-paths: ["seeds"] macro-paths: ["macros"] snapshot-paths: ["snapshots"] -clean-targets: # directories to be removed by `dbt clean` +clean-targets: # directories to be removed by `dbt clean` - "target" - "dbt_packages" - # Configuring models # Full documentation: https://docs.getdbt.com/docs/configuring-models # In this example config, we tell dbt to build all models in the example/ # directory as views. These settings can be overridden in the individual model # files using the `{{ config(...) }}` macro. +# models: +# nyc_parking_violations: +# # Config indicated by + and applies to all files under models/example/ +# example: +# +materialized: view + models: nyc_parking_violations: # Config indicated by + and applies to all files under models/example/ diff --git a/nyc_parking_violations/models/bronze/bronze_parking_violation_codes.sql b/nyc_parking_violations/models/bronze/bronze_parking_violation_codes.sql index 217a5f263..4074ab031 100644 --- a/nyc_parking_violations/models/bronze/bronze_parking_violation_codes.sql +++ b/nyc_parking_violations/models/bronze/bronze_parking_violation_codes.sql @@ -4,4 +4,4 @@ SELECT manhattan_96th_st_below, all_other_areas FROM - parking_violation_codes + parking_violation_codes \ No newline at end of file diff --git a/nyc_parking_violations/models/docs/docs_blocks.md b/nyc_parking_violations/models/docs/docs_block.md similarity index 100% rename from nyc_parking_violations/models/docs/docs_blocks.md rename to nyc_parking_violations/models/docs/docs_block.md diff --git a/nyc_parking_violations/models/docs/schema.yml b/nyc_parking_violations/models/docs/schema.yml index c1dfcb9fc..ba62ad7b6 100644 --- a/nyc_parking_violations/models/docs/schema.yml +++ b/nyc_parking_violations/models/docs/schema.yml @@ -11,7 +11,7 @@ models: - name: all_other_areas description: '{{ doc("all_other_areas") }}' - - name: bronze_parking_violations + - name: bronze_parking_violations description: Raw data related to parking violations in 2023, encompassing various details about each violation. columns: - name: summons_number diff --git a/nyc_parking_violations/models/example/first_model.sql b/nyc_parking_violations/models/example/first_model.sql index 942e472df..e89f7f0f8 100644 --- a/nyc_parking_violations/models/example/first_model.sql +++ b/nyc_parking_violations/models/example/first_model.sql @@ -1 +1,2 @@ -SELECT * FROM parking_violation_codes \ No newline at end of file +SELECT * +FROM parking_violation_codes diff --git a/nyc_parking_violations/models/example/ref_model.sql b/nyc_parking_violations/models/example/ref_model.sql index 9bb4f51c6..8b3adac88 100644 --- a/nyc_parking_violations/models/example/ref_model.sql +++ b/nyc_parking_violations/models/example/ref_model.sql @@ -1,4 +1,2 @@ -SELECT - COUNT(*) -FROM - {{ref('first_model')}} +SELECT count(*) +FROM {{ ref('first_model') }} \ No newline at end of file diff --git a/nyc_parking_violations/models/gold/gold_vehicles_metrics.sql b/nyc_parking_violations/models/gold/gold_vehicle_metrics.sql similarity index 100% rename from nyc_parking_violations/models/gold/gold_vehicles_metrics.sql rename to nyc_parking_violations/models/gold/gold_vehicle_metrics.sql diff --git a/nyc_parking_violations/profiles.yml b/nyc_parking_violations/profiles.yml index cabe18c54..88545bab0 100644 --- a/nyc_parking_violations/profiles.yml +++ b/nyc_parking_violations/profiles.yml @@ -1,12 +1,9 @@ nyc_parking_violations: outputs: - dev: - type: duckdb - path: '../data/nyc_parking_violations.db' - prod: - type: duckdb - # note that path is slightly different as GitHub actions - # start in the root directory and not in the - # nyc_parking_violations directory - path: './data/prod_nyc_parking_violations.db' - target: dev \ No newline at end of file + dev: + type: duckdb + path: ../data/nyc_parking_violations.db + prod: + type: duckdb + path: ./data/prod_nyc_parking_violations.db + target: dev diff --git a/nyc_parking_violations/tests/generic/generic_not_null.sql b/nyc_parking_violations/tests/generic/generic_not_null.sql index e49c2b087..25843a106 100644 --- a/nyc_parking_violations/tests/generic/generic_not_null.sql +++ b/nyc_parking_violations/tests/generic/generic_not_null.sql @@ -1,8 +1,7 @@ --- source: https://docs.getdbt.com/guides/best-practices/writing-custom-generic-tests#generic-tests-with-default-config-values {% test generic_not_null(model, column_name) %} - select * - from {{ model }} - where {{ column_name }} is null +select * +from {{ model }} +where {{ column_name }} is null -{% endtest %} \ No newline at end of file +{% endtest %} diff --git a/nyc_parking_violations/tests/violation_codes_revenue.sql b/nyc_parking_violations/tests/violation_codes_revenue.sql index 091ddfdd4..2dde94e7c 100644 --- a/nyc_parking_violations/tests/violation_codes_revenue.sql +++ b/nyc_parking_violations/tests/violation_codes_revenue.sql @@ -1,11 +1,10 @@ -{{ config(severity = 'warn') }} - -SELECT - violation_code, - SUM(fee_usd) AS total_revenue_usd +{{config(severity='warn')}} +SELECT +violation_code, +sum(fee_usd) AS total_revenue_usd FROM - {{ref('silver_parking_violation_codes')}} +{{ref('silver_parking_violation_codes')}} GROUP BY - violation_code +violation_code HAVING - NOT(total_revenue_usd >= 1) +NOT(total_revenue_usd >= 1) \ No newline at end of file diff --git a/run_sql_queries_here.ipynb b/run_sql_queries_here.ipynb index f7c54e6c9..24bfb07a2 100644 --- a/run_sql_queries_here.ipynb +++ b/run_sql_queries_here.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -12,9 +12,116 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
name
0bronze_parking_violation_codes
1bronze_parking_violations
2first_model
3gold_ticket_metrics
4gold_vehicle_metrics
5my_first_dbt_model
6my_second_dbt_model
7parking_violation_codes
8parking_violations_2023
9ref_model
10silver_parking_violation_codes
11silver_parking_violations
12silver_violation_tickets
13silver_violation_vehicles
\n", + "
" + ], + "text/plain": [ + " name\n", + "0 bronze_parking_violation_codes\n", + "1 bronze_parking_violations\n", + "2 first_model\n", + "3 gold_ticket_metrics\n", + "4 gold_vehicle_metrics\n", + "5 my_first_dbt_model\n", + "6 my_second_dbt_model\n", + "7 parking_violation_codes\n", + "8 parking_violations_2023\n", + "9 ref_model\n", + "10 silver_parking_violation_codes\n", + "11 silver_parking_violations\n", + "12 silver_violation_tickets\n", + "13 silver_violation_vehicles" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "sql_query = '''\n", "show tables\n", @@ -23,11 +130,397 @@ "with duckdb.connect('data/nyc_parking_violations.db') as con:\n", " display(con.sql(sql_query).df())" ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "# Connect to your DuckDB database\n", + "db_path = \"data/nyc_parking_violations.db\"\n", + "with duckdb.connect(db_path) as con:\n", + " # Get all tables and views from the main schema\n", + " objects = con.execute(\"\"\"\n", + " SELECT table_name, table_type \n", + " FROM information_schema.tables \n", + " WHERE table_schema = 'main'\n", + " \"\"\").fetchall()\n", + "\n", + " # Drop each table or view\n", + " for table_name, table_type in objects:\n", + " if table_type.upper() == 'VIEW':\n", + " con.execute(f\"DROP VIEW IF EXISTS {table_name}\")\n", + " elif table_type.upper() == 'BASE TABLE':\n", + " con.execute(f\"DROP TABLE IF EXISTS {table_name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "sql_query_import_1 = '''\n", + "CREATE OR REPLACE TABLE parking_violation_codes AS\n", + "SELECT *\n", + "from read_csv_auto('data/dof_parking_violation_codes.csv', \n", + "normalize_names=True)\n", + "'''\n", + "\n", + "sql_query_import_2 = '''\n", + "CREATE OR REPLACE TABLE parking_violations_2023 AS\n", + "SELECT *\n", + "from read_csv_auto('data/parking_violations_issued_fiscal_year_2023_sample.csv',\n", + "normalize_names=True)\n", + "'''\n", + "\n", + "with duckdb.connect('data/nyc_parking_violations.db') as con:\n", + " con.sql(sql_query_import_1)\n", + " con.sql(sql_query_import_2)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
registration_stateticket_count
0NJ9258
1PA3514
2FL2414
\n", + "
" + ], + "text/plain": [ + " registration_state ticket_count\n", + "0 NJ 9258\n", + "1 PA 3514\n", + "2 FL 2414" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sql_query = '''\n", + "select * from gold_vehicle_metrics limit 3\n", + "'''\n", + "\n", + "with duckdb.connect('data/nyc_parking_violations.db') as con:\n", + " display(con.sql(sql_query).df())" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "┌────────────────┬───────────────────┐\n", + "│ violation_code │ total_revenue_usd │\n", + "│ int64 │ int128 │\n", + "├────────────────┼───────────────────┤\n", + "│ 41 │ 0 │\n", + "└────────────────┴───────────────────┘" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sql_query = '''\n", + "select * from \"nyc_parking_violations\".\"main_dbt_test__audit\".\"violation_codes_revenue\"\n", + "'''\n", + "\n", + "with duckdb.connect('data/nyc_parking_violations.db') as con:\n", + " display(con.sql(sql_query))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "sql_query_import_1 = '''\n", + "CREATE OR REPLACE TABLE parking_violation_codes AS\n", + "SELECT *\n", + "from read_csv_auto('data/dof_parking_violation_codes.csv', \n", + "normalize_names=True)\n", + "'''\n", + "\n", + "sql_query_import_2 = '''\n", + "CREATE OR REPLACE TABLE parking_violations_2023 AS\n", + "SELECT *\n", + "from read_csv_auto('data/parking_violations_issued_fiscal_year_2023_sample.csv',\n", + "normalize_names=True)\n", + "'''\n", + "\n", + "with duckdb.connect('data/prod_nyc_parking_violations.db') as con:\n", + " con.sql(sql_query_import_1)\n", + " con.sql(sql_query_import_2)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
name
0bronze_parking_violation_codes
1bronze_parking_violations
2gold_ticket_metrics
3gold_vehicle_metrics
4gold_vehicles_metrics
5parking_violation_codes
6parking_violations_2023
7silver_violation_tickets
8silver_violation_vehicles
\n", + "
" + ], + "text/plain": [ + " name\n", + "0 bronze_parking_violation_codes\n", + "1 bronze_parking_violations\n", + "2 gold_ticket_metrics\n", + "3 gold_vehicle_metrics\n", + "4 gold_vehicles_metrics\n", + "5 parking_violation_codes\n", + "6 parking_violations_2023\n", + "7 silver_violation_tickets\n", + "8 silver_violation_vehicles" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sql_query = '''\n", + "show tables\n", + "'''\n", + "\n", + "with duckdb.connect('data/prod_nyc_parking_violations.db') as con:\n", + " display(con.sql(sql_query).df())" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count
097
\n", + "
" + ], + "text/plain": [ + " count\n", + "0 97" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sql_query = '''\n", + "select count(*) as count\n", + "from prod_nyc_parking_violations.bronze_parking_violation_codes\n", + "'''\n", + "\n", + "with duckdb.connect('data/prod_nyc_parking_violations.db') as con:\n", + " display(con.sql(sql_query).df())" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count
076
\n", + "
" + ], + "text/plain": [ + " count\n", + "0 76" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sql_query = '''\n", + "select count(*) as count\n", + "from prod_nyc_parking_violations.gold_ticket_metrics\n", + "'''\n", + "\n", + "with duckdb.connect('data/prod_nyc_parking_violations.db') as con:\n", + " display(con.sql(sql_query).df())" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "venv11", "language": "python", "name": "python3" }, @@ -41,14 +534,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.11" }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" - } - } + "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2