diff --git a/BritneH_Unit 3 SC 2.zip b/BritneH_Unit 3 SC 2.zip new file mode 100644 index 00000000..ce390d40 Binary files /dev/null and b/BritneH_Unit 3 SC 2.zip differ diff --git a/BritneH_Unit 3 SC 2/Instructions.md b/BritneH_Unit 3 SC 2/Instructions.md new file mode 100644 index 00000000..003aa386 --- /dev/null +++ b/BritneH_Unit 3 SC 2/Instructions.md @@ -0,0 +1,144 @@ +# Data Science Unit 3 Sprint Challenge 2 + +## Databases and SQL + +A SQL Query walks into a bar. In one corner of the bar are two tables. The Query +walks up to the tables and asks: + +... + +*"Mind if I join you?"* + +--- + +In this sprint challenge you will write code and answer questions related to +databases, with a focus on SQL but an acknowledgment of the broader ecosystem. +You may use any tools and references you wish, but your final code should +reflect *your* work and be saved in `.py` files (*not* notebooks), and (along +with this file including your written answers) turned in directly to your TL. + +For all your code, you may only import/use the following: +- other modules you write +- `sqlite3` (from the standard library) + +As always, make sure to manage your time - get a section/question to "good +enough" and then move on to make sure you do everything. You can always revisit +and polish at the end if time allows. + +This file is Markdown, so it may be helpful to open with VS Code or another tool +that allows you to view it nicely rendered. + +Good luck! + +### Part 1 - Making and populating a Database + +Consider the following data: + +| s | x | y | +|-----|---|---| +| 'g' | 3 | 9 | +| 'v' | 5 | 7 | +| 'f' | 8 | 7 | + +Using the standard `sqlite3` module: + +- Open a connection to a new (blank) database file `demo_data.sqlite3` +- Make a cursor, and execute an appropriate `CREATE TABLE` statement to accept + the above data (name the table `demo`) +- Write and execute appropriate `INSERT INTO` statements to add the data (as + shown above) to the database + +Make sure to `commit()` so your data is saved! The file size should be non-zero. + +Then write the following queries (also with `sqlite3`) to test: + +- Count how many rows you have - it should be 3! +- How many rows are there where both `x` and `y` are at least 5? +- How many unique values of `y` are there (hint - `COUNT()` can accept a keyword + `DISTINCT`)? + +Your code (to reproduce all above steps) should be saved in `demo_data.py` and +added to the repository along with the generated SQLite database. + +### Part 2 - The Northwind Database + +Using `sqlite3`, connect to the given `northwind_small.sqlite3` database. + +![Northwind Entity-Relationship Diagram](./northwind_erd.png) + +Above is an entity-relationship diagram - a picture summarizing the schema and +relationships in the database. Note that it was generated using Microsoft +Access, and some of the specific table/field names are different in the provided +data. You can see all the tables available to SQLite as follows: + +```python +>>> curs.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY +name;").fetchall() +[('Category',), ('Customer',), ('CustomerCustomerDemo',), +('CustomerDemographic',), ('Employee',), ('EmployeeTerritory',), ('Order',), +('OrderDetail',), ('Product',), ('Region',), ('Shipper',), ('Supplier',), +('Territory',)] +``` + +*Warning*: unlike the diagram, the tables in SQLite are singular and not plural +(do not end in `s`). And you can see the schema (`CREATE TABLE` statement) +behind any given table with: +```python +>>> curs.execute('SELECT sql FROM sqlite_master WHERE name="Customer";').fetchall() +[('CREATE TABLE "Customer" \n(\n "Id" VARCHAR(8000) PRIMARY KEY, \n +"CompanyName" VARCHAR(8000) NULL, \n "ContactName" VARCHAR(8000) NULL, \n +"ContactTitle" VARCHAR(8000) NULL, \n "Address" VARCHAR(8000) NULL, \n "City" +VARCHAR(8000) NULL, \n "Region" VARCHAR(8000) NULL, \n "PostalCode" +VARCHAR(8000) NULL, \n "Country" VARCHAR(8000) NULL, \n "Phone" VARCHAR(8000) +NULL, \n "Fax" VARCHAR(8000) NULL \n)',)] +``` + +In particular note that the *primary* key is `Id`, and not `CustomerId`. On +other tables (where it is a *foreign* key) it will be `CustomerId`. Also note - +the `Order` table conflicts with the `ORDER` keyword! We'll just avoid that +particular table, but it's a good lesson in the danger of keyword conflicts. + +Answer the following questions (each is from a single table): + +- What are the ten most expensive items (per unit price) in the database? +- What is the average age of an employee at the time of their hiring? (Hint: a + lot of arithmetic works with dates.) +- (*Stretch*) How does the average age of employee at hire vary by city? + +Your code (to load and query the data) should be saved in `northwind.py`, and +added to the repository. Do your best to answer in purely SQL, but if necessary +use Python/other logic to help. + +### Part 3 - Sailing the Northwind Seas + +You've answered some basic questions from the Northwind database, looking at +individual tables - now it's time to put things together, and `JOIN`! + +Using `sqlite3` in `northwind.py`, answer the following: + +- What are the ten most expensive items (per unit price) in the database *and* + their suppliers? +- What is the largest category (by number of unique products in it)? +- (*Stretch*) Who's the employee with the most territories? Use `TerritoryId` + (not name, region, or other fields) as the unique identifier for territories. + +### Part 4 - Questions (and your Answers) + +Answer the following questions, baseline ~3-5 sentences each, as if they were +interview screening questions (a form you fill when applying for a job): + +- In the Northwind database, what is the type of relationship between the + `Employee` and `Territory` tables? +- What is a situation where a document store (like MongoDB) is appropriate, and + what is a situation where it is not appropriate? +- What is "NewSQL", and what is it trying to achieve? + +### Part 5 - Turn it in! +Provide all the files you wrote (`demo_data.py`, `northwind.py`), as well as +this file with your answers to part 4, directly to your TL. You're also +encouraged to include the output from your queries as docstring comments, to +facilitate grading and feedback. Thanks for your hard work! + +If you got this far, check out the [larger Northwind +database](https://github.com/jpwhite3/northwind-SQLite3/blob/master/Northwind_large.sqlite.zip) - +your queries should run on it as well, with richer results. diff --git a/BritneH_Unit 3 SC 2/Part2.ipynb b/BritneH_Unit 3 SC 2/Part2.ipynb new file mode 100644 index 00000000..e69de29b diff --git a/BritneH_Unit 3 SC 2/demo_data.py b/BritneH_Unit 3 SC 2/demo_data.py new file mode 100644 index 00000000..28082d25 --- /dev/null +++ b/BritneH_Unit 3 SC 2/demo_data.py @@ -0,0 +1,48 @@ +import sqlite3 +conn = sqlite3.connect('demo_data.sqlite3') +curs = conn.cursor() +''' +Create Table called demo ! - accidentally mispelled +''' +create = "CREATE TABLE demp(s VARCHAR,x INT, y INT);" +query = create +curs.execute(query) + +''' +Fill in table: +''' +fill = ''' +INSERT INTO demp (s, x, y) +VALUES ('g', 3, 9), + ('v', 5, 7), + ('f', 8, 7) + ; + ''' +curs.execute(fill) +conn.commit() + +''' +Count number of rows: +''' +count_rows = '''SELECT COUNT(*) FROM demp''' +curs.execute(count_rows) + +''' +How many rows are there where both `x` and `y` are at least 5? +''' +x_and_y=''' +SELECT COUNT(*) +FROM demp +WHERE x>4 AND y>4 +''' +curs.execute(x_and_y) + +''' +How many unique values of `y` are there (hint - `COUNT()` can accept a keyword +`DISTINCT`)? +''' +distincty = ''' +SELECT COUNT (DISTINCT y) as d +FROM demp +''' +curs.execute(distincty) \ No newline at end of file diff --git a/BritneH_Unit 3 SC 2/demo_data.sqlite3 b/BritneH_Unit 3 SC 2/demo_data.sqlite3 new file mode 100644 index 00000000..ae0f5ca2 Binary files /dev/null and b/BritneH_Unit 3 SC 2/demo_data.sqlite3 differ diff --git a/BritneH_Unit 3 SC 2/northwind.py b/BritneH_Unit 3 SC 2/northwind.py new file mode 100644 index 00000000..459eb07a --- /dev/null +++ b/BritneH_Unit 3 SC 2/northwind.py @@ -0,0 +1,49 @@ +import sqlite3 +conn = sqlite3.connect('northwind_small.sqlite3') +curs = conn.cursor() +conn.commit() + +''' +What are the ten most expensive items (per unit price) in the database? +''' +most_expensive = ''' +SELECT * +FROM Product +ORDER BY UnitPrice DESC +LIMIT 10; +''' +curs.execute(most_expensive) + +''' +What is the average age of an employee at the time of their hiring? +''' +avg_age =''' +SELECT AVG(HireDate - BirthDate) +FROM Employee; +''' +curs.execute(avg_age) + +''' +- What are the ten most expensive items (per unit price) in the database *and* + their suppliers? +''' +top_supplier = ''' +SELECT ProductName, SupplierId, UnitPrice, CompanyName +FROM Product +JOIN Supplier +ON Product.SupplierId = Supplier.Id +ORDER BY UnitPrice DESC +LIMIT 10; +''' +curs.execute(top_supplier) + + +''' +- What is the largest category (by number of unique products in it)? +''' + +category = ''' +SELECT CategoryId, ProductName, COUNT (DISTINCT CategoryId) as total +FROM Product +''' +curs.execute(category) \ No newline at end of file diff --git a/BritneH_Unit 3 SC 2/northwind_small.sqlite3 b/BritneH_Unit 3 SC 2/northwind_small.sqlite3 new file mode 100644 index 00000000..3bd6af08 Binary files /dev/null and b/BritneH_Unit 3 SC 2/northwind_small.sqlite3 differ diff --git a/BritneH_Unit 3 SC 2/scratchpad.ipynb b/BritneH_Unit 3 SC 2/scratchpad.ipynb new file mode 100644 index 00000000..a129aa0e --- /dev/null +++ b/BritneH_Unit 3 SC 2/scratchpad.ipynb @@ -0,0 +1,405 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "lmvpuzqe lmvpuzqe mY4V-4wY0JxKkXem9ELRV0jMJjmGKKyI otto.db.elephantsql.com\nCONNECTION: \nCURSOR: \n" + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "import psycopg2\n", + "import pandas \n", + "\n", + "load_dotenv() #> loads contents of the .env file into the script's environment\n", + "\n", + "DB_NAME = os.getenv(\"DB_NAME\")\n", + "DB_USER = os.getenv(\"DB_USER\")\n", + "DB_PASSWORD = os.getenv(\"DB_PASSWORD\")\n", + "DB_HOST = os.getenv(\"DB_HOST\")\n", + "\n", + "print(DB_NAME, DB_USER, DB_PASSWORD, DB_HOST)\n", + "\n", + "connection = psycopg2.connect(dbname=DB_NAME, user=DB_USER, password=DB_PASSWORD, host=DB_HOST)\n", + "print(\"CONNECTION:\", connection)\n", + "\n", + "cursor = connection.cursor()\n", + "print(\"CURSOR:\", cursor)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "sql_whatever = \"\"\"COPY titanic(survived,pclass,name,sex,age,siblings_spouses,parents_children,fare\n", + ") \n", + "FROM 'titanic.csv' with (format csv, header true, DELIMITER ',');\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "new_table =\"\"\" CREATE TABLE titanic2\n", + "(\n", + " survived INT,\n", + " pclass INT,\n", + " name VARCHAR(50),\n", + " sex VARCHAR(10),\n", + " age INT,\n", + " siblings_spouses INT,\n", + " parents_children INT,\n", + " fare FLOAT\n", + ");\n", + "\"\"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "new_query = \"CREATE TABLE titanic4 (Survived INT, Class INT, Name VARCHAR, Sex CHAR, Age FLOAT, Sibling_Spouse INT, Parent_Child INT, Fare FLOAT);\"" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "SyntaxError", + "evalue": "syntax error at or near \"\\\"\nLINE 1: \\COPY titanic(survived,pclass,name,sex,age,siblings_spouses,...\n ^\n", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mSyntaxError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mcursor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msql_whatever\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mSyntaxError\u001b[0m: syntax error at or near \"\\\"\nLINE 1: \\COPY titanic(survived,pclass,name,sex,age,siblings_spouses,...\n ^\n" + ] + } + ], + "source": [ + "cursor.execute(sql_whatever)\n", + "connection.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "cursor.execute(new_query)\n", + "connection.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "connection.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "RESULT: \n[]\n" + } + ], + "source": [ + "cursor.execute('SELECT * from titanic3;')\n", + "result = cursor.fetchall()\n", + "print(\"RESULT:\", type(result))\n", + "print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "CURSOR: \n" + }, + { + "output_type": "error", + "ename": "DuplicateTable", + "evalue": "relation \"titanic3\" already exists\n", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mDuplicateTable\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mnext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreader\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# Skip the header row.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mreader\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m cursor.execute(new_query\n\u001b[0m\u001b[0;32m 10\u001b[0m )\n\u001b[0;32m 11\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mDuplicateTable\u001b[0m: relation \"titanic3\" already exists\n" + ] + } + ], + "source": [ + "import csv\n", + "\n", + "cursor = connection.cursor()\n", + "print(\"CURSOR:\", cursor)\n", + "with open('titanic.csv', 'r') as f:\n", + " reader = csv.reader(f)\n", + " next(reader) # Skip the header row.\n", + " for row in reader:\n", + " cursor.execute(new_query\n", + " )\n", + "conn.commit()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "[(174,)]" + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "import sqlite3\n", + "conn = sqlite3.connect('rpg_db.sqlite3')\n", + "curs = conn.cursor()\n", + "\n", + "query = 'SELECT COUNT(*) FROM armory_item;'\n", + "curs.execute(query)\n", + "\n", + "curs.execute(query).fetchall()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "OperationalError", + "evalue": "table study_part1 already exists", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mOperationalError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mstudents\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"CREATE TABLE study_part1 (student VARCHAR,studied VARCHAR,grade INT,age INT,sex VARCHAR);\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mquery\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstudents\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mcurs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mOperationalError\u001b[0m: table study_part1 already exists" + ] + } + ], + "source": [ + "import sqlite3\n", + "conn = sqlite3.connect('study_part1.sqlite3')\n", + "curs = conn.cursor()\n", + "students = \"CREATE TABLE study_part1 (student VARCHAR,studied VARCHAR,grade INT,age INT,sex VARCHAR);\"\n", + "query = students\n", + "curs.execute(query)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "fill = '''\n", + "INSERT INTO study_part1 (student, studied, grade, age, sex) \n", + "VALUES ('Lion-O', 'True', 85, 24, 'Male'),\n", + " ('Cheetara', 'True', 95, 22, 'Female'),\n", + " ('Mumm-Ra', 'False', 65, 153, 'Male'),\n", + " ('Snarf', 'False', 70, 15, 'Male'),\n", + " ('Panthro', 'True', 80, 30, 'Male');\n", + " '''" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "[]" + }, + "metadata": {}, + "execution_count": 14 + } + ], + "source": [ + "curs.execute(fill)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "NameError", + "evalue": "name 'curs' is not defined", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mWHERE\u001b[0m \u001b[0msex\u001b[0m\u001b[1;33m=\u001b[0m \u001b[1;34m'Female'\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m '''\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mcurs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgender\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfetchall\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mNameError\u001b[0m: name 'curs' is not defined" + ] + } + ], + "source": [ + "gender ='''SELECT student \n", + "FROM study_part1\n", + "WHERE sex= 'Female'\n", + "'''\n", + "curs.execute(gender).fetchall()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "" + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "import sqlite3\n", + "conn = sqlite3.connect('demo_data.sqlite3')\n", + "curs = conn.cursor()\n", + "create = \"CREATE TABLE demp(s VARCHAR,x INT, y INT);\"\n", + "query = create\n", + "curs.execute(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "fill = '''\n", + "INSERT INTO demp (s, x, y) \n", + "VALUES ('g', 3, 9),\n", + " ('v', 5, 7),\n", + " ('f', 8, 7)\n", + " ;\n", + " '''\n", + "curs.execute(fill)\n", + "conn.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "None\n" + } + ], + "source": [ + "count_rows = '''SELECT COUNT(*) FROM demp'''\n", + "curs.execute(count_rows)\n", + "print(conn.commit())" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "None\n" + } + ], + "source": [ + "x_and_y='''\n", + "SELECT COUNT(*)\n", + "FROM demp\n", + "WHERE x>4 AND y>4\n", + "'''\n", + "curs.execute(x_and_y)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "" + }, + "metadata": {}, + "execution_count": 48 + } + ], + "source": [ + "distincty = '''\n", + "SELECT COUNT (DISTINCT y) as d\n", + "FROM demp\n", + "'''\n", + "curs.execute(distincty)" + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/Challenge/Instructions.md b/Challenge/Instructions.md new file mode 100644 index 00000000..003aa386 --- /dev/null +++ b/Challenge/Instructions.md @@ -0,0 +1,144 @@ +# Data Science Unit 3 Sprint Challenge 2 + +## Databases and SQL + +A SQL Query walks into a bar. In one corner of the bar are two tables. The Query +walks up to the tables and asks: + +... + +*"Mind if I join you?"* + +--- + +In this sprint challenge you will write code and answer questions related to +databases, with a focus on SQL but an acknowledgment of the broader ecosystem. +You may use any tools and references you wish, but your final code should +reflect *your* work and be saved in `.py` files (*not* notebooks), and (along +with this file including your written answers) turned in directly to your TL. + +For all your code, you may only import/use the following: +- other modules you write +- `sqlite3` (from the standard library) + +As always, make sure to manage your time - get a section/question to "good +enough" and then move on to make sure you do everything. You can always revisit +and polish at the end if time allows. + +This file is Markdown, so it may be helpful to open with VS Code or another tool +that allows you to view it nicely rendered. + +Good luck! + +### Part 1 - Making and populating a Database + +Consider the following data: + +| s | x | y | +|-----|---|---| +| 'g' | 3 | 9 | +| 'v' | 5 | 7 | +| 'f' | 8 | 7 | + +Using the standard `sqlite3` module: + +- Open a connection to a new (blank) database file `demo_data.sqlite3` +- Make a cursor, and execute an appropriate `CREATE TABLE` statement to accept + the above data (name the table `demo`) +- Write and execute appropriate `INSERT INTO` statements to add the data (as + shown above) to the database + +Make sure to `commit()` so your data is saved! The file size should be non-zero. + +Then write the following queries (also with `sqlite3`) to test: + +- Count how many rows you have - it should be 3! +- How many rows are there where both `x` and `y` are at least 5? +- How many unique values of `y` are there (hint - `COUNT()` can accept a keyword + `DISTINCT`)? + +Your code (to reproduce all above steps) should be saved in `demo_data.py` and +added to the repository along with the generated SQLite database. + +### Part 2 - The Northwind Database + +Using `sqlite3`, connect to the given `northwind_small.sqlite3` database. + +![Northwind Entity-Relationship Diagram](./northwind_erd.png) + +Above is an entity-relationship diagram - a picture summarizing the schema and +relationships in the database. Note that it was generated using Microsoft +Access, and some of the specific table/field names are different in the provided +data. You can see all the tables available to SQLite as follows: + +```python +>>> curs.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY +name;").fetchall() +[('Category',), ('Customer',), ('CustomerCustomerDemo',), +('CustomerDemographic',), ('Employee',), ('EmployeeTerritory',), ('Order',), +('OrderDetail',), ('Product',), ('Region',), ('Shipper',), ('Supplier',), +('Territory',)] +``` + +*Warning*: unlike the diagram, the tables in SQLite are singular and not plural +(do not end in `s`). And you can see the schema (`CREATE TABLE` statement) +behind any given table with: +```python +>>> curs.execute('SELECT sql FROM sqlite_master WHERE name="Customer";').fetchall() +[('CREATE TABLE "Customer" \n(\n "Id" VARCHAR(8000) PRIMARY KEY, \n +"CompanyName" VARCHAR(8000) NULL, \n "ContactName" VARCHAR(8000) NULL, \n +"ContactTitle" VARCHAR(8000) NULL, \n "Address" VARCHAR(8000) NULL, \n "City" +VARCHAR(8000) NULL, \n "Region" VARCHAR(8000) NULL, \n "PostalCode" +VARCHAR(8000) NULL, \n "Country" VARCHAR(8000) NULL, \n "Phone" VARCHAR(8000) +NULL, \n "Fax" VARCHAR(8000) NULL \n)',)] +``` + +In particular note that the *primary* key is `Id`, and not `CustomerId`. On +other tables (where it is a *foreign* key) it will be `CustomerId`. Also note - +the `Order` table conflicts with the `ORDER` keyword! We'll just avoid that +particular table, but it's a good lesson in the danger of keyword conflicts. + +Answer the following questions (each is from a single table): + +- What are the ten most expensive items (per unit price) in the database? +- What is the average age of an employee at the time of their hiring? (Hint: a + lot of arithmetic works with dates.) +- (*Stretch*) How does the average age of employee at hire vary by city? + +Your code (to load and query the data) should be saved in `northwind.py`, and +added to the repository. Do your best to answer in purely SQL, but if necessary +use Python/other logic to help. + +### Part 3 - Sailing the Northwind Seas + +You've answered some basic questions from the Northwind database, looking at +individual tables - now it's time to put things together, and `JOIN`! + +Using `sqlite3` in `northwind.py`, answer the following: + +- What are the ten most expensive items (per unit price) in the database *and* + their suppliers? +- What is the largest category (by number of unique products in it)? +- (*Stretch*) Who's the employee with the most territories? Use `TerritoryId` + (not name, region, or other fields) as the unique identifier for territories. + +### Part 4 - Questions (and your Answers) + +Answer the following questions, baseline ~3-5 sentences each, as if they were +interview screening questions (a form you fill when applying for a job): + +- In the Northwind database, what is the type of relationship between the + `Employee` and `Territory` tables? +- What is a situation where a document store (like MongoDB) is appropriate, and + what is a situation where it is not appropriate? +- What is "NewSQL", and what is it trying to achieve? + +### Part 5 - Turn it in! +Provide all the files you wrote (`demo_data.py`, `northwind.py`), as well as +this file with your answers to part 4, directly to your TL. You're also +encouraged to include the output from your queries as docstring comments, to +facilitate grading and feedback. Thanks for your hard work! + +If you got this far, check out the [larger Northwind +database](https://github.com/jpwhite3/northwind-SQLite3/blob/master/Northwind_large.sqlite.zip) - +your queries should run on it as well, with richer results. diff --git a/Challenge/Part2.ipynb b/Challenge/Part2.ipynb new file mode 100644 index 00000000..e69de29b diff --git a/Challenge/demo_data.py b/Challenge/demo_data.py new file mode 100644 index 00000000..28082d25 --- /dev/null +++ b/Challenge/demo_data.py @@ -0,0 +1,48 @@ +import sqlite3 +conn = sqlite3.connect('demo_data.sqlite3') +curs = conn.cursor() +''' +Create Table called demo ! - accidentally mispelled +''' +create = "CREATE TABLE demp(s VARCHAR,x INT, y INT);" +query = create +curs.execute(query) + +''' +Fill in table: +''' +fill = ''' +INSERT INTO demp (s, x, y) +VALUES ('g', 3, 9), + ('v', 5, 7), + ('f', 8, 7) + ; + ''' +curs.execute(fill) +conn.commit() + +''' +Count number of rows: +''' +count_rows = '''SELECT COUNT(*) FROM demp''' +curs.execute(count_rows) + +''' +How many rows are there where both `x` and `y` are at least 5? +''' +x_and_y=''' +SELECT COUNT(*) +FROM demp +WHERE x>4 AND y>4 +''' +curs.execute(x_and_y) + +''' +How many unique values of `y` are there (hint - `COUNT()` can accept a keyword +`DISTINCT`)? +''' +distincty = ''' +SELECT COUNT (DISTINCT y) as d +FROM demp +''' +curs.execute(distincty) \ No newline at end of file diff --git a/Challenge/demo_data.sqlite3 b/Challenge/demo_data.sqlite3 new file mode 100644 index 00000000..ae0f5ca2 Binary files /dev/null and b/Challenge/demo_data.sqlite3 differ diff --git a/Challenge/northwind.py b/Challenge/northwind.py new file mode 100644 index 00000000..459eb07a --- /dev/null +++ b/Challenge/northwind.py @@ -0,0 +1,49 @@ +import sqlite3 +conn = sqlite3.connect('northwind_small.sqlite3') +curs = conn.cursor() +conn.commit() + +''' +What are the ten most expensive items (per unit price) in the database? +''' +most_expensive = ''' +SELECT * +FROM Product +ORDER BY UnitPrice DESC +LIMIT 10; +''' +curs.execute(most_expensive) + +''' +What is the average age of an employee at the time of their hiring? +''' +avg_age =''' +SELECT AVG(HireDate - BirthDate) +FROM Employee; +''' +curs.execute(avg_age) + +''' +- What are the ten most expensive items (per unit price) in the database *and* + their suppliers? +''' +top_supplier = ''' +SELECT ProductName, SupplierId, UnitPrice, CompanyName +FROM Product +JOIN Supplier +ON Product.SupplierId = Supplier.Id +ORDER BY UnitPrice DESC +LIMIT 10; +''' +curs.execute(top_supplier) + + +''' +- What is the largest category (by number of unique products in it)? +''' + +category = ''' +SELECT CategoryId, ProductName, COUNT (DISTINCT CategoryId) as total +FROM Product +''' +curs.execute(category) \ No newline at end of file diff --git a/Challenge/northwind_small.sqlite3 b/Challenge/northwind_small.sqlite3 new file mode 100644 index 00000000..3bd6af08 Binary files /dev/null and b/Challenge/northwind_small.sqlite3 differ diff --git a/Challenge/scratchpad.ipynb b/Challenge/scratchpad.ipynb new file mode 100644 index 00000000..a129aa0e --- /dev/null +++ b/Challenge/scratchpad.ipynb @@ -0,0 +1,405 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "lmvpuzqe lmvpuzqe mY4V-4wY0JxKkXem9ELRV0jMJjmGKKyI otto.db.elephantsql.com\nCONNECTION: \nCURSOR: \n" + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "import psycopg2\n", + "import pandas \n", + "\n", + "load_dotenv() #> loads contents of the .env file into the script's environment\n", + "\n", + "DB_NAME = os.getenv(\"DB_NAME\")\n", + "DB_USER = os.getenv(\"DB_USER\")\n", + "DB_PASSWORD = os.getenv(\"DB_PASSWORD\")\n", + "DB_HOST = os.getenv(\"DB_HOST\")\n", + "\n", + "print(DB_NAME, DB_USER, DB_PASSWORD, DB_HOST)\n", + "\n", + "connection = psycopg2.connect(dbname=DB_NAME, user=DB_USER, password=DB_PASSWORD, host=DB_HOST)\n", + "print(\"CONNECTION:\", connection)\n", + "\n", + "cursor = connection.cursor()\n", + "print(\"CURSOR:\", cursor)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "sql_whatever = \"\"\"COPY titanic(survived,pclass,name,sex,age,siblings_spouses,parents_children,fare\n", + ") \n", + "FROM 'titanic.csv' with (format csv, header true, DELIMITER ',');\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "new_table =\"\"\" CREATE TABLE titanic2\n", + "(\n", + " survived INT,\n", + " pclass INT,\n", + " name VARCHAR(50),\n", + " sex VARCHAR(10),\n", + " age INT,\n", + " siblings_spouses INT,\n", + " parents_children INT,\n", + " fare FLOAT\n", + ");\n", + "\"\"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "new_query = \"CREATE TABLE titanic4 (Survived INT, Class INT, Name VARCHAR, Sex CHAR, Age FLOAT, Sibling_Spouse INT, Parent_Child INT, Fare FLOAT);\"" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "SyntaxError", + "evalue": "syntax error at or near \"\\\"\nLINE 1: \\COPY titanic(survived,pclass,name,sex,age,siblings_spouses,...\n ^\n", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mSyntaxError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mcursor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msql_whatever\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mSyntaxError\u001b[0m: syntax error at or near \"\\\"\nLINE 1: \\COPY titanic(survived,pclass,name,sex,age,siblings_spouses,...\n ^\n" + ] + } + ], + "source": [ + "cursor.execute(sql_whatever)\n", + "connection.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "cursor.execute(new_query)\n", + "connection.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "connection.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "RESULT: \n[]\n" + } + ], + "source": [ + "cursor.execute('SELECT * from titanic3;')\n", + "result = cursor.fetchall()\n", + "print(\"RESULT:\", type(result))\n", + "print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "CURSOR: \n" + }, + { + "output_type": "error", + "ename": "DuplicateTable", + "evalue": "relation \"titanic3\" already exists\n", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mDuplicateTable\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mnext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreader\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# Skip the header row.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mreader\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m cursor.execute(new_query\n\u001b[0m\u001b[0;32m 10\u001b[0m )\n\u001b[0;32m 11\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mDuplicateTable\u001b[0m: relation \"titanic3\" already exists\n" + ] + } + ], + "source": [ + "import csv\n", + "\n", + "cursor = connection.cursor()\n", + "print(\"CURSOR:\", cursor)\n", + "with open('titanic.csv', 'r') as f:\n", + " reader = csv.reader(f)\n", + " next(reader) # Skip the header row.\n", + " for row in reader:\n", + " cursor.execute(new_query\n", + " )\n", + "conn.commit()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "[(174,)]" + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "import sqlite3\n", + "conn = sqlite3.connect('rpg_db.sqlite3')\n", + "curs = conn.cursor()\n", + "\n", + "query = 'SELECT COUNT(*) FROM armory_item;'\n", + "curs.execute(query)\n", + "\n", + "curs.execute(query).fetchall()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "OperationalError", + "evalue": "table study_part1 already exists", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mOperationalError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mstudents\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"CREATE TABLE study_part1 (student VARCHAR,studied VARCHAR,grade INT,age INT,sex VARCHAR);\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mquery\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstudents\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mcurs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mOperationalError\u001b[0m: table study_part1 already exists" + ] + } + ], + "source": [ + "import sqlite3\n", + "conn = sqlite3.connect('study_part1.sqlite3')\n", + "curs = conn.cursor()\n", + "students = \"CREATE TABLE study_part1 (student VARCHAR,studied VARCHAR,grade INT,age INT,sex VARCHAR);\"\n", + "query = students\n", + "curs.execute(query)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "fill = '''\n", + "INSERT INTO study_part1 (student, studied, grade, age, sex) \n", + "VALUES ('Lion-O', 'True', 85, 24, 'Male'),\n", + " ('Cheetara', 'True', 95, 22, 'Female'),\n", + " ('Mumm-Ra', 'False', 65, 153, 'Male'),\n", + " ('Snarf', 'False', 70, 15, 'Male'),\n", + " ('Panthro', 'True', 80, 30, 'Male');\n", + " '''" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "[]" + }, + "metadata": {}, + "execution_count": 14 + } + ], + "source": [ + "curs.execute(fill)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "NameError", + "evalue": "name 'curs' is not defined", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mWHERE\u001b[0m \u001b[0msex\u001b[0m\u001b[1;33m=\u001b[0m \u001b[1;34m'Female'\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m '''\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mcurs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgender\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfetchall\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mNameError\u001b[0m: name 'curs' is not defined" + ] + } + ], + "source": [ + "gender ='''SELECT student \n", + "FROM study_part1\n", + "WHERE sex= 'Female'\n", + "'''\n", + "curs.execute(gender).fetchall()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "" + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "import sqlite3\n", + "conn = sqlite3.connect('demo_data.sqlite3')\n", + "curs = conn.cursor()\n", + "create = \"CREATE TABLE demp(s VARCHAR,x INT, y INT);\"\n", + "query = create\n", + "curs.execute(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "fill = '''\n", + "INSERT INTO demp (s, x, y) \n", + "VALUES ('g', 3, 9),\n", + " ('v', 5, 7),\n", + " ('f', 8, 7)\n", + " ;\n", + " '''\n", + "curs.execute(fill)\n", + "conn.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "None\n" + } + ], + "source": [ + "count_rows = '''SELECT COUNT(*) FROM demp'''\n", + "curs.execute(count_rows)\n", + "print(conn.commit())" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "None\n" + } + ], + "source": [ + "x_and_y='''\n", + "SELECT COUNT(*)\n", + "FROM demp\n", + "WHERE x>4 AND y>4\n", + "'''\n", + "curs.execute(x_and_y)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "" + }, + "metadata": {}, + "execution_count": 48 + } + ], + "source": [ + "distincty = '''\n", + "SELECT COUNT (DISTINCT y) as d\n", + "FROM demp\n", + "'''\n", + "curs.execute(distincty)" + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/Pipfile b/Pipfile new file mode 100644 index 00000000..b723d019 --- /dev/null +++ b/Pipfile @@ -0,0 +1,11 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] + +[packages] + +[requires] +python_version = "3.7" diff --git a/module1-introduction-to-sql/Chinook_Sqlite.sqlite b/module1-introduction-to-sql/Chinook_Sqlite.sqlite new file mode 100644 index 00000000..00a750d9 Binary files /dev/null and b/module1-introduction-to-sql/Chinook_Sqlite.sqlite differ diff --git a/module1-introduction-to-sql/Chinook_sqlite.sqlite3 b/module1-introduction-to-sql/Chinook_sqlite.sqlite3 new file mode 100644 index 00000000..e69de29b diff --git a/module1-introduction-to-sql/README.md b/module1-introduction-to-sql/README.md index 40497956..3a7afbdf 100644 --- a/module1-introduction-to-sql/README.md +++ b/module1-introduction-to-sql/README.md @@ -52,10 +52,23 @@ Use `sqlite3` to load and write queries to explore the data, and answer the following questions: - How many total Characters are there? + There are 302 characters. + - How many of each specific subclass? + There are 51 thief characters. + There are 75 cleric characters. + There are 68 fighter characters. + There are 108 mage characters with 11 being necromancer characters. + - How many total Items? + 172 + - How many of the Items are weapons? How many are not? + 37 are weapons, 135 are not + - How many Items does each character have? (Return first 20 rows) + + - How many Weapons does each character have? (Return first 20 rows) - On average, how many Items does each Character have? - On average, how many Weapons does each character have? diff --git a/module1-introduction-to-sql/study_part1.py b/module1-introduction-to-sql/study_part1.py new file mode 100644 index 00000000..e69de29b diff --git a/module1-introduction-to-sql/study_part1.sqlite3 b/module1-introduction-to-sql/study_part1.sqlite3 new file mode 100644 index 00000000..25e99a96 Binary files /dev/null and b/module1-introduction-to-sql/study_part1.sqlite3 differ diff --git a/module2-sql-for-analysis/elephant_queries.py b/module2-sql-for-analysis/elephant_queries.py new file mode 100644 index 00000000..2f4c7ec2 --- /dev/null +++ b/module2-sql-for-analysis/elephant_queries.py @@ -0,0 +1,25 @@ + +import os +from dotenv import load_dotenv +import psycopg2 +import pandas + +load_dotenv() #> loads contents of the .env file into the script's environment + +DB_NAME = os.getenv("DB_NAME") +DB_USER = os.getenv("DB_USER") +DB_PASSWORD = os.getenv("DB_PASSWORD") +DB_HOST = os.getenv("DB_HOST") + +print(DB_NAME, DB_USER, DB_PASSWORD, DB_HOST) + +connection = psycopg2.connect(dbname=DB_NAME, user=DB_USER, password=DB_PASSWORD, host=DB_HOST) +print("CONNECTION:", connection) + +cursor = connection.cursor() +print("CURSOR:", cursor) + +cursor.execute('SELECT * from test_table;') +result = cursor.fetchall() +print("RESULT:", type(result)) +print(result) \ No newline at end of file diff --git a/module2-sql-for-analysis/insert_titanic.py b/module2-sql-for-analysis/insert_titanic.py new file mode 100644 index 00000000..445a0a15 --- /dev/null +++ b/module2-sql-for-analysis/insert_titanic.py @@ -0,0 +1,8 @@ +import pandas as pd + +df = pd.read_csv('https://raw.githubusercontent.com/britneh/DS-Unit-3-Sprint-2-SQL-and-Databases/master/module2-sql-for-analysis/titanic.csv') + +print(df.head(10)) +print(df.shape) +import psycopg2 + diff --git a/module2-sql-for-analysis/practice.py b/module2-sql-for-analysis/practice.py new file mode 100644 index 00000000..615a6b85 --- /dev/null +++ b/module2-sql-for-analysis/practice.py @@ -0,0 +1 @@ +import sqlite3 diff --git a/module2-sql-for-analysis/rpg_db.sqlite3 b/module2-sql-for-analysis/rpg_db.sqlite3 new file mode 100644 index 00000000..e69de29b diff --git a/module2-sql-for-analysis/scratchpad.ipynb b/module2-sql-for-analysis/scratchpad.ipynb new file mode 100644 index 00000000..d7c93c28 --- /dev/null +++ b/module2-sql-for-analysis/scratchpad.ipynb @@ -0,0 +1,220 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "lmvpuzqe lmvpuzqe mY4V-4wY0JxKkXem9ELRV0jMJjmGKKyI otto.db.elephantsql.com\nCONNECTION: \nCURSOR: \n" + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "import psycopg2\n", + "import pandas \n", + "\n", + "load_dotenv() #> loads contents of the .env file into the script's environment\n", + "\n", + "DB_NAME = os.getenv(\"DB_NAME\")\n", + "DB_USER = os.getenv(\"DB_USER\")\n", + "DB_PASSWORD = os.getenv(\"DB_PASSWORD\")\n", + "DB_HOST = os.getenv(\"DB_HOST\")\n", + "\n", + "print(DB_NAME, DB_USER, DB_PASSWORD, DB_HOST)\n", + "\n", + "connection = psycopg2.connect(dbname=DB_NAME, user=DB_USER, password=DB_PASSWORD, host=DB_HOST)\n", + "print(\"CONNECTION:\", connection)\n", + "\n", + "cursor = connection.cursor()\n", + "print(\"CURSOR:\", cursor)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "sql_whatever = \"\"\"COPY titanic(survived,pclass,name,sex,age,siblings_spouses,parents_children,fare\n", + ") \n", + "FROM 'titanic.csv' with (format csv, header true, DELIMITER ',');\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "new_table =\"\"\" CREATE TABLE titanic2\n", + "(\n", + " survived INT,\n", + " pclass INT,\n", + " name VARCHAR(50),\n", + " sex VARCHAR(10),\n", + " age INT,\n", + " siblings_spouses INT,\n", + " parents_children INT,\n", + " fare FLOAT\n", + ");\n", + "\"\"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "new_query = \"CREATE TABLE titanic7 (Survived INT, Class INT, Name VARCHAR, Sex CHAR, Age FLOAT, Sibling_Spouse INT, Parent_Child INT, Fare FLOAT);\"" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "SyntaxError", + "evalue": "syntax error at or near \"\\\"\nLINE 1: \\COPY titanic(survived,pclass,name,sex,age,siblings_spouses,...\n ^\n", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mSyntaxError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mcursor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msql_whatever\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mSyntaxError\u001b[0m: syntax error at or near \"\\\"\nLINE 1: \\COPY titanic(survived,pclass,name,sex,age,siblings_spouses,...\n ^\n" + ] + } + ], + "source": [ + "cursor.execute(sql_whatever)\n", + "connection.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "cursor.execute(new_query)\n", + "connection.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "connection.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "RESULT: \n[]\n" + } + ], + "source": [ + "cursor.execute('SELECT * from titanic3;')\n", + "result = cursor.fetchall()\n", + "print(\"RESULT:\", type(result))\n", + "print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "CURSOR: \n" + }, + { + "output_type": "error", + "ename": "DuplicateTable", + "evalue": "relation \"titanic7\" already exists\n", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mDuplicateTable\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[0mreader\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcsv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mnext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreader\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# Skip the header row.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 8\u001b[1;33m cursor.execute(new_query\n\u001b[0m\u001b[0;32m 9\u001b[0m )\n\u001b[0;32m 10\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mDuplicateTable\u001b[0m: relation \"titanic7\" already exists\n" + ] + } + ], + "source": [ + "import csv\n", + "\n", + "cursor = connection.cursor()\n", + "print(\"CURSOR:\", cursor)\n", + "with open('titanic.csv') as f:\n", + " reader = csv.reader(f)\n", + " next(reader) # Skip the header row.\n", + " cursor.execute(new_query\n", + " )\n", + "connection.commit()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "OperationalError", + "evalue": "no such table: armory_weapon", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mOperationalError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mcurs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcursor\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mquery\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'SELECT COUNT(*) FROM armory_weapon;'\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mcurs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0mcurs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfetchall\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mOperationalError\u001b[0m: no such table: armory_weapon" + ] + } + ], + "source": [ + "import sqlite3\n", + "filepath = 'rpg_db.sqlite3'\n", + "conn = sqlite3.connect(filepath)\n", + "curs = conn.cursor()\n", + "query = 'SELECT COUNT(*) FROM armory_weapon;'\n", + "curs.execute(query)\n", + "\n", + "curs.execute(query).fetchall()\n" + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/module3-nosql-and-document-oriented-databases/mongo_queries.py b/module3-nosql-and-document-oriented-databases/mongo_queries.py new file mode 100644 index 00000000..e2a29e32 --- /dev/null +++ b/module3-nosql-and-document-oriented-databases/mongo_queries.py @@ -0,0 +1,59 @@ + +# app/mongo_queries.py + +from pymongo import MongoClient +import os +from dotenv import load_dotenv + +load_dotenv() + +DB_USER = os.getenv("MONGO_USER", default="OOPS") +DB_PASSWORD = os.getenv("MONGO_PASSWORD", default="OOPS") +CLUSTER_NAME = os.getenv("MONGO_CLUSTER_NAME", default="OOPS") + +connection_uri = f"mongodb+srv://{DB_USER}:{DB_PASSWORD}@{CLUSTER_NAME}.mongodb.net/test?retryWrites=true&w=majority" +print("----------------") +print("URI:", connection_uri) + +client = MongoClient("mongodb+srv://:@cluster0-6sjdp.mongodb.net/?retryWrites=true&w=majority") + +print("----------------") +print("CLIENT:", type(client), client) + +breakpoint() + +db = client.test_database # "test_database" or whatever you want to call it +print("----------------") +print("DB:", type(db), db) + +collection = db.pokemon_test # "pokemon_test" or whatever you want to call it +print("----------------") +print("COLLECTION:", type(collection), collection) + +print("----------------") +print("COLLECTIONS:") +print(db.list_collection_names()) + +collection.insert_one({ + "name": "Pikachu", + "level": 30, + "exp": 76000000000, + "hp": 400, +}) + +bulbasaur = { + "name": "Bulbasaur", + "type": "grass", + "moves":["Leech Seed", "Solar Beam"] +} + +eevee = { + "name": "Eevee", + "level": 40, + "exp": 7500, + "hp": 120, +} + +team = [bulbasaur, eevee] +print("DOCS:", collection.count_documents({})) +print(collection.count_documents({"name": "Pikachu"})) \ No newline at end of file