Contents

SQL in Python

SQLite

Based on docs.

1
2
3
4
5
6
7
8
9
import os
import sys

src_path = os.path.abspath(os.path.join(".."))
sys.path.append(src_path)
import sqlite3

import pandas as pd
from src import config
1
2
3
# parameters

SAMPLE = "777"
1
2
3
df = pd.read_parquet(os.path.join(config.TEMPDIR, f"data_{SAMPLE}.parquet"))
print(df.shape)
df.head(2)
(562996, 22)

user_idtransaction_dateamounttransaction_descriptionmerchant_nameauto_tagtagmanual_tagpostcodecredit_debit...account_createduser_registration_dateyear_of_birthaccount_idmerchant_business_linelatest_balancetransaction_idaccount_last_refreshedaccount_typegender
0607772014-11-27100.0xxxxxx xxxx5014 internet transferno merchanttransfersbroadbandno tagn16 0debit...2015-02-122014-05-231988.0378967personal0.0588664502017-04-04 07:33:00savingsm
1607772014-11-27-250.0<mdbremoved>no merchantsavings (general)savings (general)no tagn16 0credit...2015-02-122014-05-231988.0378968no merchant business line3000.0588663442017-04-04 07:33:00savingsm

2 rows × 22 columns

1
pd.read_sql("select * from pragma_table_info('outcomes')", conn).name.values
array(['user_id'], dtype=object)

Connect to database

1
2
3
db_path = os.path.join(config.DATADIR, f"{SAMPLE}.db")
conn = sqlite3.connect(db_path)
c = conn.cursor()
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
def create_user_list(sample):
    """Return the sorted, de-duplicated user ids of a sample.

    Returns a pd.Series (not the ndarray that ``.unique()`` yields) so the
    caller below can chain ``.to_csv``.  The original chained
    ``.unique()drop_duplicates()`` which is a syntax error, and an ndarray
    has neither ``drop_duplicates`` nor ``to_csv``.
    """
    file_name = f"data_{sample}.parquet"
    file_path = os.path.join(config.TEMPDIR, file_name)
    df = pd.read_parquet(file_path)
    return df.user_id.drop_duplicates().sort_values()


samples = ["XX7", "X77", "777"]
for sample in ["777"]:
    create_user_list(sample).to_csv("/Users/fgu/tmp/test.csv", index=False)
1
2
path = os.path.join(config.DATADIR, "users_777.csv")
pd.read_csv(path)

Unnamed: 0user_id
00777
101777
271167777
308777
4122810777
.........
179129064578777
180133470579777
181135176582777
182136885586777
183139404587777

184 rows × 2 columns

Create tables

Create tables with user_ids

1
2
3
4
5
6
# A DataFrame (not a Series) so to_sql writes one row per user.
# pd.Series({"user_id": array}) builds a single-element object Series
# holding the whole array — see the printed output in the next cell —
# which cannot be written as a one-column-per-user table.
ids = pd.DataFrame({"user_id": df.user_id.unique()})

tables = ["targets", "predictions", "outcomes"]
for table in tables:
    ids.to_sql(table, conn, index=False)
    # Index each table on user_id so the correlated updates below are fast.
    conn.execute(f"create index idx_{table}_user_id on {table}(user_id)")
1
pd.Series({"user_id": df.user_id.unique()})
user_id    [60777, 64777, 777, 7777, 71777, 76777, 50777,...
dtype: object
1
pd.read_sql("select * from sqlite_master", conn)

typenametbl_namerootpagesql
0tabletargetstargets2CREATE TABLE "targets" (\n"user_id" INTEGER\n)
1indexidx_targets_user_idtargets3CREATE INDEX idx_targets_user_id on targets(us...
2tablepredictionspredictions4CREATE TABLE "predictions" (\n"user_id" INTEGE...
3indexidx_predictions_user_idpredictions5CREATE INDEX idx_predictions_user_id on predic...
4tableoutcomesoutcomes6CREATE TABLE "outcomes" (\n"user_id" INTEGER\n)
5indexidx_outcomes_user_idoutcomes7CREATE INDEX idx_outcomes_user_id on outcomes(...
1
pd.read_sql("select * from targets", conn)

user_id
0777
11777
27777
38777
410777
......
179578777
180579777
181582777
182586777
183587777

184 rows × 1 columns

Add tables

1
2
3
4
5
6
def db_tables():
    """Return the names of all tables in the connected database."""
    rows = conn.execute(
        "select name from sqlite_master where type = 'table'"
    ).fetchall()
    return [name for (name,) in rows]


db_tables()
['targets', 'predictions', 'outcomes', 'tmp']
1
2
3
4
5
6
def db_tables():
    """Return the names of all tables in the connected database."""
    sql = "select name from sqlite_master where type = 'table'"
    tables = pd.read_sql(sql, conn)
    return tables["name"].values


db_tables()
array(['targets', 'predictions', 'outcomes', 'tmp'], dtype=object)
1
2
3
4
def add_table(table, table_name):
    """Write `table` to the database unless a table of that name exists."""
    if table_name in db_tables():
        return
    table.to_sql(table_name, conn, index=False)

Add columns

1
conn.execute("select name from sqlite_master").fetchall()
[('targets',), ('predictions',), ('outcomes',), ('tmp',)]
1
2
3
4
5
6
def tab_cols(table, conn):
    """Return an array with the column names of `table`."""
    query = "select name from pragma_table_info(?)"
    result = pd.read_sql(query, conn, params=(table,))
    return result["name"].values


tab_cols("outcomes", conn)
array(['user_id', 'spendmax', 'spendmin', 'spendmean'], dtype=object)
1
2
3
4
5
6
7
def tab_cols(table):
    """List table columns."""
    rows = c.execute("select name from pragma_table_info(?)", (table,))
    return [row[0] for row in rows.fetchall()]


tab_cols("outcomes")
['user_id', 'spendmax', 'spendmin', 'spendmean']
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
def add_column(df, table):
    """Add a per-user column to `table`.

    Input:
        df: pd.DataFrame with columns ['user_id', 'col_name'],
            one row per user.

    Writes df to a temporary table, copies the values into `table` via a
    correlated subquery on user_id, then drops the temp table.  No-op if
    the column already exists.
    """
    col_name = df.columns[1]
    if col_name in tab_cols(table):
        return
    # if_exists="replace" guards against a stale 'tmp' table left behind
    # by an earlier failed run, which would make to_sql raise ValueError.
    df.to_sql("tmp", conn, index=False, if_exists="replace")
    # NOTE(review): table and column names are interpolated via f-strings.
    # That is acceptable for these internal, code-defined names but would
    # be an SQL-injection risk with untrusted input.
    conn.executescript(
        f"""
        alter table {table} add column {col_name};

        update {table}
        set {col_name} = (
            select {col_name} from tmp
            where {table}.user_id = tmp.user_id);

        drop table tmp;
        """
    )
1
2
def add_table(table):
    """Add table to database.

    NOTE(review): unimplemented stub — the body was never written.  A
    working two-argument version of add_table is defined earlier in the
    file.
    """
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
def _spend_stat(df, stat, name):
    """Per-user aggregate of positive (spend) amounts.

    Applies the aggregate named `stat` ('max', 'min', 'mean') to each
    user's positive amounts.  Users with no positive amounts get NaN
    rather than being dropped, matching the original per-group apply.
    Returns a DataFrame with columns ['user_id', name].
    """
    return (
        df.groupby("user_id")
        .apply(lambda u: getattr(u[u.amount > 0].amount, stat)())
        .rename(name)
        .reset_index()
    )


def spendmax(df):
    """Largest positive transaction amount per user."""
    return _spend_stat(df, "max", "spendmax")


def spendmin(df):
    """Smallest positive transaction amount per user."""
    return _spend_stat(df, "min", "spendmin")


def spendmean(df):
    """Mean positive transaction amount per user."""
    return _spend_stat(df, "mean", "spendmean")
1
2
3
4
5
outcomes = [spendmax, spendmin, spendmean]

for outcome in outcomes:
    add_column(outcome(df), "outcomes")
    display(pd.read_sql("select * from outcomes", conn))

user_idspendmaxspendminspendmean
06077747885.4492190.0292.656956
16477710000.0000000.0151.317209
27774898.8798830.0154.275365
3777722300.0000000.07111.685793
4717774000.0000000.01146.197356
...............
17929977720265.0000000.1079.127393
180877714998.0000000.01378.858950
1818077717633.8007810.03130.221119
1828377735000.0000000.0186.623218
183867778000.0000000.0493.136604

184 rows × 4 columns

user_idspendmaxspendminspendmean
06077747885.4492190.0292.656956
16477710000.0000000.0151.317209
27774898.8798830.0154.275365
3777722300.0000000.07111.685793
4717774000.0000000.01146.197356
...............
17929977720265.0000000.1079.127393
180877714998.0000000.01378.858950
1818077717633.8007810.03130.221119
1828377735000.0000000.0186.623218
183867778000.0000000.0493.136604

184 rows × 4 columns

user_idspendmaxspendminspendmean
06077747885.4492190.0292.656956
16477710000.0000000.0151.317209
27774898.8798830.0154.275365
3777722300.0000000.07111.685793
4717774000.0000000.01146.197356
...............
17929977720265.0000000.1079.127393
180877714998.0000000.01378.858950
1818077717633.8007810.03130.221119
1828377735000.0000000.0186.623218
183867778000.0000000.0493.136604

184 rows × 4 columns

Connection info

PRAGMA is your friend for this and many other pieces of metadata.

List databases attached to the current connection

1
pd.read_sql_query("select * from pragma_database_list", conn)

seqnamefile
00main/Users/fgu/Library/Mobile Documents/com~apple~...

Database info

List tables attached to a database

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
pd.read_sql_query(
    """
    select
        *
    from
        sqlite_master
    where
        type = 'table' and
        name not like 'sqlite_%'
    """,
    conn,
)

typenametbl_namerootpagesql
0tabletargetstargets2CREATE TABLE targets(\n user_id int...
1tablepredictpredict3CREATE TABLE predict(\n user_id int...
2tableoutcomesoutcomes4CREATE TABLE outcomes(\n user_id in...

List indices attached to database

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
pd.read_sql_query(
    """
select
    *
from
    sqlite_master
where
    type = 'index'
""",
    conn,
)

typenametbl_namerootpagesql

Table info

List columns attached to a table

1
2
table = ("targets",)
pd.read_sql_query("select * from pragma_table_info(?)", conn, params=table)

cidnametypenotnulldflt_valuepk
00user_idinteger0None1

Get structure of a table

1
print(c.execute("select sql from sqlite_master where name = ?", table).fetchall()[0][0])
CREATE TABLE targets(
            user_id integer primary key
        )

Indices associated with a table

1
c.execute("pragma index_list('targets')").fetchall()
[]

Misc useful stuff

Move content of one table to another table

1
c.execute("""insert into new_table select * from old_table""")

Vacuum database regularly after altering tables or columns to free up overhead and reduce disk space.

1
c.execute("vacuum;")
<sqlite3.Cursor at 0x11c74a650>

Enable foreign key constraints

1
c.execute("select * from pragma_foreign_keys").fetchall()
[(0,)]
1
2
c.execute("pragma foreign_keys=on")
c.execute("select * from pragma_foreign_keys").fetchall()
[(1,)]
1
2
c.execute("pragma foreign_keys=off")
c.execute("select * from pragma_foreign_keys").fetchall()
[(0,)]

Use namedtuple()

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
from collections import namedtuple

# Field layout of a pragma_table_info row.
TableInfo = namedtuple("TableInfo", "cid, name, type, notnull, dflt_value, pk")


def tab_cols(table):
    """List table columns as (name, type, pk) tuples."""
    rows = c.execute("select * from pragma_table_info(?)", (table,)).fetchall()
    # Name the loop variable `info`, not `c`, to avoid reusing the
    # cursor's name inside the comprehension.
    return [(info.name, info.type, info.pk) for info in map(TableInfo._make, rows)]


tab_cols("outcomes")
[('user_id', 'integer', 1)]

The above deliberately overuses namedtuple for practice.

Use namedtuple

This is useful, and we can guess that the second element in each tuple is the column name. But it would be nice to know what the remaining information is, and then to be able to refer to different pieces of information by their name.

To find out what each piece of information is, we can either check out the PRAGMA docs or, what I find even more useful, can use Pandas like so: (ideally, there would be a way to retrieve column names directly from the query, but I haven’t been able to find any way to do so.)

1
2
3
4
import pandas as pd

tabinf = pd.read_sql_query("select * from pragma_table_info('stocks')", conn)
tabinf

cidnametypenotnulldflt_valuepk
00nametext0None0
11quantityreal0None0
22pricereal0None0

To label the pieces in the table_cols function, we can store the column names and create a namedtuple() (docs).

1
2
3
from collections import namedtuple

TableInfo = namedtuple("TableInfo", "cid, name, type, notnull, dflt_value, pk")

Mapping each tuple in the list that table_info() returns to our named tuple, we get the following:

1
2
for a in map(TableInfo._make, table_cols(table)):
    print(a)
TableInfo(cid=0, name='name', type='text', notnull=0, dflt_value=None, pk=0)
TableInfo(cid=1, name='quantity', type='real', notnull=0, dflt_value=None, pk=0)
TableInfo(cid=2, name='price', type='real', notnull=0, dflt_value=None, pk=0)

Or, what we really want:

1
2
for a in map(TableInfo._make, table_cols(table)):
    print(a.name)
name
quantity
price

We can now update our table_cols function.

Adding a table with a row of data

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
# Create cursor on the open connection
c = conn.cursor()

# Create table (all-text/real columns, no primary key)
c.execute(
    """CREATE TABLE stocks
             (date text, trans text, symbol text, qty real, price real)"""
)

# Insert a row of data (literal values; see the parameter-substitution
# section below for the safe way to pass Python variables)
c.execute("INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14)")

# Save (commit) changes -- inserts are not visible to other connections
# until committed
conn.commit()

# Close connection; the cursor becomes unusable after this
conn.close()

To check that the database now contains our stocks table, list all its tables.

1
2
3
conn = sqlite3.connect(db_path)
c = conn.cursor()
c.execute("select name from sqlite_master where type = 'table'").fetchall()
[('stocks',)]

Retrieving data

1
2
3
conn = sqlite3.connect(db_path)
c = conn.cursor()
c.execute("SELECT * FROM stocks").fetchall()
[('2006-01-05', 'BUY', 'RHAT', 100.0, 35.14)]

When adding Python variables to the query, never use string substitution directly like so:

1
2
symbol = "RHAT"
c.execute(f"select * from stocks where symbol = '{symbol}'").fetchall()
[('2006-01-05', 'BUY', 'RHAT', 100.0, 35.14)]

While this works, it’s vulnerable to injection attacks. Use parameter substitution instead. Either using question marks like so

1
2
symbol = ("RHAT",)
c.execute("select * from stocks where symbol = ?", symbol).fetchall()
[('2006-01-05', 'BUY', 'RHAT', 100.0, 35.14)]

or using named placeholders like so

1
c.execute("select * from stocks where symbol = :symbol", {"symbol": "RHAT"}).fetchall()
[('2006-01-05', 'BUY', 'RHAT', 100.0, 35.14)]

Why do I need fetchall() after cursor.execute()?

Because cursor.execute() returns the cursor itself, which is an iterator over the query results; fetchall() materialises those results into a list.

Using namedtuples

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
EmployeeRecord = namedtuple("EmployeeRecord", "name, age, title, department, paygrade")

import csv

# Python 3 version: csv.reader wants a text-mode file opened with
# newline="" (the original used Python 2's "rb" mode and print statements),
# and a context manager ensures the file is closed.
with open("employees.csv", newline="") as f:
    for emp in map(EmployeeRecord._make, csv.reader(f)):
        print(emp.name, emp.title)

import sqlite3

conn = sqlite3.connect("/companydata")
cursor = conn.cursor()
cursor.execute("SELECT name, age, title, department, paygrade FROM employees")
for emp in map(EmployeeRecord._make, cursor.fetchall()):
    print(emp.name, emp.title)

Using Pandas

Pandas is a very handy way to interact with databases in Python, as it makes dumping and retrieving dataframes very easy.

1
2
3
import pandas as pd

pd.read_sql_query("SELECT * FROM stocks", conn)

datetranssymbolqtypricenewcol
02006-01-05BUYRHAT100.035.14None

SQLAlchemy

Summary of this video.

SQL best practices

  • Avoid * in queries to have full control of returned columns (e.g. in case where table changes).

Sources

=== Old notes to integrate ===

# Selecting rows from a table

SELECT col FROM table;
SELECT col1, col2 FROM table;
SELECT * FROM table LIMIT 10;
SELECT DISTINCT col_values FROM table;


# Counting

SELECT COUNT(*) FROM table;             # Count rows of table
SELECT COUNT(col) FROM table;           # Count non-missing values in col
SELECT COUNT(DISTINCT col) FROM table;  # Count distinct values in col


# Filtering

SELECT * FROM table WHERE col1 > 2010;  # Get rows for which col1 > 2010
SELECT COUNT(*) FROM table WHERE x < y  # Count number of rows for which x < y
SELECT * FROM table WHERE x > Y AND y < z
SELECT * FROM table WHERE x > Y OR y < z
SELECT * FROM table WHERE x BETWEEN a AND b     # between a and b inclusive
SELECT * FROM table WHERE x IN (a, b, c)

SELECT title FROM films
WHERE (release_year = 1994 OR release_year = 1995)
AND (certification = 'PG' OR certification = 'R');


# Filter based on results from aggregate function

SELECT release_year
FROM films
GROUP BY release_year
HAVING COUNT(title) > 10;


# Missing values

SELECT COUNT(*)
FROM people
WHERE birthdate IS NULL;

SELECT name
FROM people
WHERE birthdate IS NOT NULL;


# Wildcards

SELECT name
FROM companies
WHERE name LIKE 'Data%';        # % matches zero, one, or many characters

SELECT name
FROM companies
WHERE name LIKE 'DataC_mp';     # _ matches exactly one character

SELECT name
FROM people
WHERE name NOT LIKE 'A%';


# Aggregate functions

SELECT AVG(budget)      # Also MAX, MIN, SUM,
FROM films;


# Aliasing

SELECT MAX(budget) AS max_budget,
       MAX(duration) AS max_duration
FROM films;


# Arithmetic

SELECT COUNT(deathdate) * 100.0 / COUNT(*) AS percentage_dead
FROM people


# Order by

SELECT title
FROM films
ORDER BY release_year;

SELECT title
FROM films
ORDER BY release_year DESC;


# Group by

SELECT sex, count(*)
FROM employees
GROUP BY sex;

SELECT release_year, MAX(budget)
FROM films
GROUP BY release_year;


# Building a database
#######################


# Create tables

CREATE TABLE professors (
 firstname text,
 lastname text
);


# Alter tables

ALTER TABLE table_name
ADD COLUMN column_name data_type;

ALTER TABLE table_name
DROP COLUMN column_name;

ALTER TABLE table_name
RENAME COLUMN old_name TO new_name;

DROP TABLE table_name


# Insert values

INSERT INTO transactions (transaction_date, amount, fee)
VALUES ('2018-09-24', 5454, '30');

SELECT transaction_date, amount + CAST(fee AS integer) AS net_amount
FROM transactions;

# Migrating data

INSERT INTO target_table
SELECT DISTINCT column_names
FROM source_table;


# Integrity constraints
# 1. Attribute constraints (data types)
# 2. Key constraints (primary keys)
# 3. Referential integrity constraints (enforced through foreign keys)


# Attribute constraints

ALTER TABLE professors
ALTER COLUMN firstname
TYPE varchar(16)
USING SUBSTRING(firstname FROM 1 FOR 16)

ALTER TABLE professors
ALTER COLUMN firstname
SET NOT NULL;

ALTER TABLE universities
ADD CONSTRAINT university_shortname_unq UNIQUE(university_shortname);


# Key constraints

# Superkey: each combination of attributes that identifies rows uniquely
# Candidate key: a superkey from which no column can be removed
# Primary key: one candidate key chosen to act as primary key
# Surrogate key: artificially created key (eg due to unsuitable candidate keys)
# Foreign keys: points to the primary key of another table


ALTER TABLE organizations
RENAME COLUMN organization TO id;
ALTER TABLE organizations
ADD CONSTRAINT organization_pk PRIMARY KEY (id);

ALTER TABLE affiliations
DROP CONSTRAINT affiliations_organizations_id_fkey;

ALTER TABLE professors
ADD COLUMN ID serial

UPDATE table_name
SET new_var = CONCAT(v1, v2);

-- Add a professor_id column that references id in professors table
ALTER TABLE affiliations
ADD COLUMN professor_id integer REFERENCES professors (id);
-- Rename the organization column to organization_id
ALTER TABLE affiliations
RENAME organization TO organization_id;
-- Add a foreign key on organization_id
ALTER TABLE affiliations
ADD CONSTRAINT affiliations_organization_fkey FOREIGN KEY (organization_id) REFERENCES organizations (id);

-- Update professor_id to professors.id where firstname, lastname correspond to rows in professors
UPDATE affiliations
SET professor_id = professors.id
FROM professors
WHERE affiliations.firstname = professors.firstname AND affiliations.lastname = professors.lastname;


# Referential integrity

CREATE TABLE a (
    id integer PRIMARY KEY,
    col_a varchar(64),
    ...,
    b_id integer REFERENCES b (id) VIOLATION SETTING)
# Where violation setting is one of the following:
# ON DELETE NO ACTION:  Deleting id in b that's referenced in a throws error
# ON DELETE CASCADE:    Deleting id in b deletes references in all tables
# RESTRICT:             Similar to no action
# SET NULL              Set referencing column to null
# SET DEFAULT           Set referencing column to default


# Joins

SELECT table_a.column1, table_a.column2, table_b.column1, table_b.column2, ...
FROM table_a
JOIN table_b
ON table_a_foreign_key = table_b_primary_key
WHERE condition;