Contents

SQL in Python

SQLite

Based on docs.

1
2
3
4
5
6
7
8
9
import os
import sys

src_path = os.path.abspath(os.path.join(".."))
sys.path.append(src_path)
import sqlite3

import pandas as pd
from src import config
1
2
3
# parameters

SAMPLE = "777"
1
2
3
df = pd.read_parquet(os.path.join(config.TEMPDIR, f"data_{SAMPLE}.parquet"))
print(df.shape)
df.head(2)
(562996, 22)

user_idtransaction_dateamounttransaction_descriptionmerchant_nameauto_tagtagmanual_tagpostcodecredit_debit...account_createduser_registration_dateyear_of_birthaccount_idmerchant_business_linelatest_balancetransaction_idaccount_last_refreshedaccount_typegender
0607772014-11-27100.0xxxxxx xxxx5014 internet transferno merchanttransfersbroadbandno tagn16 0debit...2015-02-122014-05-231988.0378967personal0.0588664502017-04-04 07:33:00savingsm
1607772014-11-27-250.0<mdbremoved>no merchantsavings (general)savings (general)no tagn16 0credit...2015-02-122014-05-231988.0378968no merchant business line3000.0588663442017-04-04 07:33:00savingsm

2 rows × 22 columns

1
pd.read_sql("select * from pragma_table_info('outcomes')", conn).name.values
array(['user_id'], dtype=object)

Connect to database

1
2
3
db_path = os.path.join(config.DATADIR, f"{SAMPLE}.db")
conn = sqlite3.connect(db_path)
c = conn.cursor()
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
def create_user_list(sample):
    """Return the sorted, de-duplicated user ids of a sample.

    Returns a pd.Series (not the ndarray that ``.unique()`` yields) so the
    caller below can chain ``.to_csv``.  The original chained
    ``.unique()drop_duplicates()`` which is a syntax error, and an ndarray
    has neither ``drop_duplicates`` nor ``to_csv``.
    """
    file_name = f"data_{sample}.parquet"
    file_path = os.path.join(config.TEMPDIR, file_name)
    df = pd.read_parquet(file_path)
    return df.user_id.drop_duplicates().sort_values()


samples = ["XX7", "X77", "777"]
for sample in ["777"]:
    create_user_list(sample).to_csv("/Users/fgu/tmp/test.csv", index=False)
1
2
path = os.path.join(config.DATADIR, "users_777.csv")
pd.read_csv(path)

Unnamed: 0user_id
00777
101777
271167777
308777
4122810777
.........
179129064578777
180133470579777
181135176582777
182136885586777
183139404587777

184 rows × 2 columns

Create tables

Create tables with user_ids

1
2
3
4
5
6
# A DataFrame (not a Series) so to_sql writes one row per user.
# pd.Series({"user_id": array}) builds a single-element object Series
# holding the whole array — see the printed output in the next cell —
# which cannot be written as a one-column-per-user table.
ids = pd.DataFrame({"user_id": df.user_id.unique()})

tables = ["targets", "predictions", "outcomes"]
for table in tables:
    ids.to_sql(table, conn, index=False)
    # Index each table on user_id so the correlated updates below are fast.
    conn.execute(f"create index idx_{table}_user_id on {table}(user_id)")
1
pd.Series({"user_id": df.user_id.unique()})
user_id    [60777, 64777, 777, 7777, 71777, 76777, 50777,...
dtype: object
1
pd.read_sql("select * from sqlite_master", conn)

typenametbl_namerootpagesql
0tabletargetstargets2CREATE TABLE "targets" (\n"user_id" INTEGER\n)
1indexidx_targets_user_idtargets3CREATE INDEX idx_targets_user_id on targets(us...
2tablepredictionspredictions4CREATE TABLE "predictions" (\n"user_id" INTEGE...
3indexidx_predictions_user_idpredictions5CREATE INDEX idx_predictions_user_id on predic...
4tableoutcomesoutcomes6CREATE TABLE "outcomes" (\n"user_id" INTEGER\n)
5indexidx_outcomes_user_idoutcomes7CREATE INDEX idx_outcomes_user_id on outcomes(...
1
pd.read_sql("select * from targets", conn)

user_id
0777
11777
27777
38777
410777
......
179578777
180579777
181582777
182586777
183587777

184 rows × 1 columns

Add tables

1
2
3
4
5
6
def db_tables():
    """Return the names of all tables in the connected database."""
    rows = conn.execute(
        "select name from sqlite_master where type = 'table'"
    ).fetchall()
    return [name for (name,) in rows]


db_tables()
['targets', 'predictions', 'outcomes', 'tmp']
1
2
3
4
5
6
def db_tables():
    """Return the names of all tables in the connected database."""
    sql = "select name from sqlite_master where type = 'table'"
    tables = pd.read_sql(sql, conn)
    return tables["name"].values


db_tables()
array(['targets', 'predictions', 'outcomes', 'tmp'], dtype=object)
1
2
3
4
def add_table(table, table_name):
    """Write `table` to the database unless a table of that name exists."""
    if table_name in db_tables():
        return
    table.to_sql(table_name, conn, index=False)

Add columns

1
conn.execute("select name from sqlite_master").fetchall()
[('targets',), ('predictions',), ('outcomes',), ('tmp',)]
1
2
3
4
5
6
def tab_cols(table, conn):
    """Return an array with the column names of `table`."""
    query = "select name from pragma_table_info(?)"
    result = pd.read_sql(query, conn, params=(table,))
    return result["name"].values


tab_cols("outcomes", conn)
array(['user_id', 'spendmax', 'spendmin', 'spendmean'], dtype=object)
1
2
3
4
5
6
7
def tab_cols(table):
    """List table columns."""
    rows = c.execute("select name from pragma_table_info(?)", (table,))
    return [row[0] for row in rows.fetchall()]


tab_cols("outcomes")
['user_id', 'spendmax', 'spendmin', 'spendmean']
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
def add_column(df, table):
    """Add a per-user column to `table`.

    Input:
        df: pd.DataFrame with columns ['user_id', 'col_name'],
            one row per user.

    Writes df to a temporary table, copies the values into `table` via a
    correlated subquery on user_id, then drops the temp table.  No-op if
    the column already exists.
    """
    col_name = df.columns[1]
    if col_name in tab_cols(table):
        return
    # if_exists="replace" guards against a stale 'tmp' table left behind
    # by an earlier failed run, which would make to_sql raise ValueError.
    df.to_sql("tmp", conn, index=False, if_exists="replace")
    # NOTE(review): table and column names are interpolated via f-strings.
    # That is acceptable for these internal, code-defined names but would
    # be an SQL-injection risk with untrusted input.
    conn.executescript(
        f"""
        alter table {table} add column {col_name};

        update {table}
        set {col_name} = (
            select {col_name} from tmp
            where {table}.user_id = tmp.user_id);

        drop table tmp;
        """
    )
1
2
def add_table(table):
    """Add table to database.

    NOTE(review): unimplemented stub — the body was never written.  A
    working two-argument version of add_table is defined earlier in the
    file.
    """
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
def _spend_stat(df, stat, name):
    """Per-user aggregate of positive (spend) amounts.

    Applies the aggregate named `stat` ('max', 'min', 'mean') to each
    user's positive amounts.  Users with no positive amounts get NaN
    rather than being dropped, matching the original per-group apply.
    Returns a DataFrame with columns ['user_id', name].
    """
    return (
        df.groupby("user_id")
        .apply(lambda u: getattr(u[u.amount > 0].amount, stat)())
        .rename(name)
        .reset_index()
    )


def spendmax(df):
    """Largest positive transaction amount per user."""
    return _spend_stat(df, "max", "spendmax")


def spendmin(df):
    """Smallest positive transaction amount per user."""
    return _spend_stat(df, "min", "spendmin")


def spendmean(df):
    """Mean positive transaction amount per user."""
    return _spend_stat(df, "mean", "spendmean")
1
2
3
4
5
outcomes = [spendmax, spendmin, spendmean]

for outcome in outcomes:
    add_column(outcome(df), "outcomes")
    display(pd.read_sql("select * from outcomes", conn))

user_idspendmaxspendminspendmean
06077747885.4492190.0292.656956
16477710000.0000000.0151.317209
27774898.8798830.0154.275365
3777722300.0000000.07111.685793
4717774000.0000000.01146.197356
...............
17929977720265.0000000.1079.127393
180877714998.0000000.01378.858950
1818077717633.8007810.03130.221119
1828377735000.0000000.0186.623218
183867778000.0000000.0493.136604

184 rows × 4 columns

user_idspendmaxspendminspendmean
06077747885.4492190.0292.656956
16477710000.0000000.0151.317209
27774898.8798830.0154.275365
3777722300.0000000.07111.685793
4717774000.0000000.01146.197356
...............
17929977720265.0000000.1079.127393
180877714998.0000000.01378.858950
1818077717633.8007810.03130.221119
1828377735000.0000000.0186.623218
183867778000.0000000.0493.136604

184 rows × 4 columns

user_idspendmaxspendminspendmean
06077747885.4492190.0292.656956
16477710000.0000000.0151.317209
27774898.8798830.0154.275365
3777722300.0000000.07111.685793
4717774000.0000000.01146.197356
...............
17929977720265.0000000.1079.127393
180877714998.0000000.01378.858950
1818077717633.8007810.03130.221119
1828377735000.0000000.0186.623218
183867778000.0000000.0493.136604

184 rows × 4 columns

Connection info

PRAGMA is your friend for this and many other pieces of metadata.

List databases attached to the current connection

1
pd.read_sql_query("select * from pragma_database_list", conn)

seqnamefile
00main/Users/fgu/Library/Mobile Documents/com~apple~...

Database info

List tables attached to a database

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
pd.read_sql_query(
    """
    select
        *
    from
        sqlite_master
    where
        type = 'table' and
        name not like 'sqlite_%'
    """,
    conn,
)

typenametbl_namerootpagesql
0tabletargetstargets2CREATE TABLE targets(\n user_id int...
1tablepredictpredict3CREATE TABLE predict(\n user_id int...
2tableoutcomesoutcomes4CREATE TABLE outcomes(\n user_id in...

List indices attached to database

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
pd.read_sql_query(
    """
select
    *
from
    sqlite_master
where
    type = 'index'
""",
    conn,
)

typenametbl_namerootpagesql

Table info

List columns attached to a table

1
2
table = ("targets",)
pd.read_sql_query("select * from pragma_table_info(?)", conn, params=table)

cidnametypenotnulldflt_valuepk
00user_idinteger0None1

Get structure of a table

1
print(c.execute("select sql from sqlite_master where name = ?", table).fetchall()[0][0])
CREATE TABLE targets(
            user_id integer primary key
        )

Indices associated with a table

1
c.execute("pragma index_list('targets')").fetchall()
[]

Misc useful stuff

Move content of one table to another table

1
c.execute("""insert into new_table select * from old_table""")

Vacuum database regularly after altering tables or columns to free up overhead and reduce disk space.

1
c.execute("vacuum;")
<sqlite3.Cursor at 0x11c74a650>

Enable foreign key constraints

1
c.execute("select * from pragma_foreign_keys").fetchall()
[(0,)]
1
2
c.execute("pragma foreign_keys=on")
c.execute("select * from pragma_foreign_keys").fetchall()
[(1,)]
1
2
c.execute("pragma foreign_keys=off")
c.execute("select * from pragma_foreign_keys").fetchall()
[(0,)]

Use namedtuple()

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
from collections import namedtuple

# Field layout of a pragma_table_info row.
TableInfo = namedtuple("TableInfo", "cid, name, type, notnull, dflt_value, pk")


def tab_cols(table):
    """List table columns as (name, type, pk) tuples."""
    rows = c.execute("select * from pragma_table_info(?)", (table,)).fetchall()
    # Name the loop variable `info`, not `c`, to avoid reusing the
    # cursor's name inside the comprehension.
    return [(info.name, info.type, info.pk) for info in map(TableInfo._make, rows)]


tab_cols("outcomes")
[('user_id', 'integer', 1)]

The above deliberately overuses namedtuple for practice.

Use namedtuple

This is useful, and we can guess that the second element in each tuple is the column name. But it would be nice to know what the remaining information is, and then to be able to refer to different pieces of information by their name.

To find out what each piece of information is, we can either check out the PRAGMA docs or, what I find even more useful, can use Pandas like so: (ideally, there would be a way to retrieve column names directly from the query, but I haven’t been able to find any way to do so.)

1
2
3
4
import pandas as pd

tabinf = pd.read_sql_query("select * from pragma_table_info('stocks')", conn)
tabinf

cidnametypenotnulldflt_valuepk
00nametext0None0
11quantityreal0None0
22pricereal0None0

To label the pieces in the table_cols function, we can store the column names and create a namedtuple() (docs).

1
2
3
from collections import namedtuple

TableInfo = namedtuple("TableInfo", "cid, name, type, notnull, dflt_value, pk")

Mapping each tuple in the list that table_info() returns to our named tuple, we get the following:

1
2
for a in map(TableInfo._make, table_cols(table)):
    print(a)
TableInfo(cid=0, name='name', type='text', notnull=0, dflt_value=None, pk=0)
TableInfo(cid=1, name='quantity', type='real', notnull=0, dflt_value=None, pk=0)
TableInfo(cid=2, name='price', type='real', notnull=0, dflt_value=None, pk=0)

Or, what we really want:

1
2
for a in map(TableInfo._make, table_cols(table)):
    print(a.name)
name
quantity
price

We can now update our table_cols function.

Adding a table with a row of data

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
# Create cursor on the open connection
c = conn.cursor()

# Create table (all-text/real columns, no primary key)
c.execute(
    """CREATE TABLE stocks
             (date text, trans text, symbol text, qty real, price real)"""
)

# Insert a row of data (literal values; see the parameter-substitution
# section below for the safe way to pass Python variables)
c.execute("INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14)")

# Save (commit) changes -- inserts are not visible to other connections
# until committed
conn.commit()

# Close connection; the cursor becomes unusable after this
conn.close()

To check that the database now contains our stocks table, list all its tables.

1
2
3
conn = sqlite3.connect(db_path)
c = conn.cursor()
c.execute("select name from sqlite_master where type = 'table'").fetchall()
[('stocks',)]

Retrieving data

1
2
3
conn = sqlite3.connect(db_path)
c = conn.cursor()
c.execute("SELECT * FROM stocks").fetchall()
[('2006-01-05', 'BUY', 'RHAT', 100.0, 35.14)]

When adding Python variables to the query, never use string substitution directly like so:

1
2
symbol = "RHAT"
c.execute(f"select * from stocks where symbol = '{symbol}'").fetchall()
[('2006-01-05', 'BUY', 'RHAT', 100.0, 35.14)]

While this works, it’s vulnerable to injection attacks. Use parameter substitution instead. Either using question marks like so

1
2
symbol = ("RHAT",)
c.execute("select * from stocks where symbol = ?", symbol).fetchall()
[('2006-01-05', 'BUY', 'RHAT', 100.0, 35.14)]

or using named placeholders like so

1
c.execute("select * from stocks where symbol = :symbol", {"symbol": "RHAT"}).fetchall()
[('2006-01-05', 'BUY', 'RHAT', 100.0, 35.14)]

Why do I need fetchall() after cursor.execute()?

Because cursor.execute() returns the cursor itself, which is an iterator over the query results; fetchall() materialises those results into a list.

Using namedtuples

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
EmployeeRecord = namedtuple("EmployeeRecord", "name, age, title, department, paygrade")

import csv

# Python 3 version: csv.reader wants a text-mode file opened with
# newline="" (the original used Python 2's "rb" mode and print statements),
# and a context manager ensures the file is closed.
with open("employees.csv", newline="") as f:
    for emp in map(EmployeeRecord._make, csv.reader(f)):
        print(emp.name, emp.title)

import sqlite3

conn = sqlite3.connect("/companydata")
cursor = conn.cursor()
cursor.execute("SELECT name, age, title, department, paygrade FROM employees")
for emp in map(EmployeeRecord._make, cursor.fetchall()):
    print(emp.name, emp.title)

Using Pandas

Pandas is a very handy way to interact with databases in Python, as it makes dumping and retrieving dataframes very easy.

1
2
3
import pandas as pd

pd.read_sql_query("SELECT * FROM stocks", conn)

datetranssymbolqtypricenewcol
02006-01-05BUYRHAT100.035.14None

SQLAlchemy

Summary of this video.

SQL best practices

  • Avoid * in queries to have full control of returned columns (e.g. in case where table changes).

Sources

=== Old notes to integrate ===

# Selecting rows from a table

SELECT col FROM table;
SELECT col1, col2 FROM table;
SELECT * FROM table LIMIT 10;
SELECT DISTINCT col_values FROM table;


# Counting

SELECT COUNT(*) FROM table;             # Count rows of table
SELECT COUNT(col) FROM table;           # Count non-missing values in col
SELECT COUNT(DISTINCT col) FROM table;  # Count distinct values in col


# Filtering

SELECT * FROM table WHERE col1 > 2010;  # Get rows for which col1 > 2010
SELECT COUNT(*) FROM table WHERE x < y  # Count number of rows for which x < y
SELECT * FROM table WHERE x > Y AND y < z
SELECT * FROM table WHERE x > Y OR y < z
SELECT * FROM table WHERE x BETWEEN a AND b     # between a and b inclusive
SELECT * FROM table WHERE x IN (a, b, c)

SELECT title FROM films
WHERE (release_year = 1994 OR release_year = 1995)
AND (certification = 'PG' OR certification = 'R');


# Filter based on results from aggregate function

SELECT release_year
FROM films
GROUP BY release_year
HAVING COUNT(title) > 10;


# Missing values

SELECT COUNT(*)
FROM people
WHERE birthdate IS NULL;

SELECT name
FROM people
WHERE birthdate IS NOT NULL;


# Wildcards

SELECT name
FROM companies
WHERE name LIKE 'Data%';        # % matches zero, one, or many characters

SELECT name
FROM companies
WHERE name LIKE 'DataC_mp';     # _ matches exactly one character

SELECT name
FROM people
WHERE name NOT LIKE 'A%';


# Aggregate functions

SELECT AVG(budget)      # Also MAX, MIN, SUM,
FROM films;


# Aliasing

SELECT MAX(budget) AS max_budget,
       MAX(duration) AS max_duration
FROM films;


# Arithmetic

SELECT COUNT(deathdate) * 100.0 / COUNT(*) AS percentage_dead
FROM people


# Order by

SELECT title
FROM films
ORDER BY release_year;

SELECT title
FROM films
ORDER BY release_year DESC;


# Group by

SELECT sex, count(*)
FROM employees
GROUP BY sex;

SELECT release_year, MAX(budget)
FROM films
GROUP BY release_year;


# Building a database
#######################


# Create tables

CREATE TABLE professors (
 firstname text,
 lastname text
);


# Alter tables

ALTER TABLE table_name
ADD COLUMN column_name data_type;

ALTER TABLE table_name
DROP COLUMN column_name;

ALTER TABLE table_name
RENAME COLUMN old_name TO new_name;

DROP TABLE table_name


# Insert values

INSERT INTO transactions (transaction_date, amount, fee)
VALUES ('2018-09-24', 5454, '30');

SELECT transaction_date, amount + CAST(fee AS integer) AS net_amount
FROM transactions;

# Migrating data

INSERT INTO target_table
SELECT DISTINCT column_names
FROM source_table;


# Integrity constraints
# 1. Attribute constraints (data types)
# 2. Key constraints (primary keys)
# 3. Referential integrity constraints (enforced through foreign keys)


# Attribute constraints

ALTER TABLE professors
ALTER COLUMN firstname
TYPE varchar(16)
USING SUBSTRING(firstname FROM 1 FOR 16)

ALTER TABLE professors
ALTER COLUMN firstname
SET NOT NULL;

ALTER TABLE universities
ADD CONSTRAINT university_shortname_unq UNIQUE(university_shortname);


# Key constraints

# Superkey: each combination of attributes that identifies rows uniquely
# Candidate key: a superkey from which no column can be removed
# Primary key: one candidate key chosen to act as primary key
# Surrogate key: artificially created key (eg due to unsuitable candidate keys)
# Foreign keys: points to the primary key of another table


ALTER TABLE organizations
RENAME COLUMN organization TO id;
ALTER TABLE organizations
ADD CONSTRAINT organization_pk PRIMARY KEY (id);

ALTER TABLE affiliations
DROP CONSTRAINT affiliations_organizations_id_fkey;

ALTER TABLE professors
ADD COLUMN ID serial

UPDATE table_name
SET new_var = CONCAT(v1, v2);

-- Add a professor_id column that references id in professors table
ALTER TABLE affiliations
ADD COLUMN professor_id integer REFERENCES professors (id);
-- Rename the organization column to organization_id
ALTER TABLE affiliations
RENAME organization TO organization_id;
-- Add a foreign key on organization_id
ALTER TABLE affiliations
ADD CONSTRAINT affiliations_organization_fkey FOREIGN KEY (organization_id) REFERENCES organizations (id);

-- Update professor_id to professors.id where firstname, lastname correspond to rows in professors
UPDATE affiliations
SET professor_id = professors.id
FROM professors
WHERE affiliations.firstname = professors.firstname AND affiliations.lastname = professors.lastname;


# Referential integrity

CREATE TABLE a (
    id integer PRIMARY KEY,
    col_a varchar(64),
    ...,
    b_id integer REFERENCES b (id) VIOLATION SETTING)
# Where violation setting is one of the following:
# ON DELETE NO ACTION:  Deleting id in b that's referenced in a throws error
# ON DELETE CASCADE:    Deleting id in b deletes references in all tables
# RESTRICT:             Similar to no action
# SET NULL              Set referencing column to null
# SET DEFAULT           Set referencing column to default


# Joins

SELECT table_a.column1, table_a.column2, table_b.column1, table_b.column2, ...
FROM table_a
JOIN table_b
ON table_a_foreign_key = table_b_primary_key
WHERE condition;