Contents

Tidy data in Pandas

Based on excellent materials materials from Daniel Chen here and talk here

1
2
3
from imports import *
%load_ext autoreload
%autoreload 2

Creating tidy data

Definition by Hadley Wickham here:

  1. Each variable is a column
  2. Each observation is a row
  3. Each type of observational unit is a table.

Columns are values

1
2
pew = pd.read_csv('https://raw.githubusercontent.com/chendaniely/pydatadc_2018-tidy/master/data/pew.csv')
pew.head()

religion<$10k$10-20k$20-30k$30-40k$40-50k$50-75k$75-100k$100-150k>150kDon't know/refused
0Agnostic27346081761371221098496
1Atheist12273752357073597476
2Buddhist27213034335862395354
3Catholic41861773267063811169497926331489
4Don’t know/refused151415111035211718116
1
2
3
4
5
6
# Create a single income column
pew.melt(
    id_vars='religion',
    var_name='income',
    value_name='count'
).head()

religionincomecount
0Agnostic<$10k27
1Atheist<$10k12
2Buddhist<$10k27
3Catholic<$10k418
4Don’t know/refused<$10k15
1
2
billboard = pd.read_csv('https://raw.githubusercontent.com/chendaniely/pydatadc_2018-tidy/master/data/billboard.csv')
billboard.columns
Index(['year', 'artist', 'track', 'time', 'date.entered', 'wk1', 'wk2', 'wk3',
       'wk4', 'wk5', 'wk6', 'wk7', 'wk8', 'wk9', 'wk10', 'wk11', 'wk12',
       'wk13', 'wk14', 'wk15', 'wk16', 'wk17', 'wk18', 'wk19', 'wk20', 'wk21',
       'wk22', 'wk23', 'wk24', 'wk25', 'wk26', 'wk27', 'wk28', 'wk29', 'wk30',
       'wk31', 'wk32', 'wk33', 'wk34', 'wk35', 'wk36', 'wk37', 'wk38', 'wk39',
       'wk40', 'wk41', 'wk42', 'wk43', 'wk44', 'wk45', 'wk46', 'wk47', 'wk48',
       'wk49', 'wk50', 'wk51', 'wk52', 'wk53', 'wk54', 'wk55', 'wk56', 'wk57',
       'wk58', 'wk59', 'wk60', 'wk61', 'wk62', 'wk63', 'wk64', 'wk65', 'wk66',
       'wk67', 'wk68', 'wk69', 'wk70', 'wk71', 'wk72', 'wk73', 'wk74', 'wk75',
       'wk76'],
      dtype='object')
1
2
3
4
5
6
7
8
idvars = billboard.columns[~bb.columns.str.startswith('wk')]

tidy_billboard = billboard.melt(
    id_vars=idvars,
    var_name='week', 
    value_name='rating'
)
tidy_billboard.head()

yearartisttracktimedate.enteredweekrating
020002 PacBaby Don't Cry (Keep...4:222000-02-26wk187.0
120002Ge+herThe Hardest Part Of ...3:152000-09-02wk191.0
220003 Doors DownKryptonite3:532000-04-08wk181.0
320003 Doors DownLoser4:242000-10-21wk176.0
42000504 BoyzWobble Wobble3:352000-04-15wk157.0

Multiple variables stored in one column

1
2
ebola = pd.read_csv('https://raw.githubusercontent.com/chendaniely/pydatadc_2018-tidy/master/data/country_timeseries.csv')
ebola.head()

DateDayCases_GuineaCases_LiberiaCases_SierraLeoneCases_NigeriaCases_SenegalCases_UnitedStatesCases_SpainCases_MaliDeaths_GuineaDeaths_LiberiaDeaths_SierraLeoneDeaths_NigeriaDeaths_SenegalDeaths_UnitedStatesDeaths_SpainDeaths_Mali
01/5/20152892776.0NaN10030.0NaNNaNNaNNaNNaN1786.0NaN2977.0NaNNaNNaNNaNNaN
11/4/20152882775.0NaN9780.0NaNNaNNaNNaNNaN1781.0NaN2943.0NaNNaNNaNNaNNaN
21/3/20152872769.08166.09722.0NaNNaNNaNNaNNaN1767.03496.02915.0NaNNaNNaNNaNNaN
31/2/2015286NaN8157.0NaNNaNNaNNaNNaNNaNNaN3496.0NaNNaNNaNNaNNaNNaN
412/31/20142842730.08115.09633.0NaNNaNNaNNaNNaN1739.03471.02827.0NaNNaNNaNNaNNaN
1
2
3
4
5
6
# Tidying ebola step by step
tidy_ebola = ebola.melt(id_vars=['Date', 'Day'], value_name='Cases')
tidy_ebola[['Statistic', 'Country']] = (tidy_ebola.variable
                                        .str.split('_', expand=True))
tidy_ebola.drop('variable', axis=1, inplace=True)
tidy_ebola.head()

DateDayCasesStatisticCountry
01/5/20152892776.0CasesGuinea
11/4/20152882775.0CasesGuinea
21/3/20152872769.0CasesGuinea
31/2/2015286NaNCasesGuinea
412/31/20142842730.0CasesGuinea
1
2
3
4
5
6
7
# Using a pipeline
(ebola
 .melt(id_vars=['Date', 'Day'], value_name='Cases')
 .assign(Statistic = lambda df: df.variable.str.split('_', expand=True)[0])
 .assign(Country = lambda df: df.variable.str.split('_', expand=True)[1])
 .drop('variable', axis=1)
).head()

DateDayCasesStatisticCountry
01/5/20152892776.0CasesGuinea
11/4/20152882775.0CasesGuinea
21/3/20152872769.0CasesGuinea
31/2/2015286NaNCasesGuinea
412/31/20142842730.0CasesGuinea

Variables are stored in both rows and columns

1
2
weather = pd.read_csv('https://raw.githubusercontent.com/chendaniely/pydatadc_2018-tidy/master/data/weather.csv')
weather.head()

idyearmonthelementd1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29d30d31
0MX1700420101tmaxNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN27.8NaN
1MX1700420101tminNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN14.5NaN
2MX1700420102tmaxNaN27.324.1NaNNaNNaNNaNNaNNaNNaN29.7NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN29.9NaNNaNNaNNaNNaNNaNNaNNaN
3MX1700420102tminNaN14.414.4NaNNaNNaNNaNNaNNaNNaN13.4NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN10.7NaNNaNNaNNaNNaNNaNNaNNaN
4MX1700420103tmaxNaNNaNNaNNaN32.1NaNNaNNaNNaN34.5NaNNaNNaNNaNNaN31.1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
(weather
 .melt(
     id_vars=['id', 'year', 'month', 'element'],
     var_name='day',
     value_name='temp'
 )
 .pivot_table(
     index=['id', 'year', 'month', 'day'],
     columns='element',
     values='temp'
 )
 .reset_index()
 .assign(day = lambda df: df.day.str.extract('(\d+)'))
).head()

elementidyearmonthdaytmaxtmin
0MX17004201013027.814.5
1MX17004201021129.713.4
2MX1700420102227.314.4
3MX17004201022329.910.7
4MX1700420102324.114.4

Multiple types of observational units are stored in a single table

1
2
3
tidy_billboard.head()
tidy_bb = tidy_billboard
tidy_bb.head()

yearartisttracktimedate.enteredweekrating
020002 PacBaby Don't Cry (Keep...4:222000-02-26wk187.0
120002Ge+herThe Hardest Part Of ...3:152000-09-02wk191.0
220003 Doors DownKryptonite3:532000-04-08wk181.0
320003 Doors DownLoser4:242000-10-21wk176.0
42000504 BoyzWobble Wobble3:352000-04-15wk157.0
1
2
3
4
5
6
bb_songs = (
    tidy_bb[['year', 'artist', 'track', 'time', 'date.entered']]
    .drop_duplicates()
    .assign(id = lambda df: range(len(df)))
)
bb_songs.head()

yearartisttracktimedate.enteredid
020002 PacBaby Don't Cry (Keep...4:222000-02-260
120002Ge+herThe Hardest Part Of ...3:152000-09-021
220003 Doors DownKryptonite3:532000-04-082
320003 Doors DownLoser4:242000-10-213
42000504 BoyzWobble Wobble3:352000-04-154
1
2
3
4
5
6
bb_ratings = (
    tidy_bb
    .merge(bb_songs)
    .loc[:, ['week', 'rating', 'id']]
)
bb_ratings.head()

weekratingid
0wk187.00
1wk282.00
2wk372.00
3wk477.00
4wk587.00