Pivoting columns to rows


In [6]:
import pandas as pd
import pandasql as pdsql  # pip install pandasql

In [67]:
%%writefile earthquake.csv
magnitude,year_2000,year_2001,year_2002,year_2003,year_2004,year_2005,year_2006,year_2007,year_2008,year_2009,year_2010,year_2011,year_2012,id
8.0 to 9.9,1.0,1.0,0.0,1.0,2.0,1.0,2.0,4.0,0.0,1.0,1.0,1.0,2.0,1
7.0 to 7.9,14.0,15.0,13.0,14.0,14.0,10.0,9.0,14.0,12.0,16.0,23.0,19.0,12.0,2
6.0 to 6.9,146.0,121.0,127.0,140.0,141.0,140.0,142.0,178.0,168.0,144.0,150.0,185.0,108.0,3
5.0 to 5.9,1344.0,1224.0,1201.0,1203.0,1515.0,1693.0,1712.0,2074.0,1768.0,1896.0,2209.0,2276.0,1401.0,4
4.0 to 4.9,8008.0,7991.0,8541.0,8462.0,10888.0,13917.0,12838.0,12078.0,12291.0,6805.0,10164.0,13315.0,9534.0,5
3.0 to 3.9,4827.0,6266.0,7068.0,7624.0,7932.0,9191.0,9990.0,9889.0,11735.0,2905.0,4341.0,2791.0,2453.0,6
2.0 to 2.9,3765.0,4164.0,6419.0,7727.0,6316.0,4636.0,4027.0,3597.0,3860.0,3014.0,4626.0,3643.0,3111.0,7
1.0 to 1.9,1026.0,944.0,1137.0,2506.0,1344.0,26.0,18.0,42.0,21.0,26.0,39.0,47.0,43.0,8
0.1 to 0.9,5.0,1.0,10.0,134.0,103.0,0.0,2.0,2.0,0.0,1.0,0.0,1.0,0.0,9
No Magnitude,3120.0,2807.0,2938.0,3608.0,2939.0,864.0,828.0,1807.0,1922.0,17.0,24.0,11.0,3.0,10


Writing earthquake.csv

In [68]:
df = pd.read_csv('earthquake.csv')
df.head()


Out[68]:
magnitude year_2000 year_2001 year_2002 year_2003 year_2004 year_2005 year_2006 year_2007 year_2008 year_2009 year_2010 year_2011 year_2012 id
0 8.0 to 9.9 1.0 1.0 0.0 1.0 2.0 1.0 2.0 4.0 0.0 1.0 1.0 1.0 2.0 1
1 7.0 to 7.9 14.0 15.0 13.0 14.0 14.0 10.0 9.0 14.0 12.0 16.0 23.0 19.0 12.0 2
2 6.0 to 6.9 146.0 121.0 127.0 140.0 141.0 140.0 142.0 178.0 168.0 144.0 150.0 185.0 108.0 3
3 5.0 to 5.9 1344.0 1224.0 1201.0 1203.0 1515.0 1693.0 1712.0 2074.0 1768.0 1896.0 2209.0 2276.0 1401.0 4
4 4.0 to 4.9 8008.0 7991.0 8541.0 8462.0 10888.0 13917.0 12838.0 12078.0 12291.0 6805.0 10164.0 13315.0 9534.0 5

In [70]:
# using pandas
melted = pd.melt(df, id_vars=['magnitude'],
                 value_vars=df.columns.difference(['magnitude', 'id']))
melted = melted.rename(columns={'variable': 'year'})
melted['year'] = melted['year'].str.replace('year_', '')
melted.head()


Out[70]:
magnitude year value
0 8.0 to 9.9 2000 1.0
1 7.0 to 7.9 2000 14.0
2 6.0 to 6.9 2000 146.0
3 5.0 to 5.9 2000 1344.0
4 4.0 to 4.9 2000 8008.0

SQL


In [10]:
# sql query pandas dataframes
pysqldf = lambda q: pdsql.sqldf(q, globals())

In [33]:
# run any valid sql query
pysqldf('select * from df limit(5)')


Out[33]:
magnitude year_2000 year_2001 year_2002 year_2003 year_2004 year_2005 year_2006 year_2007 year_2008 year_2009 year_2010 year_2011 year_2012 id
0 8.0 to 9.9 1.0 1.0 0.0 1.0 2.0 1.0 2.0 4.0 0.0 1.0 1.0 1.0 2.0 1
1 7.0 to 7.9 14.0 15.0 13.0 14.0 14.0 10.0 9.0 14.0 12.0 16.0 23.0 19.0 12.0 2
2 6.0 to 6.9 146.0 121.0 127.0 140.0 141.0 140.0 142.0 178.0 168.0 144.0 150.0 185.0 108.0 3
3 5.0 to 5.9 1344.0 1224.0 1201.0 1203.0 1515.0 1693.0 1712.0 2074.0 1768.0 1896.0 2209.0 2276.0 1401.0 4
4 4.0 to 4.9 8008.0 7991.0 8541.0 8462.0 10888.0 13917.0 12838.0 12078.0 12291.0 6805.0 10164.0 13315.0 9534.0 5

In [66]:
# grab a number of hardcoded values
pysqldf("""
SELECT year
    FROM 
    (
        SELECT 2000 AS year
        UNION SELECT 2001 UNION SELECT 2002 UNION SELECT 2003
        UNION SELECT 2004 UNION SELECT 2005 UNION SELECT 2006
        UNION SELECT 2007 UNION SELECT 2008 UNION SELECT 2009
        UNION SELECT 2010 UNION SELECT 2011 UNION SELECT 2012
    ) AS yearslist
""")


Out[66]:
year
0 2000
1 2001
2 2002
3 2003
4 2004
5 2005
6 2006
7 2007
8 2008
9 2009
10 2010
11 2011
12 2012

In [75]:
# concatenate cross product of years and all df rows
pysqldf("""
select
    years.year as year,
    df.*
from df
cross join(
    SELECT year
    FROM 
    (
        SELECT 2000 AS year
        UNION SELECT 2001 UNION SELECT 2002 UNION SELECT 2003
        UNION SELECT 2004 UNION SELECT 2005 UNION SELECT 2006
        UNION SELECT 2007 UNION SELECT 2008 UNION SELECT 2009
        UNION SELECT 2010 UNION SELECT 2011 UNION SELECT 2012
    ) AS yearslist
    ) years
""").head()


Out[75]:
year magnitude year_2000 year_2001 year_2002 year_2003 year_2004 year_2005 year_2006 year_2007 year_2008 year_2009 year_2010 year_2011 year_2012 id
0 2000 8.0 to 9.9 1.0 1.0 0.0 1.0 2.0 1.0 2.0 4.0 0.0 1.0 1.0 1.0 2.0 1
1 2001 8.0 to 9.9 1.0 1.0 0.0 1.0 2.0 1.0 2.0 4.0 0.0 1.0 1.0 1.0 2.0 1
2 2002 8.0 to 9.9 1.0 1.0 0.0 1.0 2.0 1.0 2.0 4.0 0.0 1.0 1.0 1.0 2.0 1
3 2003 8.0 to 9.9 1.0 1.0 0.0 1.0 2.0 1.0 2.0 4.0 0.0 1.0 1.0 1.0 2.0 1
4 2004 8.0 to 9.9 1.0 1.0 0.0 1.0 2.0 1.0 2.0 4.0 0.0 1.0 1.0 1.0 2.0 1

In [74]:
# get only the relevant data depending on the value of year
pysqldf("""
select
    years.year as year,
    df.magnitude as magnitude,
    case year
        when 2000 then year_2000
        when 2001 then year_2001
        when 2002 then year_2002
        when 2003 then year_2003
        when 2004 then year_2004
        when 2005 then year_2005
        when 2006 then year_2006
        when 2007 then year_2007
        when 2008 then year_2008
        when 2009 then year_2009
        when 2010 then year_2010
        when 2011 then year_2011
        when 2012 then year_2012
        else null end
        as number
from df
cross join(
    SELECT year
    FROM 
    (
        SELECT 2000 AS year
        UNION SELECT 2001 UNION SELECT 2002 UNION SELECT 2003
        UNION SELECT 2004 UNION SELECT 2005 UNION SELECT 2006
        UNION SELECT 2007 UNION SELECT 2008 UNION SELECT 2009
        UNION SELECT 2010 UNION SELECT 2011 UNION SELECT 2012
    ) AS yearslist
    ) years
""").head()


Out[74]:
year magnitude number
0 2000 8.0 to 9.9 1.0
1 2001 8.0 to 9.9 1.0
2 2002 8.0 to 9.9 0.0
3 2003 8.0 to 9.9 1.0
4 2004 8.0 to 9.9 2.0

In [ ]: