In [6]:
import pandas as pd
import pandasql as pdsql # pip install pandasql
In [67]:
%%writefile earthquake.csv
magnitude,year_2000,year_2001,year_2002,year_2003,year_2004,year_2005,year_2006,year_2007,year_2008,year_2009,year_2010,year_2011,year_2012,id
8.0 to 9.9,1.0,1.0,0.0,1.0,2.0,1.0,2.0,4.0,0.0,1.0,1.0,1.0,2.0,1
7.0 to 7.9,14.0,15.0,13.0,14.0,14.0,10.0,9.0,14.0,12.0,16.0,23.0,19.0,12.0,2
6.0 to 6.9,146.0,121.0,127.0,140.0,141.0,140.0,142.0,178.0,168.0,144.0,150.0,185.0,108.0,3
5.0 to 5.9,1344.0,1224.0,1201.0,1203.0,1515.0,1693.0,1712.0,2074.0,1768.0,1896.0,2209.0,2276.0,1401.0,4
4.0 to 4.9,8008.0,7991.0,8541.0,8462.0,10888.0,13917.0,12838.0,12078.0,12291.0,6805.0,10164.0,13315.0,9534.0,5
3.0 to 3.9,4827.0,6266.0,7068.0,7624.0,7932.0,9191.0,9990.0,9889.0,11735.0,2905.0,4341.0,2791.0,2453.0,6
2.0 to 2.9,3765.0,4164.0,6419.0,7727.0,6316.0,4636.0,4027.0,3597.0,3860.0,3014.0,4626.0,3643.0,3111.0,7
1.0 to 1.9,1026.0,944.0,1137.0,2506.0,1344.0,26.0,18.0,42.0,21.0,26.0,39.0,47.0,43.0,8
0.1 to 0.9,5.0,1.0,10.0,134.0,103.0,0.0,2.0,2.0,0.0,1.0,0.0,1.0,0.0,9
No Magnitude,3120.0,2807.0,2938.0,3608.0,2939.0,864.0,828.0,1807.0,1922.0,17.0,24.0,11.0,3.0,10
In [68]:
df = pd.read_csv('earthquake.csv')
df.head()
Out[68]:
In [70]:
# using pandas
melted = pd.melt(df, id_vars=['magnitude'],
value_vars=df.columns.difference(['magnitude', 'id']))
melted = melted.rename(columns={'variable': 'year'})
melted['year'] = melted['year'].str.replace('year_', '')
melted.head()
Out[70]:
In [10]:
# sql query pandas dataframes
pysqldf = lambda q: pdsql.sqldf(q, globals())
In [33]:
# run any valid sql query
pysqldf('select * from df limit(5)')
Out[33]:
In [66]:
# grab a number of hardcoded values
pysqldf("""
SELECT year
FROM
(
SELECT 2000 AS year
UNION SELECT 2001 UNION SELECT 2002 UNION SELECT 2003
UNION SELECT 2004 UNION SELECT 2005 UNION SELECT 2006
UNION SELECT 2007 UNION SELECT 2008 UNION SELECT 2009
UNION SELECT 2010 UNION SELECT 2011 UNION SELECT 2012
) AS yearslist
""")
Out[66]:
In [75]:
# concatenate cross product of years and all df rows
pysqldf("""
select
years.year as year,
df.*
from df
cross join(
SELECT year
FROM
(
SELECT 2000 AS year
UNION SELECT 2001 UNION SELECT 2002 UNION SELECT 2003
UNION SELECT 2004 UNION SELECT 2005 UNION SELECT 2006
UNION SELECT 2007 UNION SELECT 2008 UNION SELECT 2009
UNION SELECT 2010 UNION SELECT 2011 UNION SELECT 2012
) AS yearslist
) years
""").head()
Out[75]:
In [74]:
# get only the relevant data depending on the value of year
pysqldf("""
select
years.year as year,
df.magnitude as magnitude,
case year
when 2000 then year_2000
when 2001 then year_2001
when 2002 then year_2002
when 2003 then year_2003
when 2004 then year_2004
when 2005 then year_2005
when 2006 then year_2006
when 2007 then year_2007
when 2008 then year_2008
when 2009 then year_2009
when 2010 then year_2010
when 2011 then year_2011
when 2012 then year_2012
else null end
as number
from df
cross join(
SELECT year
FROM
(
SELECT 2000 AS year
UNION SELECT 2001 UNION SELECT 2002 UNION SELECT 2003
UNION SELECT 2004 UNION SELECT 2005 UNION SELECT 2006
UNION SELECT 2007 UNION SELECT 2008 UNION SELECT 2009
UNION SELECT 2010 UNION SELECT 2011 UNION SELECT 2012
) AS yearslist
) years
""").head()
Out[74]:
In [ ]: