In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('max_columns', 50)
In [2]:
# this would be a huge pain to load into a database
# this would be a huge pain to load into a database
# Load Mariano Rivera's season-by-season stats from a local CSV.
mo = pd.read_csv('data/mariano-rivera.csv')
mo.tail()  # last 5 rows
Out[2]:
pandas can fetch data from a URL ...
In [3]:
# PEP 8 (E731): use def rather than assigning a lambda to a name.
def clean(s):
    """Strip the leading '$' from a price string; if the string contains a
    decimal point, also drop its final character (presumably a stray
    trailing char in this dataset -- behavior preserved from the original).
    """
    stripped = s.replace('$', '')
    return stripped[:-1] if '.' in stripped else stripped

url = 'https://raw.github.com/gjreda/best-sandwiches/master/data/best-sandwiches-geocode.tsv'
# The converter runs clean() on every 'price' value, then casts to float.
sandwiches = pd.read_table(url, sep='\t', converters={'price': lambda s: float(clean(s))})
sandwiches.head(3)
Out[3]:
In [4]:
# Fetch the 3 most recent GitHub issues for the pandas repo via its JSON API.
# NOTE(review): the repo has since moved to pandas-dev/pandas; this URL may
# redirect or 404 -- verify before re-running.
gh = pd.read_json('https://api.github.com/repos/pydata/pandas/issues?per_page=3')
gh[['body', 'created_at', 'title', 'url']].head(3)
Out[4]:
You'll likely need to do some parsing though - pandas read_json doesn't do well with nested JSON yet
Possibly my favorite way to read data into pandas ...
In [5]:
# read_clipboard parses whatever is currently on the system clipboard
# (e.g. a table copied from a spreadsheet or web page).
clip = pd.read_clipboard()
clip.head()
Out[5]:
In [6]:
# All of this is basically the same as it would be with Postgres, MySQL, or any other database
# Just pass pandas a connection object and it'll take care of the rest.
# pd.read_sql is the public API; pandas.io.sql is an internal module and
# should not be imported directly.
import sqlite3
conn = sqlite3.connect('data/towed.db')
query = "SELECT * FROM towed"
# parse_dates tells pandas how the 'date' column is formatted.
towed = pd.read_sql(query, con=conn, parse_dates={'date':'%m/%d/%Y'})
towed.head()
Out[6]:
We're going to work with the towed dataset for a bit.
We've already been using .head(), but there's also .tail(). You can also use standard Python slicing.
In [7]:
# Standard Python slicing works on DataFrames: positional rows 100-104.
towed[100:105]
Out[7]:
In [8]:
# Column dtypes and non-null counts for the towed data.
towed.info() # empty string showing up as non-null value
In [9]:
# Compare with a dataset that has genuine missing values.
mo.info() # note the nulls in the Awards column
In [10]:
# count, mean, std, min, quartiles, max per numeric column.
mo.describe() # basic stats for any numeric column
Out[10]:
In [11]:
# Index by date so the label-based lookups below work.
towed.set_index('date', inplace=True)
In [12]:
# SELECT *
# FROM towed
# WHERE date = '2014-11-04'
# LIMIT 5;
# .ix was removed from pandas; .loc is the label-based equivalent.
towed.loc['2014-11-04']
Out[12]:
In [13]:
# .ix was removed from pandas; .loc does label-based row+column lookup.
towed.loc['2014-11-04', 'make'] # get a Series back (or individual values, if unique)
Out[13]:
In [14]:
# SELECT *
# FROM towed
# WHERE date = '2014-11-04';
# Move 'date' back to a regular column so we can filter with a boolean mask.
towed.reset_index(inplace=True)
towed[towed['date'] == '2014-11-04']
Out[14]:
In [15]:
# Re-index by date again for the cells that follow.
towed.set_index('date', inplace=True)
In [16]:
# Selecting several columns at once is just a matter of passing a list
# of column names; then peek at the first five rows.
selected = ['make', 'color', 'state']
towed[selected].head()
Out[16]:
In [17]:
# SELECT *
# FROM towed
# WHERE make = 'FORD'
# ORDER BY state DESC
# LIMIT 5;
# DataFrame.sort() was removed from pandas; sort_values() replaces it.
towed[towed['make'] == 'FORD'].sort_values('state', ascending=False).head()
Out[17]:
In [18]:
# SELECT *
# FROM towed
# WHERE color = 'BLK'
# AND state = 'MI';
# Combine boolean masks with & (AND); each comparison needs its own parens.
towed[(towed.color == 'BLK') & (towed.state == 'MI')]
Out[18]:
In [19]:
# SELECT *
# FROM towed
# WHERE make = 'PORS'
# OR state = 'NY'
# ORDER BY make, color DESC;
# DataFrame.sort() was removed from pandas; sort_values() takes the same
# list-of-columns / list-of-directions arguments.
towed[(towed.make == 'PORS') | (towed.state == 'NY')].sort_values(['make', 'color'], ascending=[True, False]) # no red Porsche
Out[19]:
In [20]:
# Masks can be stored in a variable and reused across cells.
m = (towed.style == '4D') & (towed.state == 'IL') # create a filter mask
In [21]:
# True = rows matching the mask above, False = everything else.
m.value_counts() # it's just a boolean series
Out[21]:
In [22]:
# SELECT make, model, color
# FROM towed
# WHERE style = '4D' AND state = 'IL
# ORDER BY date
# LIMIT 5;
towed[m][['make', 'model', 'color']].sort().head()
Out[22]:
In [23]:
# duplicated() flags rows identical (across all columns) to an earlier row.
towed[towed.duplicated()] # this would suck in SQL ... but that's also what primary keys are for
Out[23]:
In [24]:
# thousands comma in tripduration field ... that's be really annoying to load into a DB
! egrep \"[0-9],[0-9]+\" data/divvy/Divvy_Trips_2013.csv | head -n 5
In [25]:
# Peek at the raw stations file before loading it.
! head -n 5 data/divvy/Divvy_Stations_2013.csv
In [26]:
# Peek at the raw weather file.
! head -n 5 data/weather.csv
In [27]:
from datetime import datetime

# PEP 8 (E731): use def rather than assigning a lambda to a name.
def to_datetime(d):
    """Parse a 'MM/DD/YYYY HH:MM' string into a datetime object."""
    return datetime.strptime(d, '%m/%d/%Y %H:%M')
In [28]:
# if you know the datetime format, it's much, much faster to explicitly parse it
# see here: https://gist.github.com/gjreda/7433f5f70299610d9b6b
# thousands=',' strips the comma thousands-separator from tripduration.
trips = pd.read_csv('data/divvy/Divvy_Trips_2013.csv',
converters={'starttime': to_datetime,'stoptime': to_datetime},
thousands=',')
stations = pd.read_csv('data/divvy/Divvy_Stations_2013.csv')
In [29]:
# Check dtypes and null counts after the explicit parsing above.
trips.info()
In [30]:
# The stations lookup table we'll join onto trips below.
stations.info()
In [31]:
# CREATE TABLE divvy (
# SELECT *
# FROM trips
# LEFT JOIN stations
# ON trips.from_station_name = stations.name
# );
# Attach origin-station details to every trip.
divvy = pd.merge(trips, stations, how='left', left_on='from_station_name', right_on='name')
divvy.info()
In [32]:
# SELECT trip_id, ... divvy.name AS name_origin ... stations.name AS name_dest
# FROM divvy
# INNER JOIN stations
# ON divvy.to_station_name = stations.name
# Second join adds destination-station details; suffixes disambiguate the
# column names that overlap with the first join (e.g. name_origin/name_dest).
divvy = pd.merge(divvy, stations, how='inner', left_on='to_station_name', right_on='name', suffixes=['_origin', '_dest'])
divvy.info()
In [33]:
# Two small demo frames for the concat example below.
# np.NaN was removed in NumPy 2.0; the canonical spelling is np.nan.
df1 = pd.DataFrame({'a': np.random.randn(3), 'b': np.square(range(100,103))})
df2 = pd.DataFrame({'a': [np.nan, 11, 99], 'b': np.random.randn(3)})
In [34]:
# SELECT a, b
# FROM df1
# UNION
# SELECT a, b
# FROM df2;
# takes a list, so you can pass 2+ dataframes
# note that the indexes are the same
# NOTE(review): concat keeps duplicate rows, so it's closer to UNION ALL than UNION.
pd.concat([df1, df2])
Out[34]:
In [35]:
# it's precise to the minute - want to see counts by nearest hour
# Series.order() was removed from pandas; sort_values() (ascending by
# default, like order was) replaces it.
divvy.starttime.value_counts().sort_values()[:3]
Out[35]:
In [36]:
# I don't want to just extract the hour from the datetime ... NEAREST hour
from datetime import timedelta

def round_time(dt, mins=60):
    """
    Rounds a datetime object to its nearest `mins`-minute boundary
    (nearest hour by default).
    (e.g. 2014-02-01 10:45:13 becomes 2014-02-01 11:00:00)
    """
    # Shift forward by half the interval so plain truncation below
    # rounds to the *nearest* boundary instead of always down.
    dt += timedelta(minutes=mins/2.)
    # Truncate to the boundary. Also drop microseconds -- the original
    # version left them behind, so sub-second timestamps rounded to a
    # value slightly past the boundary.
    dt -= timedelta(minutes=dt.minute % mins, seconds=dt.second,
                    microseconds=dt.microsecond)
    return dt
In [37]:
# Bucket every trip's start/stop timestamp to the nearest hour for grouping.
divvy['starthour'] = divvy.starttime.apply(round_time)
divvy['stophour'] = divvy.stoptime.apply(round_time)
In [38]:
# Sanity-check the rounding side by side.
divvy[['starttime', 'starthour']].head(3)
Out[38]:
pandas groupby draws largely from Hadley Wickham's paper "The Split-Apply-Combine Strategy for Data Analysis" (it's a good paper; you should read it).
Count vs Size
In [39]:
# SELECT starthour, count(1)
# FROM divvy
# GROUP BY starthour
# size() counts all rows per group (count() would count non-nulls per column).
divvy.groupby('starthour').size()
Out[39]:
In [40]:
# SELECT starthour, avg(tripduration)
# FROM divvy
# GROUP BY starthour
# ORDER BY avg(tripduration) DESC
# LIMIT 5
# Series.order() was removed from pandas; sort_values() replaces it.
divvy.groupby('starthour')['tripduration'].mean().sort_values(ascending=False)[:5]
Out[40]:
In [41]:
# Different aggregations per column (and several for one column). String
# names replace the deprecated practice of passing numpy callables to agg().
divvy.groupby('usertype').agg({'birthyear': 'nunique', 'tripduration': ['mean', 'median']})
Out[41]:
In [42]:
# number of trips started by starthour
# A Series plots as a line chart by default.
divvy.groupby('starthour').size().plot(figsize=(16,8))
Out[42]:
In [43]:
# distribution of tripduration
divvy.tripduration.hist(figsize=(16,8), bins=1000)
# Zoom the x-axis to trips under 10,000 seconds; trailing ; suppresses the repr.
plt.xlim(0, 10000);
In [44]:
# How many trips lasted exactly N seconds?
duration_counts = divvy.tripduration.value_counts()
# Name the index and the values so the plots below label themselves.
duration_counts.index.name = 'seconds'
duration_counts.name = 'trips'
duration_counts.head()
Out[44]:
In [45]:
# Convert the seconds index to minutes and index by it.
df = duration_counts.reset_index()
df['minutes'] = df.seconds/60.
df.set_index('minutes', inplace=True)
# Bare DataFrame.sort() (sort by index) was removed from pandas;
# sort_index() is the explicit replacement.
df.sort_index(inplace=True)
In [46]:
# Cumulative share of all trips completed within N minutes.
(df.trips.cumsum() / df.trips.sum()).plot(figsize=(16,8))
plt.xlim(0, 60)
plt.yticks(np.arange(0, 1.1, 0.1));
In [47]:
plt.figure(figsize=(9, 18))
# Series.order() was removed from pandas; sort_values() replaces it.
divvy.groupby('birthyear').size().sort_values().plot(kind='barh')
Out[47]:
In [48]:
# Group by a derived weekday value without adding a column to the frame.
divvy.groupby(divvy['starttime'].apply(lambda d: d.dayofweek))['trip_id'].count().plot(kind='bar')
plt.title('Divvy trips by weekday') # 0 = Monday ...
plt.xlabel('Weekday')
plt.ylabel('# of trips started');
Plotting multiple lines and subplotting.
In [49]:
# Daily trip counts broken out by gender: a Series with a
# (startdate, gender) MultiIndex.
divvy['startdate'] = divvy.starthour.apply(lambda d: d.date())
by_gender = divvy.groupby(['startdate', 'gender']).size()
In [50]:
# First few (startdate, gender) group sizes.
by_gender.head()
Out[50]:
In [51]:
# unstack(1) pivots the 'gender' index level into columns.
by_gender.unstack(1).head()
Out[51]:
pandas wins here.
In [52]:
# SELECT startdate
# , COUNT(IF(gender = 'Female', 1, NULL))
# , COUNT(IF(gender = 'Male', 1, NULL))
# FROM divvy
# GROUP BY startdate
# LIMIT 5;
# Same pivot as the by_gender cell above, in a single expression.
divvy.groupby(['startdate', 'gender']).size().unstack(1).head()
Out[52]:
In [53]:
# One line per gender column on the same axes.
by_gender.unstack(1).plot(figsize=(16,8))
Out[53]:
In [54]:
# subplots=True draws a separate panel per usertype column.
divvy.groupby(['startdate', 'usertype']).size().unstack(1).plot(figsize=(16,8), subplots=True);
In [55]:
# You can group by arbitrary Series (aligned on the index), not just
# column names.
weekdays = divvy['starttime'].apply(lambda d: d.dayofweek)
hours = divvy['starttime'].apply(lambda d: d.hour)
by_weekday_hour = divvy.groupby([weekdays, hours])['trip_id'].count()
by_weekday_hour.index.names = ['weekday', 'hour'] # rename MultiIndex
In [56]:
# One line per weekday (unstacked level 0), hour of day on the x-axis.
by_weekday_hour.unstack(0).plot(figsize=(16,8))
plt.title('Trips by weekday hour')
plt.ylabel('# of trips started')
plt.xlabel('Hour of day')
plt.xticks(range(24))
plt.xlim(0, 23);