In [2]:
import numpy as np
import pandas as pd
In [3]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data
Out[3]:
In [4]:
# get np array
data.values
Out[4]:
In [5]:
data.index
Out[5]:
In [6]:
data[1]
Out[6]:
In [7]:
data[1:3]
Out[7]:
In [8]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data
Out[8]:
In [9]:
data['b']
Out[9]:
In [10]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])
data
Out[10]:
In [11]:
data[5]
Out[11]:
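One subtlety worth flagging here (a small sketch using the same data as above): with an explicit integer index, plain indexing such as data[5] uses the explicit labels, while a slice such as data[1:3] falls back to the implicit positional index. The loc and iloc attributes, shown later in this section, make the intent unambiguous.
data[5]        # explicit label -> 0.5
data[1:3]      # implicit positions 1 and 2 -> 0.50, 0.75 (labels 5 and 3)
data.loc[5]    # always label-based
data.iloc[1]   # always position-based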
In [12]:
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population
Out[12]:
In [13]:
population['California']
Out[13]:
In [14]:
population['California':'Illinois'] # inclusive
Out[14]:
In [15]:
pd.Series(5, index=[100, 200, 300])
Out[15]:
In [16]:
pd.Series({2:'a', 1:'b', 3:'c'})
Out[16]:
In [17]:
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])
Out[17]:
In [18]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area
Out[18]:
In [19]:
states = pd.DataFrame({'population': population,
                       'area': area})
states
Out[19]:
In [20]:
states.index
Out[20]:
In [21]:
states.columns
Out[21]:
In [22]:
states['area']
Out[22]:
In [23]:
pd.DataFrame(population, columns=['population'])
Out[23]:
In [24]:
data = [{'a': i, 'b': 2 * i} for i in range(3)]
pd.DataFrame(data)
Out[24]:
In [25]:
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])
Out[25]:
In [26]:
pd.DataFrame({'population': population, 'area': area})
Out[26]:
In [27]:
pd.DataFrame(np.random.rand(3, 2),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])
Out[27]:
In [28]:
pd.DataFrame(np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')]))
Out[28]:
In [29]:
ind = pd.Index([2, 3, 5, 7, 11])
print(ind)
print(ind[1])
print(ind[::2])
# ind[1] = 0 would fail: an Index is immutable
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
print(indA & indB)  # intersection
print(indA | indB)  # union
print(indA ^ indB)  # symmetric difference
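Newer pandas versions deprecate using &, |, and ^ on Index objects as set operations (they now behave as elementwise logical operators); the explicit methods are preferred. A sketch with the same indA and indB:
print(indA.intersection(indB))
print(indA.union(indB))
print(indA.symmetric_difference(indB))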
In [30]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(data['b'])
print('a' in data)
print(data.keys())
print(list(data.items()))  # iteritems() was removed in newer pandas; items() is equivalent
data['e'] = 1.25
print(data)
In [31]:
print(data['a':'c'])                      # slicing by explicit index (inclusive of 'c')
print(data[0:2])                          # slicing by implicit integer position
print(data[(data > 0.3) & (data < 0.8)])  # masking
print(data[['a', 'e']])                   # fancy indexing
In [32]:
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
# loc attribute allows indexing and slicing that always references the explicit index
print(data.loc[1])
print(data.loc[1:3])
print()
# iloc attribute allows indexing and slicing that always references the implicit Python-style index
print(data.iloc[1])
print(data.iloc[1:3])
In [33]:
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data
Out[33]:
In [34]:
print(data['area'])
print(data.area)
print(data['area'] is data.area)
print(data['pop'] is data.pop)  # False: DataFrame has a pop() method, so data.pop refers to that method
In [35]:
data['density'] = data['pop'] / data['area']
data
Out[35]:
In [36]:
data.values
Out[36]:
In [37]:
data.T
Out[37]:
In [38]:
data.values[0]
Out[38]:
In [39]:
data['area']
Out[39]:
In [40]:
data.iloc[:3, :2]
Out[40]:
In [41]:
data.loc[:'Illinois', :'pop']
Out[41]:
In [42]:
# The ix indexer allows a hybrid of these two approaches
# (note: .ix was deprecated and later removed from pandas)
data.ix[:3, :'pop']
Out[42]:
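Since .ix was removed in pandas 1.0, this cell fails on current versions. A rough equivalent of the same hybrid selection, assuming the data DataFrame defined above, chains positional row selection with label-based column slicing:
data.iloc[:3].loc[:, :'pop']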
In [43]:
data.loc[data.density > 100, ['pop', 'density']]
Out[43]:
In [44]:
print(data['Florida':'Illinois'])  # slicing by explicit index
print(data[1:3])                   # slicing by implicit integer position
print(data[data.density > 100])    # masking
In [45]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
ser
Out[45]:
In [46]:
df = pd.DataFrame(rng.randint(0, 10, (3, 4)), columns=['A', 'B', 'C', 'D'])
df
Out[46]:
In [47]:
np.exp(ser)
Out[47]:
In [48]:
np.sin(df * np.pi / 4)
Out[48]:
In [49]:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                  'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127}, name='population')
population / area
Out[49]:
In [50]:
area.index | population.index
Out[50]:
In [51]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A+B
Out[51]:
In [52]:
A.add(B, fill_value=0)
Out[52]:
In [53]:
A = pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=list('AB'))
A
Out[53]:
In [54]:
B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('BAC'))
B
Out[54]:
In [55]:
A + B
Out[55]:
In [56]:
A.stack()
Out[56]:
In [57]:
A.stack().mean()
Out[57]:
In [58]:
A.add(B, fill_value=A.stack().mean())
Out[58]:
In [59]:
A = rng.randint(10, size=(3, 4))
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]
Out[59]:
In [60]:
df.subtract(df['R'], axis=0)
Out[60]:
In [61]:
halfrow = df.iloc[0, ::2]
halfrow
Out[61]:
In [62]:
df - halfrow
Out[62]:
In [63]:
vals1 = np.array([1, None, 3, 4])
vals1
Out[63]:
In [64]:
# This dtype=object means that the best common type representation NumPy could infer
# for the contents of the array is that they are Python objects. While this kind of
# object array is useful for some purposes, any operations on the data will be done
# at the Python level, with much more overhead than the typically fast operations seen
# for arrays with native types
for dtype in ['object', 'int']:
    print("dtype =", dtype)
    %timeit np.arange(1E6, dtype=dtype).sum()
    print()
In [65]:
vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype
Out[65]:
In [66]:
print(1 + np.nan)
print(0 * np.nan)
print(vals2.sum(), vals2.min(), vals2.max())  # aggregates over NaN yield NaN
In [67]:
print(np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2))
In [68]:
print(pd.Series([1, np.nan, 2, None]))
x = pd.Series(range(2), dtype=int)
print(x)
x[0] = None  # the integer series is upcast to float, and None becomes NaN
print(x)
In [69]:
data = pd.Series([1, np.nan, 'hello', None])
print(data.isnull())
print(data.notnull())
In [70]:
data.dropna()
Out[70]:
In [71]:
df = pd.DataFrame([[1, np.nan, 2],
                   [2, 3, 5],
                   [np.nan, 4, 6]])
df
Out[71]:
In [72]:
df.dropna()
Out[72]:
In [73]:
df.dropna(axis='columns')
Out[73]:
In [74]:
df[3] = np.nan
df
Out[74]:
In [75]:
df.dropna(axis='columns', how='all')
Out[75]:
In [76]:
df.dropna(axis='rows', thresh=3) # rows with at least 3 non-nan vals
Out[76]:
In [77]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data
Out[77]:
In [78]:
data.fillna(0)
Out[78]:
In [79]:
# forward-fill
data.fillna(method='ffill')
Out[79]:
In [80]:
# backward-fill
data.fillna(method='bfill')
Out[80]:
In [81]:
df
Out[81]:
In [82]:
df.fillna(method='ffill', axis=1)
Out[82]:
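Newer pandas deprecates the method= argument to fillna(); the dedicated ffill() and bfill() methods express the same operations. A sketch assuming the same data and df as above:
data.ffill()       # same as data.fillna(method='ffill')
data.bfill()       # same as data.fillna(method='bfill')
df.ffill(axis=1)   # same as df.fillna(method='ffill', axis=1)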
In [83]:
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]
pop = pd.Series(populations, index=index)
pop
Out[83]:
In [84]:
pop[('California', 2010):('Texas', 2000)]
Out[84]:
In [85]:
pop[[i for i in pop.index if i[1] == 2010]]
Out[85]:
In [86]:
index = pd.MultiIndex.from_tuples(index)
index
Out[86]:
In [87]:
pop = pop.reindex(index)
pop
Out[87]:
In [88]:
pop[:, 2010]
Out[88]:
In [89]:
pop_df = pop.unstack()
pop_df
Out[89]:
In [90]:
pop_df.stack()
Out[90]:
In [91]:
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
pop_df
Out[91]:
In [92]:
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()
Out[92]:
In [93]:
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
df
Out[93]:
In [94]:
data = {('California', 2000): 33871648,
        ('California', 2010): 37253956,
        ('Texas', 2000): 20851820,
        ('Texas', 2010): 25145561,
        ('New York', 2000): 18976457,
        ('New York', 2010): 19378102}
pd.Series(data)
Out[94]:
In [95]:
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])
Out[95]:
In [96]:
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])
Out[96]:
In [97]:
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])
Out[97]:
In [98]:
pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
              labels=[[0, 0, 1, 1], [0, 1, 0, 1]])  # in newer pandas the 'labels' argument is named 'codes'
Out[98]:
In [99]:
pop.index.names = ['state', 'year']
pop
Out[99]:
In [100]:
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])
# mock some data
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data
Out[100]:
In [101]:
health_data['Guido']
Out[101]:
In [102]:
pop
Out[102]:
In [103]:
pop['California', 2000]
Out[103]:
In [104]:
pop['California']
Out[104]:
In [105]:
pop.loc['California':'New York']
Out[105]:
In [106]:
pop[:, 2000]
Out[106]:
In [107]:
pop[pop > 22000000]
Out[107]:
In [108]:
pop[['California', 'Texas']]
Out[108]:
In [109]:
health_data
Out[109]:
In [113]:
health_data['Guido', 'HR']
Out[113]:
In [111]:
health_data.iloc[:2, :2]
Out[111]:
In [114]:
health_data.loc[:, ('Bob', 'HR')]
Out[114]:
In [115]:
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]
Out[115]:
In [116]:
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data
Out[116]:
In [117]:
try:
    data['a':'b']
except KeyError as e:
    print(type(e))
    print(e)
In [118]:
data = data.sort_index()
data
Out[118]:
In [119]:
data['a':'b']
Out[119]:
In [120]:
pop.unstack(level=0)
Out[120]:
In [121]:
pop.unstack(level=1)
Out[121]:
In [122]:
pop.unstack().stack()
Out[122]:
In [123]:
# optionally specify the name of the data for the column representation
pop_flat = pop.reset_index(name='population')
pop_flat
Out[123]:
In [124]:
pop_flat.set_index(['state', 'year'])
Out[124]:
In [125]:
health_data
Out[125]:
In [126]:
data_mean = health_data.mean(level='year')  # aggregate over the 'visit' index level (see the groupby form below)
data_mean
Out[126]:
In [132]:
data_mean.mean(axis=1, level='type')
Out[132]:
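In newer pandas releases the level= argument to aggregations like mean() is deprecated and then removed; the same results can be obtained by grouping on the index (a sketch assuming the health_data frame from above):
data_mean = health_data.groupby(level='year').mean()
data_mean.T.groupby(level='type').mean().T   # group the columns by 'type'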
In [1]:
import numpy as np
import pandas as pd
def make_df(cols, ind):
    """Quickly make a DataFrame"""
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, ind)
# example DataFrame
make_df('ABC', range(3))
Out[1]:
In [2]:
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
np.concatenate([x, y, z])
Out[2]:
In [3]:
x = [[1, 2],
     [3, 4]]
np.concatenate([x, x], axis=1)
Out[3]:
In [4]:
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])
Out[4]:
In [5]:
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
print(df1)
print(df2)
print(pd.concat([df1, df2]))
In [6]:
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
print(df3)
print(df4)
print(pd.concat([df3, df4], axis='columns'))  # axis=1 also works; 'col' is not a valid alias
In [7]:
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
y.index = x.index # make duplicate indices!
print(x)
print(y)
print(pd.concat([x, y]))
In [8]:
try:
    # verify_integrity=True raises an error on duplicate indices
    pd.concat([x, y], verify_integrity=True)
except ValueError as e:
    print("ValueError:", e)
In [9]:
print(x)
print(y)
print(pd.concat([x, y], ignore_index=True))
In [10]:
print(x)
print(y)
print(pd.concat([x, y], keys=['x', 'y']))
In [11]:
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
print(df5)
print(df6)
print(pd.concat([df5, df6]))
In [12]:
print(df5)
print(df6)
print(pd.concat([df5, df6], join_axes=[df5.columns]))  # join_axes was removed in pandas 1.0; see below
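On current pandas the join_axes result can be reproduced by reindexing the concatenated columns (a sketch with the df5 and df6 defined above):
print(pd.concat([df5, df6]).reindex(columns=df5.columns))
# or keep only the shared columns:
print(pd.concat([df5, df6], join='inner'))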
In [13]:
print(df1)
print(df2)
print(df1.append(df2))  # returns a copy; append() was later removed from pandas in favor of pd.concat
In [14]:
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
                    'hire_date': [2004, 2008, 2012, 2014]})
print(df1)
print(df2)
In [15]:
df3 = pd.merge(df1, df2)
df3
Out[15]:
In [16]:
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
                    'supervisor': ['Carly', 'Guido', 'Steve']})
print(df3)
print(df4)
print(pd.merge(df3, df4))
In [17]:
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting',
                              'Engineering', 'Engineering', 'HR', 'HR'],
                    'skills': ['math', 'spreadsheets', 'coding', 'linux',
                               'spreadsheets', 'organization']})
print(df1)
print(df5)
print(pd.merge(df1, df5))
In [18]:
print(df1)
print(df2)
print(pd.merge(df1, df2, on='employee'))
In [19]:
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'salary': [70000, 80000, 120000, 90000]})
print(df1)
print(df3)
print(pd.merge(df1, df3, left_on="employee", right_on="name"))
In [20]:
pd.merge(df1, df3, left_on="employee", right_on="name").drop('name', axis=1)
Out[20]:
In [21]:
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
print(df1a)
print(df2a)
In [22]:
print(df1a)
print(df2a)
print(pd.merge(df1a, df2a, left_index=True, right_index=True))
In [23]:
print(df1a)
print(df2a)
# For convenience, DataFrames implement the join() method
# which performs a merge that defaults to joining on indices:
print(df1a.join(df2a))
In [24]:
print(df1a)
print(df3)
print(pd.merge(df1a, df3, left_index=True, right_on='name'))
In [25]:
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],
                    'food': ['fish', 'beans', 'bread']},
                   columns=['name', 'food'])
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
                    'drink': ['wine', 'beer']},
                   columns=['name', 'drink'])
print(df6)
print(df7)
print(pd.merge(df6, df7))
In [26]:
pd.merge(df6, df7, how='inner')
Out[26]:
In [27]:
pd.merge(df6, df7, how='outer')
Out[27]:
In [28]:
pd.merge(df6, df7, how='left') # Using how='right' works in a similar manner.
Out[28]:
In [29]:
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'rank': [1, 2, 3, 4]})
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'rank': [3, 1, 4, 2]})
print(df8)
print(df9)
print(pd.merge(df8, df9, on="name"))
In [30]:
print(pd.merge(df8, df9, on="name", suffixes=["_L", "_R"]))
In [31]:
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv
In [32]:
pop = pd.read_csv('state-population.csv')
areas = pd.read_csv('state-areas.csv')
abbrevs = pd.read_csv('state-abbrevs.csv')
print(pop.head())
print(areas.head())
print(abbrevs.head())
In [34]:
merged = pd.merge(pop, abbrevs, how='outer',
                  left_on='state/region', right_on='abbreviation')
merged = merged.drop('abbreviation', axis=1)  # drop duplicate info
merged.head()
Out[34]:
In [35]:
merged.isnull().any()
Out[35]:
In [36]:
merged[merged['population'].isnull()].head()
Out[36]:
In [37]:
merged.loc[merged['state'].isnull(), 'state/region'].unique()
Out[37]:
In [38]:
merged.loc[merged['state/region'] == 'PR', 'state'] = 'Puerto Rico'
merged.loc[merged['state/region'] == 'USA', 'state'] = 'United States'
merged.isnull().any()
Out[38]:
In [41]:
final = pd.merge(merged, areas, on='state', how='left')
final.head()
Out[41]:
In [42]:
final.isnull().any()
Out[42]:
In [43]:
final['state'][final['area (sq. mi)'].isnull()].unique()
Out[43]:
In [44]:
final.dropna(inplace=True)
final.head()
Out[44]:
In [45]:
final['area (sq. mi)'].isnull().any()
Out[45]:
In [46]:
data2010 = final.query("year == 2010 & ages == 'total'")
data2010.head()
Out[46]:
In [47]:
data2010.set_index('state', inplace=True)
density = data2010['population'] / data2010['area (sq. mi)']
density.sort_values(ascending=False, inplace=True)
density.head()
Out[47]:
In [48]:
density.tail()
Out[48]:
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
planets = sns.load_dataset('planets')
planets.shape
Out[1]:
In [2]:
planets.head()
Out[2]:
In [3]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
ser
Out[3]:
In [4]:
ser.mean(), ser.sum()
Out[4]:
In [5]:
df = pd.DataFrame({'A': rng.rand(5), 'B': rng.rand(5)})
df
Out[5]:
In [6]:
df.mean()
Out[6]:
In [7]:
df.mean(axis='columns')
Out[7]:
In [8]:
planets.dropna().describe()
Out[8]:
In [10]:
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data': range(6)}, columns=['key', 'data'])
df
Out[10]:
In [11]:
df.groupby('key').sum()
Out[11]:
In [12]:
planets.groupby('method')
Out[12]:
In [13]:
planets.groupby('method')['orbital_period']
Out[13]:
In [14]:
planets.groupby('method')['orbital_period'].median()
Out[14]:
In [15]:
for (method, group) in planets.groupby('method'):
    print("{0:30s} shape={1}".format(method, group.shape))
In [17]:
planets.groupby('method')['year'].describe().unstack()
Out[17]:
In [18]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                  columns=['key', 'data1', 'data2'])
df
Out[18]:
In [19]:
df.groupby('key').aggregate(['min', np.median, max])
Out[19]:
In [20]:
df.groupby('key').aggregate({'data1': 'min', 'data2': 'max'})
Out[20]:
In [21]:
print(df.groupby('key').std())
print(df.groupby('key').filter(lambda x: x['data2'].std() > 4))
In [22]:
df.groupby('key').transform(lambda x: x - x.mean())
Out[22]:
In [26]:
def norm_by_data2(x):
    # x is a DataFrame of group values
    x['data1'] /= x['data2'].sum()
    return x
print(df.groupby('key').apply(norm_by_data2))
In [27]:
L=[0,1,0,1,2,0]
print(df)
print(df.groupby(L).sum())
In [28]:
df2 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
print(df2)
print(df2.groupby(mapping).sum())
In [29]:
df2.groupby(str.lower).mean()
Out[29]:
In [30]:
df2.groupby([str.lower, mapping]).mean()
Out[30]:
In [31]:
decade = 10 * (planets['year'] // 10)
decade = decade.astype(str) + 's'
decade.name = 'decade'
planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)
Out[31]:
In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()
Out[42]:
In [43]:
titanic.groupby('sex')['survived'].mean()
Out[43]:
In [45]:
titanic.groupby(['sex', 'class'])['survived'].mean().unstack()
Out[45]:
In [46]:
titanic.pivot_table('survived', index='sex', columns='class')
Out[46]:
In [47]:
age = pd.cut(titanic['age'], [0, 18, 80])
titanic.pivot_table('survived', ['sex', age], 'class')
Out[47]:
In [50]:
fare = pd.qcut(titanic['fare'], 2) # split into 2 equal-size groups
titanic.pivot_table('survived', ['sex', age], [fare, 'class'])
Out[50]:
In [51]:
titanic.pivot_table(index='sex', columns='class',
                    aggfunc={'survived': np.sum, 'fare': 'mean'})
Out[51]:
In [52]:
titanic.pivot_table('survived', index='sex', columns='class', margins=True)
Out[52]:
In [53]:
# !curl -O https://raw.githubusercontent.com/jakevdp/data-CDCbirths/master/births.csv
In [54]:
births = pd.read_csv('births.csv')
births.head()
Out[54]:
In [55]:
births['decade'] = 10 * (births['year'] // 10)
births.pivot_table('births', index='decade', columns='gender', aggfunc='sum')
Out[55]:
In [56]:
%matplotlib inline
import matplotlib.pyplot as plt
sns.set() # use Seaborn styles
births.pivot_table('births', index='year', columns='gender', aggfunc='sum').plot()
plt.ylabel('total births per year')
Out[56]:
In [57]:
quartiles = np.percentile(births['births'], [25, 50, 75])
mu = quartiles[1]
sig = 0.74 * (quartiles[2] - quartiles[0])  # robust estimate of the std dev (0.74 * IQR for a Gaussian)
births = births.query('(births > @mu - 5 * @sig) & (births < @mu + 5 * @sig)')
births['day'] = births['day'].astype(int)
births.index = pd.to_datetime(10000 * births.year +
                              100 * births.month +
                              births.day, format='%Y%m%d')
births['dayofweek'] = births.index.dayofweek
births.pivot_table('births', index='dayofweek', columns='decade', aggfunc='mean').plot()
plt.gca().set_xticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
plt.ylabel('mean births by day')
In [58]:
births_by_date = births.pivot_table('births', [births.index.month, births.index.day])
births_by_date.head()
Out[58]:
In [59]:
from datetime import datetime
births_by_date.index = [datetime(2012, month, day) for (month, day) in births_by_date.index]  # pd.datetime was removed in newer pandas
births_by_date.head()
Out[59]:
In [60]:
fig, ax = plt.subplots(figsize=(12, 4))
births_by_date.plot(ax=ax)
Out[60]:
In [1]:
import numpy as np
import pandas as pd
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
names = pd.Series(data)
names
Out[1]:
In [2]:
names.str.capitalize()
Out[2]:
In [3]:
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
                   'Eric Idle', 'Terry Jones', 'Michael Palin'])
monte.str.lower()
Out[3]:
In [4]:
monte.str.len()
Out[4]:
In [5]:
monte.str.startswith('T')
Out[5]:
In [6]:
monte.str.split()
Out[6]:
In [8]:
monte.str.extract('([A-Za-z]+)')
Out[8]:
In [9]:
monte.str.findall('^[^AEIOU].*[^aeiou]$')
Out[9]:
In [10]:
monte.str[0:3]
Out[10]:
In [11]:
monte.str.split().str.get(-1)
Out[11]:
In [12]:
full_monte = pd.DataFrame({'name': monte,
                           'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|C|D']})
full_monte
Out[12]:
In [13]:
full_monte['info'].str.get_dummies('|')
Out[13]:
In [14]:
# !curl -O http://openrecipes.s3.amazonaws.com/openrecipes.txt
In [16]:
# read the entire file into a Python array
with open('openrecipes.txt', 'r') as f:
    # Extract each line
    data = (line.strip() for line in f)
    # Reformat so each line is the element of a list
    data_json = "[{0}]".format(','.join(data))
# read the result as a JSON
recipes = pd.read_json(data_json)
recipes.shape
Out[16]:
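The manual line-joining above works, but pandas can also read newline-delimited JSON directly; a sketch assuming the same file downloaded above:
recipes = pd.read_json('openrecipes.txt', lines=True)
recipes.shape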
In [17]:
recipes.head()
Out[17]:
In [18]:
recipes.ingredients.str.len().describe()
Out[18]:
In [19]:
recipes.name[np.argmax(recipes.ingredients.str.len())]
Out[19]:
In [20]:
recipes.description.str.contains('[Bb]reakfast').sum()
Out[20]:
In [21]:
recipes.ingredients.str.contains('[Cc]innamon').sum()
Out[21]:
In [22]:
recipes.ingredients.str.contains('[Cc]inamon').sum()
Out[22]:
In [23]:
import re
spice_list = ['salt', 'pepper', 'oregano', 'sage', 'parsley',
              'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin']
spice_df = pd.DataFrame(dict((spice, recipes.ingredients.str.contains(spice, re.IGNORECASE))
                             for spice in spice_list))
spice_df.head()
Out[23]:
In [26]:
spice_df.sum(axis=0)
Out[26]:
In [28]:
selection = spice_df.query('salt & pepper & cumin')
len(selection)
Out[28]:
In [29]:
recipes.name[selection.index]
Out[29]:
In [1]:
from datetime import datetime
datetime(year=2015, month=7, day=4)
Out[1]:
In [2]:
from dateutil import parser
date = parser.parse("4th of July, 2015")
date
Out[2]:
In [3]:
date.strftime('%A')
Out[3]:
In [4]:
import numpy as np
date = np.array('2015-07-04', dtype=np.datetime64)
date
Out[4]:
In [5]:
date + np.arange(12)
Out[5]:
In [6]:
np.datetime64('2015-07-04')
Out[6]:
In [7]:
np.datetime64('2015-07-04 12:00')
Out[7]:
In [8]:
np.datetime64('2015-07-04 12:59:59.50', 'ns')
Out[8]:
In [9]:
import pandas as pd
date = pd.to_datetime("4th of July, 2015")
date
Out[9]:
In [10]:
date.strftime('%A')
Out[10]:
In [11]:
date + pd.to_timedelta(np.arange(12), 'D')
Out[11]:
In [12]:
index = pd.DatetimeIndex(['2014-07-04', '2014-08-04', '2015-07-04', '2015-08-04'])
data = pd.Series([0, 1, 2, 3], index=index)
data
Out[12]:
In [13]:
data['2014-07-04':'2015-07-04']
Out[13]:
In [14]:
data['2015']
Out[14]:
In [15]:
dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July, 2015',
                        '2015-Jul-6', '07-07-2015', '20150708'])
dates
Out[15]:
In [16]:
dates.to_period('D')
Out[16]:
In [17]:
dates - dates[0]
Out[17]:
In [18]:
pd.date_range('2015-07-03', '2015-07-10')
Out[18]:
In [19]:
pd.date_range('2015-07-03', periods=8)
Out[19]:
In [20]:
pd.date_range('2015-07-03', periods=8, freq='H')
Out[20]:
In [21]:
pd.period_range('2015-07-03', periods=8, freq='M')
Out[21]:
In [22]:
pd.timedelta_range(0, periods=10, freq='H')
Out[22]:
In [23]:
pd.timedelta_range(0, periods=8, freq='2H30T')
Out[23]:
In [26]:
from pandas.tseries.offsets import BDay
pd.date_range('2015-07-01', periods=10, freq=BDay())
Out[26]:
In [1]:
# import pandas_datareader as pdr
# goog = data.DataReader('GOOG', start='2004', end='2016', data_source='google')
# goog = pdr.get_data_yahoo('GOOG')
import pandas as pd
goog = pd.read_csv('goog.csv')
goog['Date'] = pd.to_datetime(goog['Date'])
# goog.sort_values(by='Date', inplace=True)
goog.set_index('Date', inplace=True)
goog.sort_index(inplace=True)
goog.head()
Out[1]:
In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
goog = goog['Close']
goog.plot()
Out[2]:
In [3]:
goog.plot(alpha=0.5, style='-')
goog.resample('BA').mean().plot(style=':')
goog.asfreq('BA').plot(style='--')
plt.legend(['input', 'resample', 'asfreq'], loc='upper left')
Out[3]:
In [4]:
# resample() is fundamentally a data aggregation
goog.resample('BA').mean()
# goog['2014'].mean() == 75.825213
Out[4]:
In [5]:
# asfreq() is fundamentally a data selection
goog.asfreq('BA')
# goog['2004-12-31'] == 96.29999..
Out[5]:
In [6]:
fig, ax = plt.subplots(2, sharex=True)
data = goog.iloc[:10]
data.asfreq('D').plot(ax=ax[0], marker='o')
data.asfreq('D', method='bfill').plot(ax=ax[1], style='-o')
data.asfreq('D', method='ffill').plot(ax=ax[1], style='--o')
ax[1].legend(["back-fill", "forward-fill"]);
In [7]:
data.asfreq('D')
Out[7]:
In [8]:
fig, ax = plt.subplots(3, sharey=True)
# apply a frequency to the data
goog = goog.asfreq('D', method='pad')
goog.plot(ax=ax[0])
goog.shift(900).plot(ax=ax[1]) # shift the data
goog.tshift(900).plot(ax=ax[2]) # shift the date index
# legends and annotations
local_max = pd.to_datetime('2007-11-05')
offset = pd.Timedelta(900, 'D')
ax[0].legend(['input'], loc=2)
ax[0].get_xticklabels()[4].set(weight='heavy', color='red')
ax[0].axvline(local_max, alpha=0.3, color='red')
ax[1].legend(['shift(900)'], loc=2)
ax[1].get_xticklabels()[4].set(weight='heavy', color='red')
ax[1].axvline(local_max + offset, alpha=0.3, color='red')
ax[2].legend(['tshift(900)'], loc=2)
ax[2].get_xticklabels()[1].set(weight='heavy', color='red')
ax[2].axvline(local_max + offset, alpha=0.3, color='red');
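tshift() has since been deprecated and removed in newer pandas; passing a freq to shift() moves the index instead of the values, so a rough modern equivalent of the third panel (assuming the same goog series) is:
goog.shift(900, freq='D').plot()   # shift the date index by 900 days, like tshift(900)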
In [15]:
ROI = 100 * (goog.tshift(-365) / goog - 1)
ROI.plot()
plt.ylabel('% Return on Investment');
In [16]:
rolling = goog.rolling(365, center=True)
data = pd.DataFrame({'input': goog,
                     'one-year rolling_mean': rolling.mean(),
                     'one-year rolling_std': rolling.std()})
ax = data.plot(style=['-', '--', ':'])
ax.lines[0].set_alpha(0.3)
In [19]:
# !curl -o FremontBridge.csv https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD
In [20]:
data = pd.read_csv('FremontBridge.csv', index_col='Date', parse_dates=True)
data.head()
Out[20]:
In [22]:
data.columns = ['west', 'east']
data['total'] = data.eval('west + east')
data.dropna().describe()
Out[22]:
In [23]:
data.plot(alpha=0.5)
plt.ylabel('hourly bicycle count')
Out[23]:
In [25]:
weekly = data.resample('W').sum()
weekly.plot(style=[':', '--', '-'])
plt.ylabel('weekly bicycle count')
Out[25]:
In [26]:
daily = data.resample('D').sum()
daily.rolling(30, center=True).sum().plot(style=[':', '--', '-'])
plt.ylabel('rolling mean hourly count')
Out[26]:
In [28]:
daily.rolling(50, center=True, win_type='gaussian').sum(std=10).plot(style=[':', '--', '-'])
Out[28]:
In [30]:
import numpy as np
by_time = data.groupby(data.index.time).mean()
hourly_ticks = 4 * 60 * 60 * np.arange(6)
by_time.plot(xticks=hourly_ticks, style=[':', '--', '-'])
Out[30]:
In [31]:
data.groupby(data.index.time).mean()
Out[31]:
In [32]:
data.index
Out[32]:
In [33]:
by_weekday = data.groupby(data.index.dayofweek).mean()
by_weekday.index = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
by_weekday.plot(style=[':', '--', '-'])
Out[33]:
In [35]:
weekend = np.where(data.index.weekday < 5, 'Weekday', 'Weekend')
by_time = data.groupby([weekend, data.index.time]).mean()
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
# .ix was removed in pandas 1.0; .loc selects by label on the first index level
by_time.loc['Weekday'].plot(ax=ax[0], title='weekday', style=[':', '--', '-'], xticks=hourly_ticks)
by_time.loc['Weekend'].plot(ax=ax[1], title='weekend', style=[':', '--', '-'], xticks=hourly_ticks)
Out[35]:
In [36]:
by_time
Out[36]:
In [1]:
import numpy as np
rng = np.random.RandomState(42)
x = rng.rand(1_000_000)  # rand() needs an integer size; 1E6 is a float
y = rng.rand(1_000_000)
%timeit x + y
In [2]:
%timeit np.fromiter((xi + yi for xi, yi in zip(x, y)), dtype=x.dtype, count=len(x))
In [3]:
mask = (x > 0.5) & (y < 0.5)  # every intermediate step is explicitly allocated in memory
# numexpr evaluates the compound expression without allocating full-size temporary arrays
import numexpr
mask_numexpr = numexpr.evaluate('(x > 0.5) & (y < 0.5)')
np.allclose(mask, mask_numexpr)
Out[3]:
In [5]:
import pandas as pd
nrows, ncols = 100000, 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols)) for _ in range(4))
In [6]:
%timeit df1 + df2 + df3 + df4
In [7]:
%timeit pd.eval('df1 + df2 + df3 + df4')
In [8]:
np.allclose(df1 + df2 + df3 + df4, pd.eval('df1 + df2 + df3 + df4'))
Out[8]:
In [9]:
df1, df2, df3, df4, df5 = (pd.DataFrame(rng.randint(0, 1000, (100, 3))) for i in range(5))
In [10]:
result1 = -df1 * df2 / (df3 + df4) - df5
result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')
np.allclose(result1, result2)
Out[10]:
In [11]:
result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
result2 = pd.eval('df1 < df2 <= df3 != df4')
np.allclose(result1, result2)
Out[11]:
In [12]:
result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
np.allclose(result1, result2)
Out[12]:
In [13]:
result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')
np.allclose(result1, result3)
Out[13]:
In [14]:
result1 = df2.T[0] + df3.iloc[1]
result2 = pd.eval('df2.T[0] + df3.iloc[1]')
np.allclose(result1, result2)
Out[14]:
In [15]:
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head()
Out[15]:
In [16]:
result1 = (df['A'] + df['B']) / (df['C'] - 1)
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
np.allclose(result1, result2)
Out[16]:
In [17]:
result3 = df.eval('(A + B) / (C - 1)')
np.allclose(result1, result3)
Out[17]:
In [18]:
df.eval('D = (A + B) / C', inplace=True)
df.head()
Out[18]:
In [19]:
df.eval('D = (A - B) / C', inplace=True)
df.head()
Out[19]:
In [20]:
column_mean = df.mean(axis=1)
result1 = df['A'] + column_mean
result2 = df.eval('A + @column_mean')
np.allclose(result1, result2)
Out[20]:
In [21]:
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
np.allclose(result1, result2)
Out[21]:
In [22]:
result2 = df.query('A < 0.5 and B < 0.5')
np.allclose(result1, result2)
Out[22]:
In [23]:
Cmean = df['C'].mean()
result1 = df[(df.A < Cmean) & (df.B < Cmean)]
result2 = df.query('A < @Cmean and B < @Cmean')
np.allclose(result1, result2)
Out[23]:
In [ ]:
# The benefit of eval()/query() is mainly the memory they save on large arrays, plus the sometimes cleaner syntax they offer.
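A rough way to gauge whether eval()/query() will help on a given problem is to check whether the arrays involved are large relative to the CPU cache (roughly several megabytes or more); for example, for the df defined above:
df.values.nbytes   # approximate size of the underlying array in bytes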