In [1]:
import pandas as pd
import numpy as np
print("Pandas version:",pd.__version__)
print("Numpy version:",np.__version__)
import sys
print ("Python version:", sys.version)
In [2]:
data = pd.Series([5, 10, 15, 20, 25])
data
Out[2]:
In [3]:
# to access the values of pandas series
data.values
Out[3]:
In [4]:
# access the index of pandas series
data.index
Out[4]:
In [5]:
# access data via associated index
print(data[1])
print('--------')
print(data[3:5])
In [6]:
#index need not be an integer
dt = pd.Series([1, 2, 3, 4, 5],
index=['a', 'b', 'c', 'd', 'e'])
dt
Out[6]:
In [7]:
# access the data as expected
dt['c']
Out[7]:
In [8]:
# Series as specialized dictionary
population_dict = {'California': 38332521,
'Texas': 26448193,
'New York': 19651127,
'Florida': 19552860,
'Illinois': 12882135}
population = pd.Series(population_dict)
population
Out[8]:
In [9]:
population['California']
Out[9]:
In [10]:
# unlike dictionary, Series support slicing
population['California' : 'Illinous']
Out[10]:
In [11]:
# 1- from a list or numpy array
pd.Series([1, 2, 4])
Out[11]:
In [12]:
# 2a- from a dictionary
pd.Series({1:'a', 5:'c' , 2:'g'}, index=[1,5,2])
Out[12]:
In [13]:
# 2b- from a dictionary
pd.Series({'a': 1, 'c':5, 'b':67}, index=['a','b','c'])
Out[13]:
In [14]:
# lets create another pandas series object first
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area
Out[14]:
In [15]:
# use dictionary to create a DataFrame from population and area Series objects
states = pd.DataFrame({'population': population , 'area':area})
states
Out[15]:
In [16]:
# access the index
states.index
Out[16]:
In [17]:
# access column names
states.columns
Out[17]:
In [18]:
# access values
states.values
Out[18]:
In [19]:
# access a column
states['area']
Out[19]:
In [20]:
# 1- from a Series object
pd.DataFrame(population, columns=['population'])
Out[20]:
In [21]:
# 2- from a list of dictionary
data = [{'a': 1, 'b':2}
for i in range(3)]
pd.DataFrame(data)
Out[21]:
In [22]:
# 3- from a dictionary of Series object
pd.DataFrame({'population':population , 'area':area})
Out[22]:
In [23]:
# 4- from 2d Numpy array
pd.DataFrame(np.random.rand(3,2),
columns=['aaa', 'bbb'],
index=['a','b','c'])
Out[23]:
In [24]:
ind = pd.Index([1,2,5,16])
ind
Out[24]:
In [25]:
# access index values
ind[3]
Out[25]:
In [26]:
print(ind.size, ind.shape, ind.ndim, ind.dtype)
In [27]:
# indexes are immutable. code below would throw an error message
#ind[3] = 0
In [28]:
data = pd.Series([0.25, 0.5, 0.75, 1.0],
index=['a', 'b', 'c', 'd'])
data
Out[28]:
In [29]:
data['b']
Out[29]:
In [30]:
'a' in data
Out[30]:
In [31]:
1.0 in data
Out[31]:
In [32]:
data.keys()
Out[32]:
In [33]:
list(data.items())
Out[33]:
In [34]:
# assign a new index and an index value
data['e'] = 1.25
data
Out[34]:
In [35]:
# slicing explicitly
data['b':'e']
Out[35]:
In [36]:
# slicing implicitly
data[0:2]
Out[36]:
In [37]:
# masking
data[(data > 0.3) & (data < 0.8)]
Out[37]:
In [38]:
# fancy indexing
data[['a', 'e']]
Out[38]:
In [39]:
data.loc['a']
Out[39]:
In [40]:
data.loc[['a', 'c']]
Out[40]:
In [41]:
data.loc['a':'d']
Out[41]:
In [42]:
data.iloc[1]
Out[42]:
In [43]:
data.iloc[1:3]
Out[43]:
In [44]:
area = pd.Series({'California': 423967, 'Texas': 695662,
'New York': 141297, 'Florida': 170312,
'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127, 'Florida': 19552860,
'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'population':pop})
data
Out[44]:
In [45]:
# access the individual Series that make up the column
data['area']
Out[45]:
In [46]:
# equivalent code
data.area
Out[46]:
In [47]:
data.area is data['area']
Out[47]:
In [48]:
data.area == data['area']
Out[48]:
In [49]:
data['density'] = data['population'] / data['area']
data
Out[49]:
In [50]:
data.values
Out[50]:
In [51]:
# 2D DataFrame
data.T
Out[51]:
In [52]:
data.values[0]
Out[52]:
In [53]:
data.values[1]
Out[53]:
In [54]:
data['area']
Out[54]:
In [55]:
data
Out[55]:
In [56]:
# iloc
data.iloc[1]
Out[56]:
In [57]:
data.iloc[1:3]
Out[57]:
In [58]:
data.iloc[1:3, 0:2]
Out[58]:
In [59]:
data.loc['Florida']
Out[59]:
In [60]:
data.loc[['Florida', 'Illinois']]
Out[60]:
In [61]:
data.loc[['Florida','Illinois']][['area']]
Out[61]:
In [62]:
data.loc['Florida':'Illinois', :'population']
Out[62]:
In [63]:
# ix indexer is hybrid of loc and iloc
data.ix[1:3 , :'population']
Out[63]:
In [64]:
data.ix[1:3 , ['area', 'population']]
Out[64]:
In [65]:
# use masking and fancy indexing with loc
data.loc[data['density'] > 100 , ['area', 'density']]
Out[65]:
In [66]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
ser
Out[66]:
In [67]:
df = pd.DataFrame(rng.randint(0, 10, (3,4)),
columns=['A','B','C','D'])
df
Out[67]:
In [68]:
np.exp(ser)
Out[68]:
In [69]:
def expo(x):
y = np.e ** x
return y
In [70]:
expo(6)
Out[70]:
In [71]:
np.sin(df * np.pi / 4)
Out[71]:
In [72]:
# index alignment
area = pd.Series({'California': 423967, 'Texas': 695662,
'Alaska':1723337})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127, })
In [73]:
pop / area
Out[73]:
In [74]:
# any missing values are filled in with NaN by default in Python
A = pd.Series([2, 4, 6], index=[0,1,2])
B = pd.Series([1, 3, 5], index=[1,2,3])
A + B
Out[74]:
In [75]:
# use fill_value
A.add(B, fill_value=0)
Out[75]:
In [76]:
# index aligment in DataFrames
A = pd.DataFrame(rng.randint(0, 20, (2,2)),
columns=list('AB'))
B = pd.DataFrame(rng.randint(0, 10, (3,3)),
columns=list('BAC'))
A
Out[76]:
In [77]:
B
Out[77]:
In [78]:
A + B
Out[78]:
In [79]:
# fill the missing values with the mean of all values in A
fill = A.stack().mean()
A.add(B, fill_value=fill)
Out[79]:
In [80]:
index = [('California', 2000), ('California', 2010),
('New York', 2000), ('New York', 2010),
('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
18976457, 19378102,
20851820, 25145561]
In [81]:
index = pd.MultiIndex.from_tuples(index)
index
Out[81]:
In [82]:
pop = pop.reindex(index)
pop
Out[82]:
In [83]:
# all data for which the second index is 2010
pop[:,2010]
Out[83]:
In [84]:
pop.loc['California']
Out[84]:
In [85]:
pop.ix['California', 2010]
Out[85]:
In [86]:
pop.unstack()
Out[86]:
In [87]:
pop_df = pd.DataFrame({'total': pop,
'under18': [9267089, 9284094,
4687374, 4318033,
5906301, 6879014]})
pop_df
Out[87]:
In [88]:
pop_df[pop_df['under18'] > 6000000]
Out[88]:
In [89]:
pop.unstack().reset_index()
Out[89]:
In [90]:
pop = pd.read_csv('data/state-population.csv')
areas = pd.read_csv('data/state-areas.csv')
abbrevs = pd.read_csv('data/state-abbrevs.csv')
In [91]:
print(pop.head()); print(areas.head()); print(abbrevs.head())
In [92]:
merged = pd.merge(pop, abbrevs, how='outer', left_on='state/region', right_on='abbreviation')
merged.head()
Out[92]:
In [93]:
merged = merged.drop('abbreviation', 1)
In [94]:
merged.head()
Out[94]:
In [95]:
merged.isnull().any()
Out[95]:
In [96]:
# number of nulls
merged.isnull().sum()
Out[96]:
In [97]:
merged[merged['population'].isnull()].head()
Out[97]:
In [98]:
merged.loc[merged['state'].isnull(), 'state/region'].unique()
Out[98]:
In [99]:
merged.loc[merged['state/region']=='PR', 'state'] = 'Puerto Rico'
In [100]:
merged.loc[merged['state/region']=='USA', 'state'] = 'United States'
In [101]:
merged.isnull().sum()
Out[101]:
In [102]:
final = pd.merge(merged, areas, on='state', how='left')
final.head()
Out[102]:
In [103]:
final.isnull().sum()
Out[103]:
In [104]:
final.dropna(inplace=True)
In [105]:
final.head()
Out[105]:
In [106]:
data2010 = final.query("year==2010 & ages == 'total'" )
data2010.head()
Out[106]:
In [107]:
df = pd.DataFrame({'A' : np.random.randint(0, 10, size=4),
'B' : np.random.randint(0, 10, size=4)})
df
Out[107]:
In [108]:
df['A'].sum()
Out[108]:
In [109]:
df['B'].mean()
Out[109]:
In [110]:
df.mean(axis='rows')
Out[110]:
In [111]:
# product of all items
df.prod()
Out[111]:
In [112]:
import seaborn as sns
planets = sns.load_dataset('planets')
planets.shape
Out[112]:
In [113]:
planets.head()
Out[113]:
In [114]:
planets.dropna().describe()
Out[114]:
In [115]:
planets.describe()
Out[115]:
In [116]:
df = pd.DataFrame({'key':['A','B','C','A','B','C'],
'data':range(6)}, columns=['key', 'data'])
df
Out[116]:
In [117]:
df.groupby('key')
Out[117]:
In [118]:
df.groupby('key').sum()
Out[118]:
In [119]:
df.groupby('key')['data'].median()
Out[119]:
In [120]:
df
Out[120]:
In [121]:
df.groupby('key').aggregate([sum, max, np.median])
Out[121]:
In [122]:
planets.groupby('method')['orbital_period'].median()
Out[122]:
In [123]:
planets.groupby('method')['year'].describe()
Out[123]:
In [124]:
len(pd.unique(planets.method))
Out[124]:
In [125]:
pd.unique(planets.method).shape
Out[125]:
In [126]:
planets.groupby('method').sum()
Out[126]:
In [127]:
planets.groupby('method')[['year', 'mass']].sum()
Out[127]:
In [128]:
planets.groupby('method').aggregate([sum, np.mean])
Out[128]:
In [129]:
%matplotlib inline
planets.groupby('method').sum().plot(kind='bar')
Out[129]:
In [130]:
import matplotlib.pyplot as plt
%matplotlib inline
In [131]:
planets.groupby('year').sum()['number']
Out[131]:
In [132]:
plt.figure(figsize=(12, 5))
plt.plot(planets.groupby('year').sum()['number'])
plt.style.use('fivethirtyeight')
plt.title('Number of planets discovered by year')
Out[132]:
In [133]:
planets.head()
Out[133]:
In [134]:
planets.boxplot(column=['distance'],
figsize=(12, 10),
by=['method'])
Out[134]:
In [135]:
planets[['distance', 'year']].head()
Out[135]:
In [136]:
plt.figure(figsize=(13, 7))
plt.plot(planets.year, planets.distance, 'o');
In [137]:
import seaborn as sns
In [138]:
# regression
sns.lmplot(x='year', y='distance', data=planets, size=8);
Grouping example
In [139]:
decade = 10 * (planets['year'] // 10)
decade = decade.astype(str) + 's'
decade.name = 'decade'
In [140]:
planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)
Out[140]:
In [141]:
titanic = sns.load_dataset('titanic')
titanic.head()
Out[141]:
In [142]:
titanic.shape
Out[142]:
In [143]:
# number of missing data for each column
titanic.isnull().sum()
Out[143]:
In [144]:
titanic.groupby('sex')[['survived']].mean()
Out[144]:
In [145]:
titanic.groupby(['sex', 'class'])[['survived']].aggregate('mean').unstack()
Out[145]:
In [146]:
# pivot table
titanic.pivot_table('survived', index='sex', columns='class')
Out[146]:
In [147]:
age = pd.cut(titanic['age'], [0, 18, 80])
titanic.pivot_table('survived', ['sex', age], 'class')
Out[147]:
In [148]:
fare = pd.qcut(titanic['fare'], 2)
titanic.pivot_table('survived', ['sex', age], [fare, 'class'])
Out[148]:
In [149]:
titanic.pivot_table('survived', index='sex', columns='class', margins=True)
Out[149]:
In [150]:
births = pd.read_csv('data//births.csv')
births.head()
Out[150]:
In [151]:
births['decade'] = 10 * (births['year'] // 10)
births.head()
Out[151]:
In [152]:
births.pivot_table('births', index='year',columns='gender', aggfunc='sum' ).plot()
plt.ylabel('total births per year');
In [153]:
# sigma clipping to remove outliars
quartiles = np.percentile(births['births'], [25,50,75])
quartiles
Out[153]:
In [154]:
mu = quartiles[1]
mu
Out[154]:
In [155]:
sig = 0.74 * (quartiles[2] - quartiles[0])
sig
Out[155]:
In [156]:
births = births.query('(births > @mu - 5 * @sig) & (births < @mu + 5 * @sig)')
In [157]:
# set day column to integer
births['day'] = births['day'].astype(int)
In [158]:
births.info()
In [159]:
births.index = pd.to_datetime(10000 * births.year +
100 * births.month +
births.day,
format='%Y%m%d')
births['dayofweek'] = births.index.dayofweek
births.head()
Out[159]:
In [160]:
births.pivot_table('births', index='dayofweek', columns='decade', aggfunc='mean').plot()
plt.gca().set_xticklabels(['Monday', 'Tuesday', 'Wed', 'Thursday', 'Fri', 'Sat', 'Sun'])
plt.ylabel('mean births by day');
In [161]:
births_by_day = births.pivot_table('births', [births.index.month, births.index.day])
births_by_day.head()
Out[161]:
In [162]:
births_by_day.index = [pd.datetime(2012, month, day) for (month, day) in births_by_day.index]
In [163]:
births_by_day.head()
Out[163]:
In [164]:
fig, ax = plt.subplots(figsize = (12, 4))
births_by_day.plot(ax =ax);
In [165]:
date = np.array('2015-04-03', dtype=np.datetime64)
date
Out[165]:
In [166]:
date + 2
Out[166]:
In [167]:
date + np.arange(10)
Out[167]:
In [168]:
date2 = pd.to_datetime('3th of Feb, 1895')
date2.strftime('%A')
Out[168]:
In [169]:
date = date + np.arange(5)
index = pd.DatetimeIndex(date)
index
Out[169]:
In [170]:
data = pd.Series([0,1,2,3,4], index = index)
data
Out[170]:
In [171]:
data.loc['2015-04-06']
Out[171]:
In [172]:
data.loc['2015-04-06'] - data.loc['2015-04-04']
Out[172]:
In [173]:
#!pip install pandas_Datareader
In [174]:
from pandas_datareader import data
In [175]:
goog = data.DataReader('GOOG', start='2014', end='2016', data_source='google')
goog.head()
Out[175]:
In [176]:
goog = goog['Close']
In [177]:
goog.plot();
In [178]:
goog.plot(alpha=0.5, style='-')
goog.resample('BA').mean().plot(style=':')
goog.asfreq('BA').plot(style='--')
plt.legend(['input', 'resample','asfreq'], loc='upper left');
In [179]:
goog = goog.asfreq('D', method='pad')
ROI = 100 * (goog.tshift(-365) / goog - 1)
ROI.plot();
In [180]:
URL = 'https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD'
from urllib.request import urlretrieve
urlretrieve(URL, 'FremontBridge.csv')
Out[180]:
In [181]:
data = pd.read_csv('FremontBridge.csv', index_col='Date', parse_dates=True)
data.head()
Out[181]:
In [182]:
# rename columns
data.columns = ['West', 'East']
In [183]:
data['Total'] = data.eval('West + East')
In [184]:
data.head()
Out[184]:
In [185]:
# summary stats
data.dropna().describe()
Out[185]:
In [186]:
import seaborn; seaborn.set()
data.plot()
plt.ylabel('Hourly bicycle count');
In [187]:
weekly = data.resample('W').sum()
weekly.plot(style=[':', '--', '-'])
plt.ylabel('Weekly bicycle count');
In [188]:
daily = data.resample('D').sum()
daily.rolling(30, center=True).sum().plot(style=[':', '--', '-'])
plt.ylabel('mean hourly count');
In [189]:
daily.rolling(50, center=True,
win_type='gaussian').sum(std=10).plot(style=[':', '--', '-']);
In [190]:
by_time = data.groupby(data.index.time).mean()
hourly_ticks = 4 * 60 * 60 * np.arange(6)
by_time.plot(xticks = hourly_ticks, style=[':', '--', '-'])
plt.ylabel('Average hourly biycle count');
In [191]:
by_weekday = data.groupby(data.index.dayofweek).mean()
by_weekday.index=['Mon', 'Tue', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
by_weekday.plot(style=[':', '--', '-'])
plt.ylabel('Average daily bicycle count');
In [192]:
data.head()
Out[192]:
In [193]:
weekend = np.where(data.index.weekday < 5, 'Weekday', 'Weekend')
by_time = data.groupby([weekend, data.index.time]).mean()
In [194]:
fig, ax = plt.subplots(1,2, figsize=(14,5))
by_time.loc['Weekday'].plot(ax=ax[0], title='Weekdays', xticks=hourly_ticks, style=[':', '--', '-'])
by_time.loc['Weekend'].plot(ax= ax[1], title='Weekends', xticks=hourly_ticks, style=[':', '--', '-']);
In [195]:
df1, df2, df3, df4, df5 = (pd.DataFrame(rng.randint(0, 1000, (100, 3)))
for i in range(5))
In [204]:
result1 = -df1 * df2 / (df3 + df4) - df5
In [203]:
result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')
In [205]:
np.allclose(result1, result2)
Out[205]:
In [231]:
df = pd.DataFrame(rng.randint(0 , 300, (100,3) ), columns=['A', 'B', 'C'])
df.head()
Out[231]:
In [232]:
r1 = (df['A'] + df['B'] / (df['C'] - 1 ))
In [233]:
r2 = pd.eval('(df.A + df.B / (df.C - 1 ))')
In [234]:
np.allclose(r1, r2)
Out[234]:
In [237]:
%timeit pd.eval('(df.A + df.B / (df.C- 1 ))')
In [238]:
%timeit (df['A'] + df['B'] / (df['C'] - 1 ))
In [ ]: