In [1]:
# imports
import os

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
In [2]:
# Series: a one-dimensional labelled array.
# No index is passed here, so pandas supplies the default integer labels.
obj = Series(data=[-4, 3, 6, 6])
obj
Out[2]:
In [3]:
# The data held by `obj` as an array, without the index labels.
obj.values
Out[3]:
In [4]:
# The index (row labels) of `obj`; `obj` was built without an explicit index.
obj.index
Out[4]:
In [5]:
# Same idea as `obj`, but each value gets an explicit string label.
obj2 = Series(data=[2, 5, -5, 3], index=list('abcd'))
obj2
Out[5]:
In [6]:
# Show the data array and the label index of `obj2` together, as a tuple.
obj2.values, obj2.index
Out[6]:
In [8]:
# Label-based lookup and NumPy-style array operations on a Series.
# Select the single value stored under the label 'a'.
obj2['a']
Out[8]:
In [9]:
# Element-wise arithmetic: every value is doubled; the result is a new Series
# (obj2 itself is not reassigned here).
obj2 * 2
Out[9]:
In [10]:
# A dict converts directly to a Series: keys become labels, values become data.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = Series(data=sdata)
obj3
Out[10]:
In [11]:
# Build a Series from the same dict, but keep only the labels listed in
# `states`. 'California' has no entry in `sdata`; 'Utah' is not requested.
states = [
    'California',
    'Ohio',
    'Oregon',
    'Texas',
]
obj4 = Series(data=sdata, index=states)
obj4
Out[11]:
In [13]:
# Drop the missing entries ('California' is in `states` but not in `sdata`).
# The result is only displayed; obj4 itself is not reassigned.
obj4.dropna()
Out[13]:
In [14]:
# Arithmetic between Series aligns on index labels; labels that do not have a
# value on both sides (e.g. 'Utah', 'California') come out as NaN.
obj3 + obj4
Out[14]:
In [15]:
# DataFrame from a dict of equal-length lists: one column per key.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data=data)
frame
Out[15]:
In [16]:
# Same data with an explicit column order and row labels. 'debt' is not a key
# of `data`, so that column starts out empty (all missing).
frame2 = DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five'],
)
frame2
Out[16]:
In [17]:
# Assigning a scalar to a column broadcasts it to every row.
frame2['debt'] = 16.5
frame2
Out[17]:
In [18]:
# Assigning an array fills the column positionally; its length (5) must match
# the number of rows in frame2.
frame2['debt'] = np.arange(5, dtype=float)
frame2
Out[18]:
In [19]:
# Assigning a Series aligns on row labels: only 'two', 'four' and 'five'
# receive a value, the remaining rows become missing.
val = Series(
    data=[-1.2, -1.5, -1.7],
    index=['two', 'four', 'five'],
)
frame2['debt'] = val
frame2
Out[19]:
In [20]:
# Assigning to a column that does not exist yet creates it:
# a boolean flag that is True where the state is 'Ohio'.
frame2['eastern'] = frame2['state'] == 'Ohio'
frame2
Out[20]:
In [21]:
# Dict of dicts: outer keys become columns, inner keys become row labels.
# Nevada has no entry for 2000, so that cell is missing.
population = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = DataFrame(data=population)
frame3
Out[21]:
In [22]:
# Transpose: states become rows, years become columns.
frame3.T
Out[22]:
In [23]:
# input data
# Cannot share data!
# The workbook location can be overridden with the CHAP04_XLS environment
# variable so the notebook is not tied to one machine; the default is the
# original author's local path.
chap04_xls = os.environ.get('CHAP04_XLS', '/Users/aditya/PLUG/pandas/chap_04.xls')
df = pd.read_excel(chap04_xls, sheet_name='Sheet1')
df
Out[23]:
In [24]:
# Preview only the first rows of the loaded sheet.
df.head()
Out[24]:
In [25]:
# Change index and convert string to datetime.
# `pop` removes the 'date' column from df, so without the guard a second run
# of this cell would raise KeyError; with it, re-running is a no-op.
if 'date' in df.columns:
    df.index = pd.to_datetime(df.pop('date'), format='%Y-%m-%d')
In [26]:
# Display the frame after the index change.
df
Out[26]:
In [27]:
# First rows again, to check the new index.
df.head()
Out[27]:
In [28]:
# Monthly mean, median and standard deviation of `value`.
# `resample(..., how=...)` has been removed from pandas; resample first, then
# aggregate. 'M' is the month-end frequency alias (spelled 'ME' in newer
# pandas releases, where 'M' emits a deprecation warning).
mth_mean = df['value'].resample('M').agg(['mean', 'median', 'std'])
mth_mean.head()
Out[28]:
In [29]:
# Line plot of the monthly statistics, one line per column of mth_mean.
mth_mean.plot()
Out[29]:
In [30]:
# Mean `value` for every (hs_code, index entry) combination.
hs_code = df['value'].groupby([df['hs_code'], df.index]).mean()
hs_code
Out[30]:
In [31]:
# Distinct values found in the `from_port` column.
df.from_port.unique()
Out[31]:
In [33]:
# Average `value` per origin port.
grp_from_port = df.groupby(df['from_port'])['value'].mean()
grp_from_port
Out[33]:
In [34]:
# Horizontal bar chart of the first 10 entries (black bars, 70% opacity).
grp_from_port[:10].plot(kind='barh',color='k', alpha=0.7)
Out[34]:
In [35]:
# Per-port mean of `value` again, this time drawn as vertical bars for the
# first 10 ports.
from_port = df['value'].groupby(df['from_port']).mean()
from_port.head(10).plot(kind='bar')
Out[35]:
In [36]:
# Toy frame for the groupby examples: two key columns and two columns of
# standard-normal noise. A locally seeded generator keeps the numbers identical
# on every Restart & Run All.
new_df_rng = np.random.default_rng(36)
new_df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                    'key2': ['one', 'two', 'one', 'two', 'one'],
                    'data1': new_df_rng.standard_normal(5),
                    'data2': new_df_rng.standard_normal(5)})
new_df
Out[36]:
In [37]:
# Group the `data1` values by `key1`. Nothing is computed yet; this only
# builds the GroupBy object.
grouped = new_df.groupby(new_df['key1'])['data1']
grouped
Out[37]:
In [38]:
# Mean of `data1` within each `key1` group.
grouped.mean()
Out[38]:
In [39]:
# https://github.com/pydata/pydata-book/blob/master/ch08/tips.csv
# The CSV location can be overridden with the TIPS_CSV environment variable so
# the notebook is not tied to one machine; the default is the original
# author's local checkout.
tips_csv = os.environ.get('TIPS_CSV', '/Users/aditya/Repo/pydata-book/ch08/tips.csv')
tips = pd.read_csv(tips_csv)
tips
Out[39]:
In [45]:
# Last 10 rows of the tips data.
tips.tail(10)
Out[45]:
In [46]:
# Add the tip as a fraction of the total bill (tip / total_bill).
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.head()
Out[46]:
In [47]:
# Group the tips by sex and smoker status, then average the tip fraction
# within each group. Note: this rebinds the name `grouped`.
grouped = tips.groupby([tips['sex'], tips['smoker']])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
Out[47]:
In [48]:
# Apply several aggregations to two columns at once.
# Selecting more than one column from a GroupBy requires a list
# (double brackets); the bare-tuple form grouped['a', 'b'] has been removed
# from pandas.
functions = ['count', 'mean', 'max']
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result
Out[48]:
In [49]:
def top(df, n=5, column='tip_pct'):
    """Return the `n` rows of `df` with the largest values in `column`.

    Parameters
    ----------
    df : DataFrame
        Frame (or one group of a groupby) to select rows from.
    n : int, default 5
        Number of rows to keep.
    column : str, default 'tip_pct'
        Column to rank by.

    Returns
    -------
    DataFrame
        The selected rows in ascending order of `column` (largest last).
    """
    # sort_values replaces the removed sort_index(by=...); tail(n) is used
    # instead of [-n:] because [-0:] would return every row when n == 0.
    return df.sort_values(by=column).tail(n)
# The 10 rows of the whole tips table ranked highest by the default column, 'tip_pct'.
top(tips, n=10)
Out[49]:
In [50]:
# Run `top` separately on each smoker group (defaults: n=5, column='tip_pct').
tips.groupby('smoker').apply(top)
Out[50]:
In [51]:
# For every (smoker, day) group keep the single row with the largest total_bill;
# extra arguments after the function are forwarded to `top`.
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')
Out[51]:
In [52]:
# 6x4 frame of uniform random numbers in [0, 1) for the bar-plot examples.
# A locally seeded generator keeps the plots reproducible across runs.
# Note: this rebinds `df`, replacing the frame loaded earlier.
genus_rng = np.random.default_rng(52)
df = DataFrame(genus_rng.random((6, 4)),
               index=['one', 'two', 'three', 'four', 'five', 'six'],
               columns=pd.Index(['A', 'B', 'C', 'D'], name='Genus'))
df
Out[52]:
In [53]:
# Grouped vertical bars: one group per row label, one bar per column.
df.plot(kind='bar')
Out[53]:
In [54]:
# Horizontal bars, stacked per row, drawn at 50% opacity.
df.plot(kind='barh', stacked=True, alpha=0.5)
Out[54]:
In [55]:
# Cross-tabulate day of week against party size.
# The column must be selected with brackets: `tips.size` is the built-in
# DataFrame.size attribute (total number of elements), not the 'size' column.
party_counts = pd.crosstab(tips['day'], tips['size'])
party_counts
Out[55]:
In [56]:
# There are few parties of size 1 or 6, so keep only the columns for
# sizes 2 through 5. `.ix` has been removed from pandas; `.loc` slices by
# label and includes both endpoints.
party_counts = party_counts.loc[:, 2:5]
party_counts
Out[56]:
In [57]:
# Normalize each row so that it sums to 1: divide every row by its own total.
party_pcts = party_counts.div(
    party_counts.sum(axis='columns').astype(float),
    axis='index',
)
party_pcts
Out[57]:
In [58]:
# Stacked horizontal bars of the per-row shares, drawn at 40% opacity.
party_pcts.plot(kind='barh', stacked=True, alpha=0.4)
Out[58]:
In [59]:
# Six standard-normal draws for the missing-data example, from a locally
# seeded generator so the values are the same on every run.
s = Series(np.random.default_rng(59).standard_normal(6))
s
Out[59]:
In [60]:
# Blank out every second element (positions 0, 2, 4) to create missing values.
s.iloc[::2] = np.nan
s
Out[60]:
In [61]:
# Replace the missing values with the mean of `s`.
# The filled Series is only displayed; `s` itself is not reassigned.
s.fillna(s.mean())
Out[61]:
In [62]:
# Group means by sex and smoker status.
# `rows=` has been removed from pivot_table in favour of `index=`. The values
# are limited to the numeric columns so the default mean aggregation is not
# attempted on string columns.
tips.pivot_table(index=['sex', 'smoker'],
                 values=tips.select_dtypes(include='number').columns.tolist())
Out[62]:
In [63]:
# To use a different aggregation function, pass it to aggfunc. For example,
# 'count' or len gives a cross-tabulation (count or frequency) of group sizes.
# `rows=` / `cols=` have been removed from pivot_table in favour of
# `index=` / `columns=`; margins=True adds the row and column totals.
tips.pivot_table(values='tip_pct', index=['sex', 'smoker'], columns='day',
                 aggfunc=len, margins=True)
Out[63]: