I ran this notebook in a Docker container started with the following command:
docker run -it -p 8888:8888 -p 6006:6006 -v `pwd`:/space/ -w /space/ --rm --name md waleedka/modern-deep-learning jupyter notebook --ip=0.0.0.0 --allow-root
The following code is adapted from http://pandas.pydata.org/pandas-docs/stable/10min.html
In [1]:
%%bash
# Install extra packages into the running environment before anything is imported.
# seaborn is imported in the next cell; tables is presumably the PyTables backend
# needed by the to_hdf call at the end of the notebook -- TODO confirm.
# NOTE(review): versions are not pinned, so the installed releases may change between runs.
pip install seaborn
pip install tables
In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
In [3]:
import pandas as pd
In [4]:
# Build a Series from a list; np.nan marks a missing value.
s = pd.Series([1,3,5,np.nan,6,8])
s
Out[4]:
In [5]:
# Six consecutive timestamps starting at 2016-01-01; reused below as a row index.
dates = pd.date_range('20160101', periods=6)
dates
Out[5]:
In [6]:
# 6x4 frame of random normal draws, rows labelled by `dates`, columns A-D.
# No seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df
Out[6]:
In [7]:
# Mixed-dtype frame: each column comes from a different kind of input
# (float scalar, timestamp, float32 Series, int32 array, categorical, string).
mixed_columns = {
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["a", "b", "c", "d"]),
    'F': 'foo',
}
df2 = pd.DataFrame(mixed_columns)
df2
Out[7]:
In [8]:
# Show the dtype of each column of df2.
df2.dtypes
Out[8]:
In [9]:
# Row index, column labels and underlying values of df, displayed together as a tuple.
df.index, df.columns, df.values
Out[9]:
In [10]:
# Same three attributes for the mixed-dtype frame df2.
df2.index, df2.columns, df2.values
Out[10]:
In [11]:
# Summary statistics for the columns of df.
df.describe()
Out[11]:
In [12]:
# Summary statistics for df2.
df2.describe()
Out[12]:
In [13]:
# Transpose: rows become columns and columns become rows.
df.T
Out[13]:
In [14]:
# Sort along axis 1 (the column labels) in descending order.
df.sort_index(axis=1, ascending=False)
Out[14]:
In [15]:
# Sort the rows by the values in column B.
df.sort_values(by='B')
Out[15]:
In [16]:
# Select the single column A.
df['A']
Out[16]:
In [17]:
# Slice rows by position: the first three rows.
df[0:3]
Out[17]:
In [18]:
# Slice rows by date label; label slices include both endpoints.
df['20160102':'20160104']
Out[18]:
In [19]:
# Select one row by its index label (the third entry of `dates`).
df.loc[dates[2]]
Out[19]:
In [20]:
# All rows, but only columns A and D, selected by label.
df.loc[:,['A','D']]
Out[20]:
In [21]:
# Select by integer position: rows 3-4 and columns 0-1 (slice ends are excluded).
df.iloc[3:5,0:2]
Out[21]:
In [22]:
# Boolean mask: keep only the rows where column A exceeds 0.5.
df[df.A > 0.5]
Out[22]:
In [23]:
# Work on a copy of df, then add a string column E with one label per row.
# NOTE(review): this rebinds df2, replacing the mixed-dtype frame built earlier.
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three']
df2
Out[23]:
In [24]:
# Keep the rows whose E value is one of the listed labels.
df2[df2['E'].isin(['two','four'])]
Out[24]:
In [25]:
# Add column F from a Series indexed by the same six dates as df.
df['F'] = pd.Series([1,2,3,4,5,6], index=pd.date_range('20160101', periods=6))
df
Out[25]:
In [26]:
# Set a single cell by row and column label, then read it back.
df.at[dates[0],'A'] = 0.456
df.at[dates[0],'A']
Out[26]:
In [27]:
# Set a single cell by integer position (row 0, column 1), then read it back.
df.iat[0,1] = 0.123
df.iat[0,1]
Out[27]:
In [28]:
# Overwrite every value in column D with 5, then show the column.
df.loc[:,'D'] = np.array([5] * len(df))
df.loc[:,'D']
Out[28]:
In [29]:
# Replace columns B through D (label slices include both ends) with fresh random draws.
# This overwrites the individual values assigned to B and D in the preceding cells.
df.loc[:,'B':'D'] = np.random.randn(len(df), 3)
df.loc[:,'B':'D']
Out[29]:
In [30]:
# Reindex to the first four dates and add a new column label E,
# then fill E with one random draw per row.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[:, 'E'] = np.random.randn(len(df1))
df1
Out[30]:
In [31]:
# Mark one cell (row 1, column position 5) as missing, then display the frame
# without any row that contains a missing value.
df1.iloc[1,5] = np.nan
df1.dropna(how='any')
Out[31]:
In [32]:
# The same cell is set to NaN again (repeating the previous cell's assignment),
# then the frame is displayed with missing values replaced by 5.
df1.iloc[1,5] = np.nan
df1.fillna(value=5)
Out[32]:
In [33]:
# Boolean frame marking which entries of df1 are missing.
pd.isnull(df1)
Out[33]:
In [34]:
# Median of each column of df.
df.median()
Out[34]:
In [35]:
# A Series on the same dates, shifted by two positions so the first two entries become missing.
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s
Out[35]:
In [36]:
# Subtract s from every column of df, matching on the row index.
df.sub(s, axis='index')
Out[36]:
In [37]:
# Apply np.cumsum to each column of df.
df.apply(np.cumsum)
Out[37]:
In [38]:
# Small 2x3 integer array; with axis=0 the running total goes down the rows,
# giving one cumulative sum per column.
a = np.array([[1, 2, 3],
              [4, 5, 6]])
np.cumsum(a, axis=0)
Out[38]:
In [39]:
# Same array, accumulating along axis 1 instead.
np.cumsum(a,axis=1) # sum over columns for each of the 2 rows
Out[39]:
In [40]:
# Apply a function to each column that returns its (max, min) pair.
df.apply(lambda x: (x.max(), x.min()))
Out[40]:
In [41]:
# Ten random integers drawn from 0..6, then how often each value occurs.
s = pd.Series(np.random.randint(0, 7, size=10))
s.value_counts()
Out[41]:
In [42]:
# String methods are exposed under .str; lower-case every entry.
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()
Out[42]:
In [43]:
# Capitalize each string in s.
s.str.capitalize()
Out[43]:
In [44]:
# Concatenate the strings of s into a single string.
s.str.cat()
Out[44]:
In [45]:
# A fresh 10x4 random frame (rebinding df), cut row-wise into three pieces; show the first.
df = pd.DataFrame(np.random.randn(10, 4))
pieces = [df[:3], df[3:7], df[7:]]
pieces[0]
Out[45]:
In [46]:
# Concatenate the pieces back together row-wise.
pd.concat(pieces)
Out[46]:
In [47]:
# Join two frames on 'key' where both sides repeat the same key value.
left = pd.DataFrame({'key': ['foo'] * 2, 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo'] * 2, 'rval': [4, 5]})
left.merge(right, on='key')
Out[47]:
In [48]:
# Join two frames on 'key' where every key value appears once per side.
shared_keys = ['foo', 'bar']
left = pd.DataFrame({'key': list(shared_keys), 'lval': [1, 2]})
right = pd.DataFrame({'key': list(shared_keys), 'rval': [4, 5]})
left.merge(right, on='key')
Out[48]:
In [49]:
# Join on 'key' where 'foo' appears twice on each side and 'bar' once;
# the result is kept in `joined` for the cells that follow.
left = pd.DataFrame({
    'key': ['foo', 'bar', 'foo'],
    'lval': [1, 2, 3],
})
right = pd.DataFrame({
    'key': ['foo', 'bar', 'foo'],
    'rval': [4, 5, 6],
})
joined = left.merge(right, on='key')
joined
Out[49]:
In [50]:
# Sum lval over the joined rows whose key is 'foo'.
joined[joined.key == 'foo'].lval.sum()
Out[50]:
In [51]:
# Group the joined rows by key and sum within each group.
joined.groupby(by='key').sum()
Out[51]:
In [52]:
# Append a copy of row 3 as a new last row.
# DataFrame.append was removed in pandas 2.0, so the row (a Series) is turned
# into a one-row frame and concatenated; ignore_index renumbers the rows 0..8.
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
s = df.iloc[3]
df_appended = pd.concat([df, s.to_frame().T], ignore_index=True)
df_appended
Out[52]:
In [53]:
# Frame with two label columns (A, B) to group on and two random value columns (C, D).
group_labels = {
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
}
df = pd.DataFrame({
    **group_labels,
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
In [54]:
# Group by the (A, B) pair and sum within each group.
df.groupby(['A','B']).sum()
Out[54]:
In [55]:
# Pair each first-level label with a second-level label, producing the
# (first, second) tuples used to build a two-level index.
first_level = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
second_level = ['one', 'two'] * 4
tuples = list(zip(first_level, second_level))
tuples
Out[55]:
In [56]:
# Build a two-level index from the tuples and name the levels 'first' and 'second'.
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index
Out[56]:
In [57]:
# 8x2 random frame whose rows are labelled by the two-level index (rebinding df).
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df
Out[57]:
In [58]:
# Index with the pair 'foo', 'one'.
# NOTE(review): with a two-level row index this is expected to select the
# ('foo', 'one') row rather than a column named 'one' -- confirm against the output.
df.loc['foo', 'one']
Out[58]:
In [59]:
# Select the rows labelled 'foo' with all columns, then take column A of the result.
df.loc['foo', :].A
Out[59]:
In [60]:
# Take the first four rows (rebinding df2) and stack them, moving the column
# labels into an additional innermost index level.
df2 = df[:4]
stacked = df2.stack()
In [61]:
# Undo the stack: with no argument the last index level moves back to the columns.
stacked.unstack()
Out[61]:
In [62]:
# Unstack index level 0 instead.
stacked.unstack(0)
Out[62]:
In [63]:
# Unstack index level 1.
stacked.unstack(1)
Out[63]:
In [64]:
# Unstack index level 2, i.e. the level that stack() added.
stacked.unstack(2)
Out[64]:
In [65]:
# Twelve-row frame for the pivot example: three repeating label columns (A, B, C)
# plus two random value columns (D, E).
pivot_source = {
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
}
df = pd.DataFrame(pivot_source)
df
Out[65]:
In [66]:
# Pivot: rows keyed by (A, B), one column per value of C, cells taken from D.
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
Out[66]:
In [67]:
# Sales figures per city, each city tagged with its province code.
sales_by_city = {
    'Province': ['ON', 'QC', 'BC', 'AL', 'AL', 'MN', 'ON'],
    'City': ['Toronto', 'Montreal', 'Vancouver', 'Calgary', 'Edmonton', 'Winnipeg', 'Windsor'],
    'Sales': [13, 6, 16, 8, 4, 3, 1],
}
df = pd.DataFrame(data=sales_by_city)
df
Out[67]:
In [68]:
# Sales summed per (Province, City) with margins=True adding the 'All' totals.
# The aggregation is named by string: passing np.sum raises a FutureWarning on
# pandas 2.x, while 'sum' gives the same result without it.
table = pd.pivot_table(df, values=['Sales'], index=['Province'], columns=['City'],
                       aggfunc='sum', margins=True)
table
Out[68]:
In [69]:
# Move the City column level into the row index.
table.stack('City')
Out[69]:
In [70]:
# 100 timestamps one second apart, starting at 2016-01-01 00:00:00.
# Uses the lowercase 's' alias: the uppercase 'S' spelling is deprecated as of pandas 2.2.
rng = pd.date_range('1/1/2016', periods=100, freq='s')
rng[50]
Out[70]:
In [71]:
# Number of timestamps in rng.
len(rng)
Out[71]:
In [72]:
# One random integer from 0..499 per timestamp in rng; show the first rows.
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.head()
Out[72]:
In [73]:
# Group the series into 5-minute bins; the aggregation is chosen in the cells that follow.
ts5 = ts.resample('5Min')
In [74]:
# Number of observations in each 5-minute bin.
ts5.count()
Out[74]:
In [75]:
# Median of each 5-minute bin.
ts5.median()
Out[75]:
In [76]:
# Re-sample the series onto a 10-minute grid.
# Uses the '10min' alias: the single-letter 'T' spelling is deprecated as of pandas 2.2.
ts.asfreq('10min')
Out[76]:
In [77]:
# Six records, each with an id and a raw letter grade.
grade_records = {
    "id": [1, 2, 3, 4, 5, 6],
    "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e'],
}
df = pd.DataFrame(grade_records)
df
Out[77]:
In [78]:
# Convert the raw grades to a categorical column named "grade".
df["grade"] = df["raw_grade"].astype("category")
df["grade"]
Out[78]:
In [79]:
# Give the three grade categories descriptive names; the new names replace the
# existing categories positionally.
# Assigning to `.cat.categories` is no longer possible (removed in pandas 2.0),
# so rename_categories is used and the renamed column is assigned back.
df["grade"] = df["grade"].cat.rename_categories(["good", "normal", "bad"])
df
Out[79]:
In [80]:
# Number of rows in each grade category.
df.groupby("grade").size()
Out[80]:
In [81]:
# Random walk: cumulative sum of 1000 random normal draws on consecutive dates, then plotted.
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2016', periods=1000))
ts = ts.cumsum()
ts.plot()
Out[81]:
In [82]:
# Four columns of random normal draws sharing the index of ts (rebinding df).
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
columns=['A', 'B', 'C', 'D'])
In [83]:
# Turn each column into a running total and plot all four on one set of axes.
# NOTE: this rebinds df, so re-running the cell accumulates a second time.
df = df.cumsum()
# DataFrame.plot creates its own figure and returns its Axes; calling
# plt.figure() first only left an empty extra figure behind, so the legend is
# placed through the returned Axes instead.
ax = df.plot()
ax.legend(loc='best');
Out[83]:
In [84]:
# Write the frame to a CSV file.
df.to_csv('/tmp/foo.csv')
In [85]:
# Read the CSV back. No index_col is given, so the index written by to_csv
# comes back as an ordinary column rather than as the row index.
pd.read_csv('/tmp/foo.csv')
Out[85]:
In [86]:
# Store the frame in an HDF5 file under the key 'df'.
# The key is passed by keyword: positional arguments after the path are
# deprecated and become keyword-only in pandas 3.0.
df.to_hdf('/tmp/foo.h5', key='df')