Windows
python -m pip install -U pip
Linux
pip install -U pip
In [67]:
!pip install wheel
In [221]:
%matplotlib inline
from pandas import *
from pylab import *
import pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import HTML
from __future__ import division
def side_by_side(*objs, **kwds):
from pandas.formats.printing import adjoin
space = kwds.get('space', 4)
reprs = [repr(obj).split('\n') for obj in objs]
print adjoin(space, *reprs)
def htmldisp(x):
try:
return HTML(pd.DataFrame(x).to_html())
except:
print ("Alert!! \nInput should be a Pandas obj.")
plt.rc('figure', figsize=(10, 6))
# pandas.set_printoptions(notebook_repr_html=False)
In [69]:
plt.plot([1,2,4,8,16])
Out[69]:
In [70]:
np.random.randn(5)
Out[70]:
In [71]:
labels = ['a', 'b', 'c', 'd', 'e']
s = pd.Series(np.random.randn(5), index=labels)
In [72]:
'b' in s
Out[72]:
In [73]:
s['b']
Out[73]:
In [74]:
s.index
Out[74]:
In [75]:
s
Out[75]:
In [76]:
mapping = s.to_dict()
mapping
Out[76]:
In [77]:
s = pd.Series(mapping, index=['b', 'e', 'a', 'd', 'f'])
s
Out[77]:
In [78]:
notnull(s)
Out[78]:
In [79]:
s[notnull(s)]
Out[79]:
In [80]:
s.dropna()
Out[80]:
In [81]:
s * 2
Out[81]:
In [82]:
s[3:]
Out[82]:
In [83]:
s[:3]
Out[83]:
In [84]:
s.index
Out[84]:
In [85]:
df = DataFrame({'a': np.random.randn(6),
'b': ['foo', 'bar'] * 3,
'c': np.random.randn(6)})
df['d'] = range(6)
df
Out[85]:
In [86]:
df['b'] # by column
Out[86]:
In [87]:
df[:3]
Out[87]:
In [88]:
df[:-2]
Out[88]:
In [89]:
df[-2:] # the last 2 rows
Out[89]:
In [90]:
df[['a','b']]
Out[90]:
In [91]:
np.tile(['foo', 'bar'], 3) # in numpy
Out[91]:
In [92]:
df.xs(0) # returning the first row
Out[92]:
In [93]:
df.ix[2] # does equal thing as above. In this case, 3rd row.
Out[93]:
In [94]:
df.ix[2, 'b']
Out[94]:
In [95]:
timeit df.ix[1]
In [96]:
df.get_value(2, 'b')
Out[96]:
In [97]:
timeit df.ix[2, 'b']
In [98]:
timeit df.get_value(2, 'b')
In [99]:
df.ix[2:4, 'b']
Out[99]:
In [100]:
df.ix[2:4, ['b', 'c']]
Out[100]:
In [101]:
df.ix[2:4, 'b':'c'] # slices by column - essentially the same as above.
Out[101]:
In [102]:
df.ix[2:4, 0:2]
Out[102]:
In [103]:
df.ix[[0, 2, 4], ['b', 'c', 'd']] # pass a list of rows and columns I want to select out
Out[103]:
In [104]:
df['c'] > 0
Out[104]:
In [105]:
df.ix[df['c'] > 0] # boolean arrays
Out[105]:
In [106]:
df.index
Out[106]:
In [107]:
df.columns
Out[107]:
In [108]:
pandas.date_range('1/1/2000',periods=6)
Out[108]:
In [109]:
df = DataFrame({'a': np.random.randn(6),
'b': ['foo', 'bar'] * 3,
'c': np.random.randn(6)},
index=pandas.date_range('1/1/2000', periods=6))
df
Out[109]:
In [110]:
df = DataFrame({'a': np.random.randn(6),
'b': ['foo', 'bar'] * 3,
'c': np.random.randn(6)},
columns=['a', 'b', 'c', 'd'])
df
Out[110]:
In [111]:
isnull(df)
Out[111]:
In [112]:
data = {}
for col in ['foo', 'bar', 'baz']:
for row in ['a', 'b', 'c', 'd']:
data.setdefault(col, {})[row] = np.random.randn() # sorted cols: bar, baz, foo
data
Out[112]:
In [113]:
del data ['foo']['c'] # delete an entry by index key (col) and row.
In [114]:
DataFrame(data) # pass a dict
Out[114]:
In [115]:
close_px = read_csv('stock_data.csv', index_col=0, parse_dates=True)
In [172]:
!head -n 10 stock_data.csv
In [117]:
!head stock_data.csv
In [118]:
close_px
Out[118]:
In [119]:
s1 = close_px['AAPL'][-20:] # the last 20 rows
s2 = close_px['AAPL'][-25:-10] # the last 25 - last 10 = 15 rows.
side_by_side(s1, s2)
In [120]:
s1 + s2
Out[120]:
In [121]:
s1.add(s2, fill_value=0)
Out[121]:
In [122]:
(s1 + s2).dropna()
Out[122]:
In [123]:
df = close_px.ix[-10:, :3]
df
Out[123]:
In [124]:
side_by_side(s1.reindex(s2.index), s2) # align indexes of s1 and s2. Print 2 objects alongside each other.
In [125]:
side_by_side(s1.ix[s2.index], s2) # same as above, supported in newer version of 0.12 ipython.
In [126]:
b, c = s1.align(s2, join='inner')
side_by_side(b, c)
In [127]:
b, c = s1.align(s2, join='outer')
side_by_side(b, c)
b, c = s1.align(s2, join='right') side_by_side(b, c)
In [128]:
df = close_px.ix[-10:, ['AAPL', 'IBM', 'MSFT']]
df
Out[128]:
In [129]:
df2 = df.ix[::2, ['IBM', 'MSFT']]
side_by_side(df, df2)
In [130]:
df + df2
Out[130]:
In [131]:
b, c = df.align(df2, join='inner')
side_by_side(b, c)
In [132]:
df[:5].T
Out[132]:
In [133]:
n = 10
foo = DataFrame(index=range(n))
foo['floats'] = np.random.randn(n)
foo['ints'] = np.arange(n)
foo['strings'] = ['foo', 'bar'] * (n / 2)
foo['bools'] = foo['floats'] > 0
foo['objects'] = pandas.date_range('1/1/2000', periods=n)
foo
Out[133]:
In [134]:
foo.dtypes
Out[134]:
N.B. transposing is not roundtrippable in this case (column-oriented data structure)
In [135]:
foo.T.T
Out[135]:
In [136]:
foo.T.T.dtypes
Out[136]:
In [137]:
df
Out[137]:
In [138]:
df.apply(np.mean)
Out[138]:
In [139]:
df.mean() # same as above. df.mean? to look up the function usage.
Out[139]:
In [140]:
df.mean(1) # get the mean of the rows based on a column. Also excludes missing data (NaN: not a number).
Out[140]:
In [141]:
df.mean(1, skipna=False) # flag skipna: to include NaN. In this data, no NaN.
Out[141]:
In [142]:
df.apply(np.mean, axis=1) # same as above.
Out[142]:
You can get as fancy as you want
In [143]:
close_px # Output: 1000 rows.
Out[143]:
In [144]:
close_px.AAPL # For easier typing, after dot ., use tab to select the name of the stock.
Out[144]:
In [145]:
close_px.AAPL.idxmax() # gives index of its maximum value.
Out[145]:
In [146]:
close_px.AAPL.index[1]
Out[146]:
In [180]:
close_px.AAPL.index[close_px.AAPL.max()] # If you are numpy aficionado, does the same thing.
Out[180]:
In [148]:
close_px.AA.idxmax()
def peak_date(series):
return series.idxmax()
In [149]:
close_px.apply(peak_date)
Out[149]:
In [150]:
for column in close_px:
print close_px[column].max()
In [151]:
def peak_date(series):
return series.idxmax() # new method: series.idxmax()
close_px.apply(peak_date, axis=0)
Out[151]:
In [152]:
df.apply(lambda x: x.max())
Out[152]:
In [153]:
df.apply(lambda x: x.max() - x.min()) # np.ptp
Out[153]:
In [154]:
np.log(close_px)
Out[154]:
In [155]:
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
['one', 'two', 'three']],
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]])
hdf = DataFrame(np.random.randn(10, 3), index=index,
columns=['A', 'B', 'C'])
hdf
Out[155]:
In [156]:
hdf.ix['foo']
Out[156]:
In [157]:
hdf.ix['foo'] = 0
hdf
Out[157]:
In [158]:
hdf.ix['bar','one']['A']
Out[158]:
In [159]:
tuples = zip(*[['bar', 'bar', 'baz', 'baz',
'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two',
'one', 'two', 'one', 'two']])
index = MultiIndex.from_tuples(tuples)
columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),
('B', 'cat'), ('A', 'dog')])
df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)
df
Out[159]:
In [160]:
df2 = df.ix[[0, 1, 2, 4, 5, 7]]
df2
Out[160]:
In [161]:
df.unstack()['B']
Out[161]:
In [162]:
df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
df
Out[162]:
In [163]:
for key, group in df.groupby('A'):
print key
print group
In [164]:
df.groupby('A')['C'].describe().T
Out[164]:
In [165]:
df.groupby('A').mean()
Out[165]:
In [166]:
for key, group in df.groupby('A'):
print key
print group
In [167]:
df.groupby(['A', 'B']).mean()
Out[167]:
In [168]:
#df.groupby(['A', 'B'], as_index=False).mean()
In [169]:
df.stack()
Out[169]:
In [170]:
#df.stack().mean(1).unstack()
In [171]:
#df.groupby(level=1, axis=1).mean()
In [185]:
movs = pd.read_csv('movie_metadata.csv')
movs.columns
movs.columns = [c.replace("_", " ") for c in movs.columns]
movs.columns
Out[185]:
In [238]:
_movs = movs[movs.title_year.ge(2000) & movs.country.isin(['USA', 'UK'])]
_df = pd.crosstab(_movs.title_year, _movs.country)
_df
#_df = pd.crosstab(_movs.title_year, _movs.country, values=_movs.actor_1_facebook_likes, aggfunc=[sum, mean, median])
#_df
#pd.crosstab(_movs.title_year, _movs.country).apply(lambda r: (r/r.sum())*100, axis=1)
Out[238]:
In [231]:
movs[['imdb_score']].hist()
Out[231]:
In [241]:
### IPython Gyan!
%who DataFrame
In [240]:
%whos DataFrame
In [239]:
%who_ls DataFrame
Out[239]:
In [247]:
sal = pd.read_csv(r'Salaries.csv')
In [311]:
sal['EmpFirstLetter'] = sal.EmployeeName.str[:1]
sal.pivot_table(index=['EmpFirstLetter'], columns=['Year'], values=['TotalPay'], aggfunc=[median, mean])
Out[311]:
In [314]:
sal['PayMark'] = 'More than 100k'
sal.loc[sal.TotalPay.le(100000), 'PayMark'] = 'Less than 100k'
sal.PayMark.unique()
Out[314]:
In [316]:
sal.groupby('PayMark')['Id'].count()
#sal.PayMark.value_counts()
Out[316]:
In [2]:
import sys
sys.version
Out[2]:
In [3]:
import pandas as pd
pd.__version__
Out[3]: