In [1]:
# Pandas
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
obj = Series([3,6,9,12])
obj


Out[2]:
0     3
1     6
2     9
3    12
dtype: int64

In [3]:
obj.values


Out[3]:
array([ 3,  6,  9, 12])

In [4]:
obj.index


Out[4]:
Int64Index([0, 1, 2, 3], dtype='int64')

In [6]:
ww2_cas = Series([8700000,4300000,3000000,2100000,400000], index=['USSR','Germany','China','Japan','USA'])
ww2_cas


Out[6]:
USSR       8700000
Germany    4300000
China      3000000
Japan      2100000
USA         400000
dtype: int64

In [7]:
ww2_cas['USA']


Out[7]:
400000

In [8]:
## Check which countries had casualties greater than 4 mil
ww2_cas[ww2_cas > 4000000]


Out[8]:
USSR       8700000
Germany    4300000
dtype: int64

In [9]:
'USSR' in ww2_cas


Out[9]:
True

In [10]:
ww2_dict = ww2_cas.to_dict()
ww2_dict


Out[10]:
{'China': 3000000,
 'Germany': 4300000,
 'Japan': 2100000,
 'USA': 400000,
 'USSR': 8700000}

In [11]:
ww2_series = Series(ww2_dict)
ww2_series


Out[11]:
China      3000000
Germany    4300000
Japan      2100000
USA         400000
USSR       8700000
dtype: int64

In [14]:
countries = ['China','germany','Japan','USA','USSR']
obj2 = Series(ww2_dict, index = countries)
obj2


Out[14]:
China      3000000
germany        NaN
Japan      2100000
USA         400000
USSR       8700000
dtype: float64
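Note: the lowercase 'germany' does not match the dictionary key 'Germany', so pandas finds no value for that label, fills it with NaN, and upcasts the Series to float64. A minimal sketch of one way to avoid the mismatch, assuming the intent was to reuse the original labels:

# hypothetical fix: make the requested labels match the dict keys exactly
countries = ['China', 'Germany', 'Japan', 'USA', 'USSR']
obj2 = Series(ww2_dict, index=countries)   # no NaN, dtype stays int64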

In [15]:
pd.isnull(obj2)


Out[15]:
China      False
germany     True
Japan      False
USA        False
USSR       False
dtype: bool

In [16]:
pd.notnull(obj2)


Out[16]:
China       True
germany    False
Japan       True
USA         True
USSR        True
dtype: bool

In [17]:
ww2_series + obj2


Out[17]:
China       6000000
Germany         NaN
Japan       4200000
USA          800000
USSR       17400000
germany         NaN
dtype: float64
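Note: Series arithmetic aligns on the union of the two indexes, and any label missing on either side (or holding NaN, as 'germany' does) comes out NaN. A hedged sketch using add() with fill_value, which treats a label missing from one side as 0 (labels missing or NaN on both sides still produce NaN):

ww2_series.add(obj2, fill_value=0)   # 'Germany' becomes 4300000; 'germany' stays NaN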

In [18]:
obj2.name = "World War 2 Casualties"
obj2


Out[18]:
China      3000000
germany        NaN
Japan      2100000
USA         400000
USSR       8700000
Name: World War 2 Casualties, dtype: float64

In [21]:
obj2.index.name = 'countries'
obj2


Out[21]:
countries
China      3000000
germany        NaN
Japan      2100000
USA         400000
USSR       8700000
Name: World War 2 Casualties, dtype: float64

In [22]:
# DataFrames
import webbrowser
website = 'http://en.wikipedia.org/wiki/NFL_win-loss_records'
webbrowser.open(website)


Out[22]:
True

In [23]:
nfl_frame = pd.read_clipboard()

In [24]:
nfl_frame


Out[24]:
   Rank               Team  Won  Lost  Tied*   Pct.  First Season  Total Games Conference
0     1     Dallas Cowboys  511   378      6  0.574          1960          894   NFC East
1     2      Chicago Bears  752   563     42  0.570          1920         1357  NFC North
2     3  Green Bay Packers  741   561     37  0.567          1921         1339  NFC North
3     4     Miami Dolphins  443   345      4  0.562          1966          792   AFC East
4     5   Baltimore Ravens  182   143      1  0.560          1996          326  AFC North
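Note: pd.read_clipboard() parses whatever table is currently on the system clipboard, so the cell above only works after the first few rows of the Wikipedia table have been copied by hand. A clipboard-free sketch (assumes lxml or html5lib is installed; the table's position on the page may differ):

tables = pd.read_html(website)   # list of DataFrames, one per HTML table on the page
nfl_frame = tables[0]            # hypothetical: pick whichever table holds the win-loss records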

In [25]:
nfl_frame.columns


Out[25]:
Index(['Rank ', 'Team ', 'Won ', 'Lost ', 'Tied* ', 'Pct. ', 'First Season ',
       'Total Games ', 'Conference'],
      dtype='object')

In [29]:
nfl_frame['Rank ']


Out[29]:
0    1
1    2
2    3
3    4
4    5
Name: Rank , dtype: int64
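Note: the clipboard paste left trailing spaces in the column labels ('Rank ', 'Team ', ...), which is why they must be quoted exactly as shown. A minimal cleanup sketch:

nfl_frame.columns = nfl_frame.columns.str.strip()   # 'Rank ' -> 'Rank', etc.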

In [30]:
DataFrame(nfl_frame, columns=['Team ','First Season ', 'Stadium'])


Out[30]:
                Team  First Season  Stadium
0     Dallas Cowboys          1960      NaN
1      Chicago Bears          1920      NaN
2  Green Bay Packers          1921      NaN
3     Miami Dolphins          1966      NaN
4   Baltimore Ravens          1996      NaN

In [31]:
nfl_frame.head(2)


Out[31]:
   Rank            Team  Won  Lost  Tied*   Pct.  First Season  Total Games Conference
0     1  Dallas Cowboys  511   378      6  0.574          1960          894   NFC East
1     2   Chicago Bears  752   563     42  0.570          1920         1357  NFC North

In [32]:
nfl_frame.ix[3]


Out[32]:
Rank                           4
Team             Miami Dolphins 
Won                          443
Lost                         345
Tied*                          4
Pct.                       0.562
First Season                1966
Total Games                  792
Conference              AFC East
Name: 3, dtype: object
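Note: the .ix indexer (used here and in a few later cells) is deprecated and has been removed from modern pandas. Equivalent label- and position-based lookups, as a sketch:

nfl_frame.loc[3]    # by label
nfl_frame.iloc[3]   # by integer position (the same row here, since the index is 0..4)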

In [33]:
nfl_frame['Stadium'] = "Levi's Stadium"
nfl_frame


Out[33]:
   Rank               Team  Won  Lost  Tied*   Pct.  First Season  Total Games Conference         Stadium
0     1     Dallas Cowboys  511   378      6  0.574          1960          894   NFC East  Levi's Stadium
1     2      Chicago Bears  752   563     42  0.570          1920         1357  NFC North  Levi's Stadium
2     3  Green Bay Packers  741   561     37  0.567          1921         1339  NFC North  Levi's Stadium
3     4     Miami Dolphins  443   345      4  0.562          1966          792   AFC East  Levi's Stadium
4     5   Baltimore Ravens  182   143      1  0.560          1996          326  AFC North  Levi's Stadium

In [35]:
nfl_frame['Stadium'] = np.arange(5)
stadiums = Series(["Levi's Stadium", "ATT Stadium"], index=[4,0])
stadiums


Out[35]:
4    Levi's Stadium
0       ATT Stadium
dtype: object

In [36]:
nfl_frame['Stadium'] = stadiums
nfl_frame


Out[36]:
   Rank               Team  Won  Lost  Tied*   Pct.  First Season  Total Games Conference         Stadium
0     1     Dallas Cowboys  511   378      6  0.574          1960          894   NFC East     ATT Stadium
1     2      Chicago Bears  752   563     42  0.570          1920         1357  NFC North             NaN
2     3  Green Bay Packers  741   561     37  0.567          1921         1339  NFC North             NaN
3     4     Miami Dolphins  443   345      4  0.562          1966          792   AFC East             NaN
4     5   Baltimore Ravens  182   143      1  0.560          1996          326  AFC North  Levi's Stadium
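Note: assigning a Series to a DataFrame column aligns on the index, not on position, so only rows 0 and 4 receive values and the rest become NaN. Roughly, the assignment behaves as if the Series were reindexed to the frame's index first:

nfl_frame['Stadium'] = stadiums.reindex(nfl_frame.index)   # same result as the assignment above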

In [37]:
del nfl_frame['Stadium']
nfl_frame


Out[37]:
   Rank               Team  Won  Lost  Tied*   Pct.  First Season  Total Games Conference
0     1     Dallas Cowboys  511   378      6  0.574          1960          894   NFC East
1     2      Chicago Bears  752   563     42  0.570          1920         1357  NFC North
2     3  Green Bay Packers  741   561     37  0.567          1921         1339  NFC North
3     4     Miami Dolphins  443   345      4  0.562          1966          792   AFC East
4     5   Baltimore Ravens  182   143      1  0.560          1996          326  AFC North

In [38]:
data = {'City':['SF','LA','NYC'], 'Population':[837000,3880000,8400000]}

In [39]:
city_frame = DataFrame(data)
city_frame


Out[39]:
  City  Population
0   SF      837000
1   LA     3880000
2  NYC     8400000

In [40]:
# Index Objects
my_ser = Series([1,2,3,4], index=['A','B','C','D'])
my_ser


Out[40]:
A    1
B    2
C    3
D    4
dtype: int64

In [42]:
my_index = my_ser.index
my_index[2]


Out[42]:
'C'

In [43]:
my_index[2:]


Out[43]:
Index(['C', 'D'], dtype='object')

In [44]:
my_index[0] = 'Z'


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-44-92ae0de335f8> in <module>()
----> 1 my_index[0] = 'Z'

/usr/lib64/python3.4/site-packages/pandas/core/index.py in __setitem__(self, key, value)
   1055 
   1056     def __setitem__(self, key, value):
-> 1057         raise TypeError("Indexes does not support mutable operations")
   1058 
   1059     def __getitem__(self, key):

TypeError: Indexes does not support mutable operations
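Note: Index objects are immutable, which is exactly what the TypeError says. To change labels, replace the whole index or relabel into a copy, a sketch (exact rename behavior can vary slightly across pandas versions):

my_ser.index = ['Z', 'B', 'C', 'D']   # replace the entire index at once
my_ser.rename({'Z': 'A'})             # or return a relabeled copy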

In [45]:
# Reindexing
from numpy.random import randn

ser1 = Series([1,2,3,4], index=['A','B','C','D'])
ser1


Out[45]:
A    1
B    2
C    3
D    4
dtype: int64

In [46]:
ser2 = ser1.reindex(['A','B','C','D','E','F'])
ser2


Out[46]:
A     1
B     2
C     3
D     4
E   NaN
F   NaN
dtype: float64

In [48]:
ser2 = ser2.reindex(['A','B','C','D','E','F','G'],fill_value=0)
ser2


Out[48]:
A     1
B     2
C     3
D     4
E   NaN
F   NaN
G     0
dtype: float64
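Note: fill_value only applies to labels introduced by this reindex call ('G'); 'E' and 'F' were already NaN in ser2, so they stay NaN. Reindexing from the original ser1 instead fills all of the new labels, a sketch:

ser1.reindex(['A','B','C','D','E','F','G'], fill_value=0)   # E, F and G all become 0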

In [49]:
ser3 = Series(['USA','Mexico','Canada'],index=[0,5,10])

In [50]:
ser3


Out[50]:
0        USA
5     Mexico
10    Canada
dtype: object

In [51]:
ranger = range(15)
print(ranger)
ser3.reindex(ranger, method='ffill')


range(0, 15)
Out[51]:
0        USA
1        USA
2        USA
3        USA
4        USA
5     Mexico
6     Mexico
7     Mexico
8     Mexico
9     Mexico
10    Canada
11    Canada
12    Canada
13    Canada
14    Canada
dtype: object

In [54]:
dframe = DataFrame(randn(25).reshape((5,5)),index=['A','B','D','E','F'],columns=['col1','col2','col3','col4','col5'])
dframe


Out[54]:
       col1      col2      col3      col4      col5
A  0.296343  1.218577 -0.441276  0.616162 -0.532933
B  0.644505  1.593440 -0.492314 -0.251620 -0.317372
D  0.252444  1.333615 -0.597128  0.140199  1.293294
E  1.106515 -2.599082  0.865308  0.730426  0.018708
F  2.674838 -0.331867  0.644406 -0.998384 -1.249751

In [58]:
dframe2 = dframe.reindex(['A','B','C','D','E','F'])
dframe2


Out[58]:
       col1      col2      col3      col4      col5
A  0.296343  1.218577 -0.441276  0.616162 -0.532933
B  0.644505  1.593440 -0.492314 -0.251620 -0.317372
C       NaN       NaN       NaN       NaN       NaN
D  0.252444  1.333615 -0.597128  0.140199  1.293294
E  1.106515 -2.599082  0.865308  0.730426  0.018708
F  2.674838 -0.331867  0.644406 -0.998384 -1.249751

In [62]:
new_columns= ['col1','col2','col3','col4','col5','col6']
dframe2.reindex(columns=new_columns)
dframe2


Out[62]:
       col1      col2      col3      col4      col5
A  0.296343  1.218577 -0.441276  0.616162 -0.532933
B  0.644505  1.593440 -0.492314 -0.251620 -0.317372
C       NaN       NaN       NaN       NaN       NaN
D  0.252444  1.333615 -0.597128  0.140199  1.293294
E  1.106515 -2.599082  0.865308  0.730426  0.018708
F  2.674838 -0.331867  0.644406 -0.998384 -1.249751
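Note: reindex returns a new object rather than modifying in place, and the result above was not assigned, so dframe2 still shows only col1-col5. Assigning the result adds the all-NaN 'col6' column, a sketch:

dframe2 = dframe2.reindex(columns=new_columns)   # now col6 appears, filled with NaN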

In [63]:
dframe.ix[['A','B','C','D','E','F'], new_columns]


Out[63]:
       col1      col2      col3      col4      col5  col6
A  0.296343  1.218577 -0.441276  0.616162 -0.532933   NaN
B  0.644505  1.593440 -0.492314 -0.251620 -0.317372   NaN
C       NaN       NaN       NaN       NaN       NaN   NaN
D  0.252444  1.333615 -0.597128  0.140199  1.293294   NaN
E  1.106515 -2.599082  0.865308  0.730426  0.018708   NaN
F  2.674838 -0.331867  0.644406 -0.998384 -1.249751   NaN
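Note: the same combined row-and-column reindex can be written without .ix in modern pandas, roughly:

dframe.reindex(index=['A','B','C','D','E','F'], columns=new_columns)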

In [64]:
# Drop Entry
ser1 = Series(np.arange(3),index=['a','b','c'])
ser1


Out[64]:
a    0
b    1
c    2
dtype: int64

In [65]:
ser1.drop('b')


Out[65]:
a    0
c    2
dtype: int64

In [66]:
dframe1 = DataFrame(np.arange(9).reshape(3,3),index=['sf','la','ny'],columns=['pop','size','year'])
dframe1


Out[66]:
    pop  size  year
sf    0     1     2
la    3     4     5
ny    6     7     8

In [70]:
dframe2 = dframe1.drop('la')
print(dframe1)
dframe2


    pop  size  year
sf    0     1     2
la    3     4     5
ny    6     7     8
Out[70]:
    pop  size  year
sf    0     1     2
ny    6     7     8

In [71]:
dframe1.drop('year',axis=1)


Out[71]:
    pop  size
sf    0     1
la    3     4
ny    6     7

In [72]:
# Selecting Entries
ser1 = Series(np.arange(3),index=['a','b','c'])
ser1 = 2*ser1
ser1


Out[72]:
a    0
b    2
c    4
dtype: int64

In [74]:
ser1['b']


Out[74]:
2

In [75]:
ser1[1]


Out[75]:
2
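Note: plain integer indexing on a Series with a non-integer index falls back to position, which newer pandas deprecates because it is ambiguous. The explicit accessors make the intent clear, a sketch:

ser1.loc['b']   # by label
ser1.iloc[1]    # by position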

In [76]:
ser1[0:3]


Out[76]:
a    0
b    2
c    4
dtype: int64

In [77]:
ser1[:]


Out[77]:
a    0
b    2
c    4
dtype: int64

In [78]:
ser1[['a','c']]


Out[78]:
a    0
c    4
dtype: int64

In [79]:
ser1[ser1>3]


Out[79]:
c    4
dtype: int64

In [80]:
ser1[ser1>3] = 10

In [81]:
ser1


Out[81]:
a     0
b     2
c    10
dtype: int64

In [82]:
dframe = DataFrame(np.arange(25).reshape((5,5)),index=['nyc','sf','la','dc','chi'],columns=['a','b','c','d','e'])
dframe


Out[82]:
      a   b   c   d   e
nyc   0   1   2   3   4
sf    5   6   7   8   9
la   10  11  12  13  14
dc   15  16  17  18  19
chi  20  21  22  23  24

In [83]:
dframe[dframe['c']>8]


Out[83]:
      a   b   c   d   e
la   10  11  12  13  14
dc   15  16  17  18  19
chi  20  21  22  23  24

In [84]:
dframe > 10


Out[84]:
         a      b      c      d      e
nyc  False  False  False  False  False
sf   False  False  False  False  False
la   False   True   True   True   True
dc    True   True   True   True   True
chi   True   True   True   True   True

In [85]:
dframe.ix['la'] # select the 'la' row by label; it comes back as a Series, so it looks transposed


Out[85]:
a    10
b    11
c    12
d    13
e    14
Name: la, dtype: int64

In [86]:
# Data Alignment
ser1 = Series([0,1,2],index=['a','b','c'])
ser1


Out[86]:
a    0
b    1
c    2
dtype: int64

In [87]:
ser2 = Series([3,4,5,6],index=['a','b','c','d'])
ser2


Out[87]:
a    3
b    4
c    5
d    6
dtype: int64

In [88]:
ser1+ser2


Out[88]:
a     3
b     5
c     7
d   NaN
dtype: float64

In [91]:
dframe1 = DataFrame(np.arange(4).reshape((2,2)), columns=list('AB'),index=['nyc','la'])
dframe1


Out[91]:
     A  B
nyc  0  1
la   2  3

In [92]:
dframe2 = DataFrame(np.arange(9).reshape((3,3)),columns=list('ADC'),index=['nyc','la','sf'])
dframe2


Out[92]:
     A  D  C
nyc  0  1  2
la   3  4  5
sf   6  7  8

In [93]:
dframe1+dframe2


Out[93]:
      A   B   C   D
la    5 NaN NaN NaN
nyc   0 NaN NaN NaN
sf  NaN NaN NaN NaN

In [94]:
dframe1.add(dframe2,fill_value=0)


Out[94]:
     A   B  C  D
la   5   3  5  4
nyc  0   1  2  1
sf   6 NaN  8  7
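Note: with fill_value=0 a missing entry is treated as 0 whenever the other frame has a value for that label; the (sf, B) cell exists in neither frame, so it remains NaN.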

In [96]:
ser3 = dframe2.ix[0]
ser3


Out[96]:
A    0
D    1
C    2
Name: nyc, dtype: int64

In [97]:
dframe2-ser3


Out[97]:
     A  D  C
nyc  0  0  0
la   3  3  3
sf   6  6  6

In [99]:
# Ranking and Sorting
ser1 = Series(range(3),index=['c','a','e'])
ser1


Out[99]:
c    0
a    1
e    2
dtype: int64

In [100]:
ser1.sort_index() # by index


Out[100]:
a    1
c    0
e    2
dtype: int64

In [101]:
ser1.order() # by value


Out[101]:
c    0
a    1
e    2
dtype: int64
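Note: Series.order() (and the in-place Series.sort() used below) were removed in later pandas releases; sort_values() is the replacement, a sketch:

ser1.sort_values()              # sorted by value, returns a new Series
ser1.sort_values(inplace=True)  # or sort in place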

In [103]:
ser2 = Series(randn(10))
ser2


Out[103]:
0    0.269250
1    0.071230
2   -1.713685
3   -0.700515
4    0.698345
5   -0.751510
6    0.012405
7   -0.373998
8   -0.074376
9   -1.200380
dtype: float64

In [104]:
ser2.sort()
ser2.rank()


Out[104]:
2     1
9     2
5     3
3     4
7     5
8     6
6     7
1     8
0     9
4    10
dtype: float64

In [105]:
ser3 = Series(randn(10))
ser3.rank()


Out[105]:
0     8
1     4
2     7
3     1
4     9
5     3
6     5
7    10
8     6
9     2
dtype: float64

In [106]:
ser3.sort()
ser3.rank()


Out[106]:
3     1
9     2
5     3
1     4
6     5
8     6
2     7
0     8
4     9
7    10
dtype: float64

In [107]:
# Summary Statistics
arr = np.array([[1,2,np.nan],[np.nan,3,4]])
dframe1 = DataFrame(arr,index=['A','B'],columns=['one','two','three'])
dframe1


Out[107]:
   one  two  three
A    1    2    NaN
B  NaN    3      4

In [108]:
dframe1.sum() # column sums; NaN values are skipped


Out[108]:
one      1
two      5
three    4
dtype: float64
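Note: most reductions skip NaN by default; pass skipna=False to propagate missing values instead, a sketch:

dframe1.sum(skipna=False)   # any column containing NaN sums to NaN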

In [109]:
dframe1.sum(axis=1)


Out[109]:
A    3
B    7
dtype: float64

In [110]:
dframe1.min()


Out[110]:
one      1
two      2
three    4
dtype: float64

In [111]:
dframe1.idxmin()


Out[111]:
one      A
two      A
three    B
dtype: object

In [112]:
dframe1.cumsum() # cumulative sum down each column; only 'two' changes in row B (2 + 3 = 5), NaN entries stay NaN


Out[112]:
   one  two  three
A    1    2    NaN
B  NaN    5      4

In [113]:
dframe1.describe() # Some basic descriptive statistics


Out[113]:
       one       two  three
count    1  2.000000      1
mean     1  2.500000      4
std    NaN  0.707107    NaN
min      1  2.000000      4
25%      1  2.250000      4
50%      1  2.500000      4
75%      1  2.750000      4
max      1  3.000000      4

In [114]:
from IPython.display import YouTubeVideo

In [115]:
YouTubeVideo('xGbpuFNR1ME') # Covariance Video


Out[115]:

In [116]:
YouTubeVideo('4EXNedimDMs') # Correlation video


Out[116]:

In [118]:
import pandas.io.data as pdweb
import datetime

prices = pdweb.get_data_yahoo(['CVX','XOM','BP'],start=datetime.datetime(2010,1,1), end=datetime.datetime(2014,1,1))['Adj Close']
prices.head()


Out[118]:
                   BP        CVX        XOM
Date
2010-01-04  45.569012  63.917128  58.703696
2010-01-05  45.892578  64.369872  58.932905
2010-01-06  46.131402  64.377952  59.442264
2010-01-07  46.115994  64.135417  59.255504
2010-01-08  46.223849  64.248601  59.017797
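Note: pandas.io.data was split out of pandas and later removed; the equivalent functionality lives in the separate pandas-datareader package, and the Yahoo endpoint itself has changed over the years, so treat this as a rough sketch rather than a drop-in replacement:

# assumes pandas-datareader is installed and the 'yahoo' source is currently working
import pandas_datareader.data as pdweb
cvx = pdweb.DataReader('CVX', 'yahoo',
                       start=datetime.datetime(2010, 1, 1),
                       end=datetime.datetime(2014, 1, 1))['Adj Close']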

In [119]:
volume = pdweb.get_data_yahoo(['CVX','XOM','BP'],start=datetime.datetime(2010,1,1), end=datetime.datetime(2014,1,1))['Volume']

In [120]:
volume.head()


Out[120]:
                 BP       CVX       XOM
Date
2010-01-04  3956100  10173800  27809100
2010-01-05  4109600  10593700  30174700
2010-01-06  6227900  11014600  35044700
2010-01-07  4431300   9626900  27192100
2010-01-08  3786100   5624300  24891800

In [121]:
rets = prices.pct_change()
corr = rets.corr()  # correlation matrix of the daily returns (note the parentheses; without them this binds the method, not its result)

In [122]:
%matplotlib inline
prices.plot()


Out[122]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3aa86d9940>

In [124]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.corrplot(rets,annot=False, diag_names=False)


/usr/lib64/python3.4/site-packages/seaborn/linearmodels.py:1283: UserWarning: The `corrplot` function has been deprecated in favor of `heatmap` and will be removed in a forthcoming release. Please update your code.
  warnings.warn(("The `corrplot` function has been deprecated in favor "
/usr/lib64/python3.4/site-packages/seaborn/linearmodels.py:1349: UserWarning: The `symmatplot` function has been deprecated in favor of `heatmap` and will be removed in a forthcoming release. Please update your code.
  warnings.warn(("The `symmatplot` function has been deprecated in favor "
Out[124]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3aa15c60f0>
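Note: as the warnings say, corrplot is deprecated; heatmap is the suggested replacement, a sketch:

sns.heatmap(rets.corr(), annot=True)   # correlation matrix of the daily returns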

In [128]:
ser1 = Series(['w','w','x','y','z','w','a'])
ser1.unique()


Out[128]:
array(['w', 'x', 'y', 'z', 'a'], dtype=object)

In [129]:
# Missing Data
data = Series(['one','two',np.nan,'four'])

In [130]:
data


Out[130]:
0     one
1     two
2     NaN
3    four
dtype: object

In [131]:
data.isnull()


Out[131]:
0    False
1    False
2     True
3    False
dtype: bool

In [132]:
data.dropna() # Drops null values


Out[132]:
0     one
1     two
3    four
dtype: object

In [133]:
dframe = DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan,np.nan,np.nan]])
dframe


Out[133]:
     0    1    2
0    1    2    3
1  NaN    5    6
2    7  NaN    9
3  NaN  NaN  NaN

In [134]:
clean_dframe = dframe.dropna()
clean_dframe


Out[134]:
   0  1  2
0  1  2  3

In [135]:
dframe.dropna(how='all') # Drops rows with all nulls


Out[135]:
     0    1  2
0    1    2  3
1  NaN    5  6
2    7  NaN  9

In [136]:
dframe.dropna(axis=1) # Drops any column containing a null; every column here has one, so nothing is left


Out[136]:
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3]

In [137]:
npn = np.nan
dframe2 = DataFrame([[1,2,3,npn],[2,npn,5,6],[npn,7,npn,9],[1,npn,npn,npn]])
dframe2


Out[137]:
    0   1   2   3
0   1   2   3 NaN
1   2 NaN   5   6
2 NaN   7 NaN   9
3   1 NaN NaN NaN

In [138]:
dframe2.dropna(thresh=2)


Out[138]:
    0   1   2   3
0   1   2   3 NaN
1   2 NaN   5   6
2 NaN   7 NaN   9

In [139]:
dframe2.fillna(1)


Out[139]:
   0  1  2  3
0  1  2  3  1
1  2  1  5  6
2  1  7  1  9
3  1  1  1  1

In [140]:
dframe2.fillna({0:0,1:1,2:2,3:3})


Out[140]:
   0  1  2  3
0  1  2  3  3
1  2  1  5  6
2  0  7  2  9
3  1  1  2  3

In [141]:
dframe2


Out[141]:
    0   1   2   3
0   1   2   3 NaN
1   2 NaN   5   6
2 NaN   7 NaN   9
3   1 NaN NaN NaN
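Note: fillna (like dropna) returns a new object, which is why dframe2 is unchanged here; pass inplace=True (or reassign) to keep the filled values, a sketch:

dframe2.fillna(0, inplace=True)   # modify dframe2 itself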

In [142]:
# Index Hierarchy
ser = Series(randn(6), index = [[1,1,1,2,2,2],['a','b','c','a','b','c']])

In [143]:
ser


Out[143]:
1  a    0.773747
   b   -0.787147
   c   -0.814421
2  a    1.011292
   b   -2.150522
   c   -1.224728
dtype: float64

In [144]:
ser.index


Out[144]:
MultiIndex(levels=[[1, 2], ['a', 'b', 'c']],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [145]:
ser[:,'a']


Out[145]:
1    0.773747
2    1.011292
dtype: float64

In [146]:
dframe = ser.unstack()

In [147]:
dframe


Out[147]:
          a         b         c
1  0.773747 -0.787147 -0.814421
2  1.011292 -2.150522 -1.224728
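Note: unstack pivots the inner index level into columns; stack reverses it, a sketch:

dframe.stack()   # back to the hierarchically indexed Series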

In [ ]: