In [1]:
# Pandas
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
obj = Series([3,6,9,12])
obj


Out[2]:
0     3
1     6
2     9
3    12
dtype: int64

In [3]:
obj.values


Out[3]:
array([ 3,  6,  9, 12])

In [4]:
obj.index


Out[4]:
Int64Index([0, 1, 2, 3], dtype='int64')

In [6]:
ww2_cas = Series([8700000,4300000,3000000,2100000,400000], index=['USSR','Germany','China','Japan','USA'])
ww2_cas


Out[6]:
USSR       8700000
Germany    4300000
China      3000000
Japan      2100000
USA         400000
dtype: int64

In [7]:
ww2_cas['USA']


Out[7]:
400000

In [8]:
## Check which countries had casualties greater than 4 mil
ww2_cas[ww2_cas > 4000000]


Out[8]:
USSR       8700000
Germany    4300000
dtype: int64

In [9]:
'USSR' in ww2_cas


Out[9]:
True

In [10]:
ww2_dict = ww2_cas.to_dict()
ww2_dict


Out[10]:
{'China': 3000000,
 'Germany': 4300000,
 'Japan': 2100000,
 'USA': 400000,
 'USSR': 8700000}

In [11]:
ww2_series = Series(ww2_dict)
ww2_series


Out[11]:
China      3000000
Germany    4300000
Japan      2100000
USA         400000
USSR       8700000
dtype: int64

In [14]:
countries = ['China','germany','Japan','USA','USSR']
obj2 = Series(ww2_dict, index = countries)
obj2


Out[14]:
China      3000000
germany        NaN
Japan      2100000
USA         400000
USSR       8700000
dtype: float64
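Note: the lowercase 'germany' does not match the dictionary key 'Germany', so pandas finds no value for that label, fills it with NaN, and upcasts the Series to float64. A minimal sketch of one way to avoid the mismatch, assuming the intent was to reuse the original labels:

# hypothetical fix: make the requested labels match the dict keys exactly
countries = ['China', 'Germany', 'Japan', 'USA', 'USSR']
obj2 = Series(ww2_dict, index=countries)   # no NaN, dtype stays int64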

In [15]:
pd.isnull(obj2)


Out[15]:
China      False
germany     True
Japan      False
USA        False
USSR       False
dtype: bool

In [16]:
pd.notnull(obj2)


Out[16]:
China       True
germany    False
Japan       True
USA         True
USSR        True
dtype: bool

In [17]:
ww2_series + obj2


Out[17]:
China       6000000
Germany         NaN
Japan       4200000
USA          800000
USSR       17400000
germany         NaN
dtype: float64
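Note: Series arithmetic aligns on the union of the two indexes, and any label missing on either side (or holding NaN, as 'germany' does) comes out NaN. A hedged sketch using add() with fill_value, which treats a label missing from one side as 0 (labels missing or NaN on both sides still produce NaN):

ww2_series.add(obj2, fill_value=0)   # 'Germany' becomes 4300000; 'germany' stays NaN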

In [18]:
obj2.name = "World War 2 Casualties"
obj2


Out[18]:
China      3000000
germany        NaN
Japan      2100000
USA         400000
USSR       8700000
Name: World War 2 Casualties, dtype: float64

In [21]:
obj2.index.name = 'countries'
obj2


Out[21]:
countries
China      3000000
germany        NaN
Japan      2100000
USA         400000
USSR       8700000
Name: World War 2 Casualties, dtype: float64

In [22]:
# DataFrames
import webbrowser
website = 'http://en.wikipedia.org/wiki/NFL_win-loss_records'
webbrowser.open(website)


Out[22]:
True

In [23]:
nfl_frame = pd.read_clipboard()

In [24]:
nfl_frame


Out[24]:
   Rank               Team  Won  Lost  Tied*   Pct.  First Season  Total Games Conference
0     1     Dallas Cowboys  511   378      6  0.574          1960          894   NFC East
1     2      Chicago Bears  752   563     42  0.570          1920         1357  NFC North
2     3  Green Bay Packers  741   561     37  0.567          1921         1339  NFC North
3     4     Miami Dolphins  443   345      4  0.562          1966          792   AFC East
4     5   Baltimore Ravens  182   143      1  0.560          1996          326  AFC North
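Note: pd.read_clipboard() parses whatever table is currently on the system clipboard, so the cell above only works after the first few rows of the Wikipedia table have been copied by hand. A clipboard-free sketch (assumes lxml or html5lib is installed; the table's position on the page may differ):

tables = pd.read_html(website)   # list of DataFrames, one per HTML table on the page
nfl_frame = tables[0]            # hypothetical: pick whichever table holds the win-loss records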

In [25]:
nfl_frame.columns


Out[25]:
Index(['Rank ', 'Team ', 'Won ', 'Lost ', 'Tied* ', 'Pct. ', 'First Season ',
       'Total Games ', 'Conference'],
      dtype='object')

In [29]:
nfl_frame['Rank ']


Out[29]:
0    1
1    2
2    3
3    4
4    5
Name: Rank , dtype: int64
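Note: the clipboard paste left trailing spaces in the column labels ('Rank ', 'Team ', ...), which is why they must be quoted exactly as shown. A minimal cleanup sketch:

nfl_frame.columns = nfl_frame.columns.str.strip()   # 'Rank ' -> 'Rank', etc.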

In [30]:
DataFrame(nfl_frame, columns=['Team ','First Season ', 'Stadium'])


Out[30]:
                Team  First Season  Stadium
0     Dallas Cowboys          1960      NaN
1      Chicago Bears          1920      NaN
2  Green Bay Packers          1921      NaN
3     Miami Dolphins          1966      NaN
4   Baltimore Ravens          1996      NaN

In [31]:
nfl_frame.head(2)


Out[31]:
   Rank            Team  Won  Lost  Tied*   Pct.  First Season  Total Games Conference
0     1  Dallas Cowboys  511   378      6  0.574          1960          894   NFC East
1     2   Chicago Bears  752   563     42  0.570          1920         1357  NFC North

In [32]:
nfl_frame.ix[3]


Out[32]:
Rank                           4
Team             Miami Dolphins 
Won                          443
Lost                         345
Tied*                          4
Pct.                       0.562
First Season                1966
Total Games                  792
Conference              AFC East
Name: 3, dtype: object
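Note: the .ix indexer (used here and in a few later cells) is deprecated and has been removed from modern pandas. Equivalent label- and position-based lookups, as a sketch:

nfl_frame.loc[3]    # by label
nfl_frame.iloc[3]   # by integer position (the same row here, since the index is 0..4)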

In [33]:
nfl_frame['Stadium'] = "Levi's Stadium"
nfl_frame


Out[33]:
   Rank               Team  Won  Lost  Tied*   Pct.  First Season  Total Games Conference         Stadium
0     1     Dallas Cowboys  511   378      6  0.574          1960          894   NFC East  Levi's Stadium
1     2      Chicago Bears  752   563     42  0.570          1920         1357  NFC North  Levi's Stadium
2     3  Green Bay Packers  741   561     37  0.567          1921         1339  NFC North  Levi's Stadium
3     4     Miami Dolphins  443   345      4  0.562          1966          792   AFC East  Levi's Stadium
4     5   Baltimore Ravens  182   143      1  0.560          1996          326  AFC North  Levi's Stadium

In [35]:
nfl_frame['Stadium'] = np.arange(5)
stadiums = Series(["Levi's Stadium", "ATT Stadium"], index=[4,0])
stadiums


Out[35]:
4    Levi's Stadium
0       ATT Stadium
dtype: object

In [36]:
nfl_frame['Stadium'] = stadiums
nfl_frame


Out[36]:
   Rank               Team  Won  Lost  Tied*   Pct.  First Season  Total Games Conference         Stadium
0     1     Dallas Cowboys  511   378      6  0.574          1960          894   NFC East     ATT Stadium
1     2      Chicago Bears  752   563     42  0.570          1920         1357  NFC North             NaN
2     3  Green Bay Packers  741   561     37  0.567          1921         1339  NFC North             NaN
3     4     Miami Dolphins  443   345      4  0.562          1966          792   AFC East             NaN
4     5   Baltimore Ravens  182   143      1  0.560          1996          326  AFC North  Levi's Stadium
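Note: assigning a Series to a DataFrame column aligns on the index, not on position, so only rows 0 and 4 receive values and the rest become NaN. Roughly, the assignment behaves as if the Series were reindexed to the frame's index first:

nfl_frame['Stadium'] = stadiums.reindex(nfl_frame.index)   # same result as the assignment above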

In [37]:
del nfl_frame['Stadium']
nfl_frame


Out[37]:
   Rank               Team  Won  Lost  Tied*   Pct.  First Season  Total Games Conference
0     1     Dallas Cowboys  511   378      6  0.574          1960          894   NFC East
1     2      Chicago Bears  752   563     42  0.570          1920         1357  NFC North
2     3  Green Bay Packers  741   561     37  0.567          1921         1339  NFC North
3     4     Miami Dolphins  443   345      4  0.562          1966          792   AFC East
4     5   Baltimore Ravens  182   143      1  0.560          1996          326  AFC North

In [38]:
data = {'City':['SF','LA','NYC'], 'Population':[837000,3880000,8400000]}

In [39]:
city_frame = DataFrame(data)
city_frame


Out[39]:
  City  Population
0   SF      837000
1   LA     3880000
2  NYC     8400000

In [40]:
# Index Objects
my_ser = Series([1,2,3,4], index=['A','B','C','D'])
my_ser


Out[40]:
A    1
B    2
C    3
D    4
dtype: int64

In [42]:
my_index = my_ser.index
my_index[2]


Out[42]:
'C'

In [43]:
my_index[2:]


Out[43]:
Index(['C', 'D'], dtype='object')

In [44]:
my_index[0] = 'Z'


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-44-92ae0de335f8> in <module>()
----> 1 my_index[0] = 'Z'

/usr/lib64/python3.4/site-packages/pandas/core/index.py in __setitem__(self, key, value)
   1055 
   1056     def __setitem__(self, key, value):
-> 1057         raise TypeError("Indexes does not support mutable operations")
   1058 
   1059     def __getitem__(self, key):

TypeError: Indexes does not support mutable operations
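Note: Index objects are immutable, which is exactly what the TypeError says. To change labels, replace the whole index or relabel into a copy, a sketch (exact rename behavior can vary slightly across pandas versions):

my_ser.index = ['Z', 'B', 'C', 'D']   # replace the entire index at once
my_ser.rename({'Z': 'A'})             # or return a relabeled copy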

In [45]:
# Reindexing
from numpy.random import randn

ser1 = Series([1,2,3,4], index=['A','B','C','D'])
ser1


Out[45]:
A    1
B    2
C    3
D    4
dtype: int64

In [46]:
ser2 = ser1.reindex(['A','B','C','D','E','F'])
ser2


Out[46]:
A     1
B     2
C     3
D     4
E   NaN
F   NaN
dtype: float64

In [48]:
ser2 = ser2.reindex(['A','B','C','D','E','F','G'],fill_value=0)
ser2


Out[48]:
A     1
B     2
C     3
D     4
E   NaN
F   NaN
G     0
dtype: float64
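Note: fill_value only applies to labels introduced by this reindex call ('G'); 'E' and 'F' were already NaN in ser2, so they stay NaN. Reindexing from the original ser1 instead fills all of the new labels, a sketch:

ser1.reindex(['A','B','C','D','E','F','G'], fill_value=0)   # E, F and G all become 0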

In [49]:
ser3 = Series(['USA','Mexico','Canada'],index=[0,5,10])

In [50]:
ser3


Out[50]:
0        USA
5     Mexico
10    Canada
dtype: object

In [51]:
ranger = range(15)
print(ranger)
ser3.reindex(ranger, method='ffill')


range(0, 15)
Out[51]:
0        USA
1        USA
2        USA
3        USA
4        USA
5     Mexico
6     Mexico
7     Mexico
8     Mexico
9     Mexico
10    Canada
11    Canada
12    Canada
13    Canada
14    Canada
dtype: object

In [54]:
dframe = DataFrame(randn(25).reshape((5,5)),index=['A','B','D','E','F'],columns=['col1','col2','col3','col4','col5'])
dframe


Out[54]:
       col1      col2      col3      col4      col5
A  0.296343  1.218577 -0.441276  0.616162 -0.532933
B  0.644505  1.593440 -0.492314 -0.251620 -0.317372
D  0.252444  1.333615 -0.597128  0.140199  1.293294
E  1.106515 -2.599082  0.865308  0.730426  0.018708
F  2.674838 -0.331867  0.644406 -0.998384 -1.249751

In [58]:
dframe2 = dframe.reindex(['A','B','C','D','E','F'])
dframe2


Out[58]:
       col1      col2      col3      col4      col5
A  0.296343  1.218577 -0.441276  0.616162 -0.532933
B  0.644505  1.593440 -0.492314 -0.251620 -0.317372
C       NaN       NaN       NaN       NaN       NaN
D  0.252444  1.333615 -0.597128  0.140199  1.293294
E  1.106515 -2.599082  0.865308  0.730426  0.018708
F  2.674838 -0.331867  0.644406 -0.998384 -1.249751

In [62]:
new_columns= ['col1','col2','col3','col4','col5','col6']
dframe2.reindex(columns=new_columns)
dframe2


Out[62]:
       col1      col2      col3      col4      col5
A  0.296343  1.218577 -0.441276  0.616162 -0.532933
B  0.644505  1.593440 -0.492314 -0.251620 -0.317372
C       NaN       NaN       NaN       NaN       NaN
D  0.252444  1.333615 -0.597128  0.140199  1.293294
E  1.106515 -2.599082  0.865308  0.730426  0.018708
F  2.674838 -0.331867  0.644406 -0.998384 -1.249751
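Note: reindex returns a new object rather than modifying in place, and the result above was not assigned, so dframe2 still shows only col1-col5. Assigning the result adds the all-NaN 'col6' column, a sketch:

dframe2 = dframe2.reindex(columns=new_columns)   # now col6 appears, filled with NaN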

In [63]:
dframe.ix[['A','B','C','D','E','F'], new_columns]


Out[63]:
       col1      col2      col3      col4      col5  col6
A  0.296343  1.218577 -0.441276  0.616162 -0.532933   NaN
B  0.644505  1.593440 -0.492314 -0.251620 -0.317372   NaN
C       NaN       NaN       NaN       NaN       NaN   NaN
D  0.252444  1.333615 -0.597128  0.140199  1.293294   NaN
E  1.106515 -2.599082  0.865308  0.730426  0.018708   NaN
F  2.674838 -0.331867  0.644406 -0.998384 -1.249751   NaN
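Note: the same combined row-and-column reindex can be written without .ix in modern pandas, roughly:

dframe.reindex(index=['A','B','C','D','E','F'], columns=new_columns)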

In [64]:
# Drop Entry
ser1 = Series(np.arange(3),index=['a','b','c'])
ser1


Out[64]:
a    0
b    1
c    2
dtype: int64

In [65]:
ser1.drop('b')


Out[65]:
a    0
c    2
dtype: int64

In [66]:
dframe1 = DataFrame(np.arange(9).reshape(3,3),index=['sf','la','ny'],columns=['pop','size','year'])
dframe1


Out[66]:
    pop  size  year
sf    0     1     2
la    3     4     5
ny    6     7     8

In [70]:
dframe2 = dframe1.drop('la')
print(dframe1)
dframe2


    pop  size  year
sf    0     1     2
la    3     4     5
ny    6     7     8
Out[70]:
    pop  size  year
sf    0     1     2
ny    6     7     8

In [71]:
dframe1.drop('year',axis=1)


Out[71]:
    pop  size
sf    0     1
la    3     4
ny    6     7

In [72]:
# Selecting Entries
ser1 = Series(np.arange(3),index=['a','b','c'])
ser1 = 2*ser1
ser1


Out[72]:
a    0
b    2
c    4
dtype: int64

In [74]:
ser1['b']


Out[74]:
2

In [75]:
ser1[1]


Out[75]:
2
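Note: plain integer indexing on a Series with a non-integer index falls back to position, which newer pandas deprecates because it is ambiguous. The explicit accessors make the intent clear, a sketch:

ser1.loc['b']   # by label
ser1.iloc[1]    # by position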

In [76]:
ser1[0:3]


Out[76]:
a    0
b    2
c    4
dtype: int64

In [77]:
ser1[:]


Out[77]:
a    0
b    2
c    4
dtype: int64

In [78]:
ser1[['a','c']]


Out[78]:
a    0
c    4
dtype: int64

In [79]:
ser1[ser1>3]


Out[79]:
c    4
dtype: int64

In [80]:
ser1[ser1>3] = 10

In [81]:
ser1


Out[81]:
a     0
b     2
c    10
dtype: int64

In [82]:
dframe = DataFrame(np.arange(25).reshape((5,5)),index=['nyc','sf','la','dc','chi'],columns=['a','b','c','d','e'])
dframe


Out[82]:
      a   b   c   d   e
nyc   0   1   2   3   4
sf    5   6   7   8   9
la   10  11  12  13  14
dc   15  16  17  18  19
chi  20  21  22  23  24

In [83]:
dframe[dframe['c']>8]


Out[83]:
      a   b   c   d   e
la   10  11  12  13  14
dc   15  16  17  18  19
chi  20  21  22  23  24

In [84]:
dframe > 10


Out[84]:
         a      b      c      d      e
nyc  False  False  False  False  False
sf   False  False  False  False  False
la   False   True   True   True   True
dc    True   True   True   True   True
chi   True   True   True   True   True

In [85]:
dframe.ix['la'] # select the 'la' row by label; it comes back as a Series, so it looks transposed


Out[85]:
a    10
b    11
c    12
d    13
e    14
Name: la, dtype: int64

In [86]:
# Data Alignment
ser1 = Series([0,1,2],index=['a','b','c'])
ser1


Out[86]:
a    0
b    1
c    2
dtype: int64

In [87]:
ser2 = Series([3,4,5,6],index=['a','b','c','d'])
ser2


Out[87]:
a    3
b    4
c    5
d    6
dtype: int64

In [88]:
ser1+ser2


Out[88]:
a     3
b     5
c     7
d   NaN
dtype: float64

In [91]:
dframe1 = DataFrame(np.arange(4).reshape((2,2)), columns=list('AB'),index=['nyc','la'])
dframe1


Out[91]:
     A  B
nyc  0  1
la   2  3

In [92]:
dframe2 = DataFrame(np.arange(9).reshape((3,3)),columns=list('ADC'),index=['nyc','la','sf'])
dframe2


Out[92]:
     A  D  C
nyc  0  1  2
la   3  4  5
sf   6  7  8

In [93]:
dframe1+dframe2


Out[93]:
      A   B   C   D
la    5 NaN NaN NaN
nyc   0 NaN NaN NaN
sf  NaN NaN NaN NaN

In [94]:
dframe1.add(dframe2,fill_value=0)


Out[94]:
     A   B  C  D
la   5   3  5  4
nyc  0   1  2  1
sf   6 NaN  8  7
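Note: with fill_value=0 a missing entry is treated as 0 whenever the other frame has a value for that label; the (sf, B) cell exists in neither frame, so it remains NaN.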

In [96]:
ser3 = dframe2.ix[0]
ser3


Out[96]:
A    0
D    1
C    2
Name: nyc, dtype: int64

In [97]:
dframe2-ser3


Out[97]:
     A  D  C
nyc  0  0  0
la   3  3  3
sf   6  6  6

In [99]:
# Ranking and Sorting
ser1 = Series(range(3),index=['c','a','e'])
ser1


Out[99]:
c    0
a    1
e    2
dtype: int64

In [100]:
ser1.sort_index() # by index


Out[100]:
a    1
c    0
e    2
dtype: int64

In [101]:
ser1.order() # by value


Out[101]:
c    0
a    1
e    2
dtype: int64
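Note: Series.order() (and the in-place Series.sort() used below) were removed in later pandas releases; sort_values() is the replacement, a sketch:

ser1.sort_values()              # sorted by value, returns a new Series
ser1.sort_values(inplace=True)  # or sort in place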

In [103]:
ser2 = Series(randn(10))
ser2


Out[103]:
0    0.269250
1    0.071230
2   -1.713685
3   -0.700515
4    0.698345
5   -0.751510
6    0.012405
7   -0.373998
8   -0.074376
9   -1.200380
dtype: float64

In [104]:
ser2.sort()
ser2.rank()


Out[104]:
2     1
9     2
5     3
3     4
7     5
8     6
6     7
1     8
0     9
4    10
dtype: float64

In [105]:
ser3 = Series(randn(10))
ser3.rank()


Out[105]:
0     8
1     4
2     7
3     1
4     9
5     3
6     5
7    10
8     6
9     2
dtype: float64

In [106]:
ser3.sort()
ser3.rank()


Out[106]:
3     1
9     2
5     3
1     4
6     5
8     6
2     7
0     8
4     9
7    10
dtype: float64

In [107]:
# Summary Statistics
arr = np.array([[1,2,np.nan],[np.nan,3,4]])
dframe1 = DataFrame(arr,index=['A','B'],columns=['one','two','three'])
dframe1


Out[107]:
   one  two  three
A    1    2    NaN
B  NaN    3      4

In [108]:
dframe1.sum() # column sums; NaN values are skipped


Out[108]:
one      1
two      5
three    4
dtype: float64
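Note: most reductions skip NaN by default; pass skipna=False to propagate missing values instead, a sketch:

dframe1.sum(skipna=False)   # any column containing NaN sums to NaN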

In [109]:
dframe1.sum(axis=1)


Out[109]:
A    3
B    7
dtype: float64

In [110]:
dframe1.min()


Out[110]:
one      1
two      2
three    4
dtype: float64

In [111]:
dframe1.idxmin()


Out[111]:
one      A
two      A
three    B
dtype: object

In [112]:
dframe1.cumsum() # cumulative sum down each column; only 'two' changes in row B (2 + 3 = 5), NaN entries stay NaN


Out[112]:
   one  two  three
A    1    2    NaN
B  NaN    5      4

In [113]:
dframe1.describe() # Some basic descriptive statistics


Out[113]:
       one       two  three
count    1  2.000000      1
mean     1  2.500000      4
std    NaN  0.707107    NaN
min      1  2.000000      4
25%      1  2.250000      4
50%      1  2.500000      4
75%      1  2.750000      4
max      1  3.000000      4

In [114]:
from IPython.display import YouTubeVideo

In [115]:
YouTubeVideo('xGbpuFNR1ME') # Covariance Video


Out[115]:

In [116]:
YouTubeVideo('4EXNedimDMs') # Correlation video


Out[116]:

In [118]:
import pandas.io.data as pdweb
import datetime

prices = pdweb.get_data_yahoo(['CVX','XOM','BP'],start=datetime.datetime(2010,1,1), end=datetime.datetime(2014,1,1))['Adj Close']
prices.head()


Out[118]:
                   BP        CVX        XOM
Date
2010-01-04  45.569012  63.917128  58.703696
2010-01-05  45.892578  64.369872  58.932905
2010-01-06  46.131402  64.377952  59.442264
2010-01-07  46.115994  64.135417  59.255504
2010-01-08  46.223849  64.248601  59.017797
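Note: pandas.io.data was split out of pandas and later removed; the equivalent functionality lives in the separate pandas-datareader package, and the Yahoo endpoint itself has changed over the years, so treat this as a rough sketch rather than a drop-in replacement:

# assumes pandas-datareader is installed and the 'yahoo' source is currently working
import pandas_datareader.data as pdweb
cvx = pdweb.DataReader('CVX', 'yahoo',
                       start=datetime.datetime(2010, 1, 1),
                       end=datetime.datetime(2014, 1, 1))['Adj Close']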

In [119]:
volume = pdweb.get_data_yahoo(['CVX','XOM','BP'],start=datetime.datetime(2010,1,1), end=datetime.datetime(2014,1,1))['Volume']

In [120]:
volume.head()


Out[120]:
                 BP       CVX       XOM
Date
2010-01-04  3956100  10173800  27809100
2010-01-05  4109600  10593700  30174700
2010-01-06  6227900  11014600  35044700
2010-01-07  4431300   9626900  27192100
2010-01-08  3786100   5624300  24891800

In [121]:
rets = prices.pct_change()
corr = rets.corr()  # correlation matrix of the daily returns (note the parentheses; without them this binds the method, not its result)

In [122]:
%matplotlib inline
prices.plot()


Out[122]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3aa86d9940>

In [124]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.corrplot(rets,annot=False, diag_names=False)


/usr/lib64/python3.4/site-packages/seaborn/linearmodels.py:1283: UserWarning: The `corrplot` function has been deprecated in favor of `heatmap` and will be removed in a forthcoming release. Please update your code.
  warnings.warn(("The `corrplot` function has been deprecated in favor "
/usr/lib64/python3.4/site-packages/seaborn/linearmodels.py:1349: UserWarning: The `symmatplot` function has been deprecated in favor of `heatmap` and will be removed in a forthcoming release. Please update your code.
  warnings.warn(("The `symmatplot` function has been deprecated in favor "
Out[124]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3aa15c60f0>
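Note: as the warnings say, corrplot is deprecated; heatmap is the suggested replacement, a sketch:

sns.heatmap(rets.corr(), annot=True)   # correlation matrix of the daily returns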

In [128]:
ser1 = Series(['w','w','x','y','z','w','a'])
ser1.unique()


Out[128]:
array(['w', 'x', 'y', 'z', 'a'], dtype=object)

In [129]:
# Missing Data
data = Series(['one','two',np.nan,'four'])

In [130]:
data


Out[130]:
0     one
1     two
2     NaN
3    four
dtype: object

In [131]:
data.isnull()


Out[131]:
0    False
1    False
2     True
3    False
dtype: bool

In [132]:
data.dropna() # Drops null values


Out[132]:
0     one
1     two
3    four
dtype: object

In [133]:
dframe = DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan,np.nan,np.nan]])
dframe


Out[133]:
     0    1    2
0    1    2    3
1  NaN    5    6
2    7  NaN    9
3  NaN  NaN  NaN

In [134]:
clean_dframe = dframe.dropna()
clean_dframe


Out[134]:
   0  1  2
0  1  2  3

In [135]:
dframe.dropna(how='all') # Drops rows with all nulls


Out[135]:
     0    1  2
0    1    2  3
1  NaN    5  6
2    7  NaN  9

In [136]:
dframe.dropna(axis=1) # Drops any column containing a null; every column here has one, so nothing is left


Out[136]:
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3]

In [137]:
npn = np.nan
dframe2 = DataFrame([[1,2,3,npn],[2,npn,5,6],[npn,7,npn,9],[1,npn,npn,npn]])
dframe2


Out[137]:
    0   1   2   3
0   1   2   3 NaN
1   2 NaN   5   6
2 NaN   7 NaN   9
3   1 NaN NaN NaN

In [138]:
dframe2.dropna(thresh=2)


Out[138]:
    0   1   2   3
0   1   2   3 NaN
1   2 NaN   5   6
2 NaN   7 NaN   9

In [139]:
dframe2.fillna(1)


Out[139]:
   0  1  2  3
0  1  2  3  1
1  2  1  5  6
2  1  7  1  9
3  1  1  1  1

In [140]:
dframe2.fillna({0:0,1:1,2:2,3:3})


Out[140]:
   0  1  2  3
0  1  2  3  3
1  2  1  5  6
2  0  7  2  9
3  1  1  2  3

In [141]:
dframe2


Out[141]:
    0   1   2   3
0   1   2   3 NaN
1   2 NaN   5   6
2 NaN   7 NaN   9
3   1 NaN NaN NaN
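Note: fillna (like dropna) returns a new object, which is why dframe2 is unchanged here; pass inplace=True (or reassign) to keep the filled values, a sketch:

dframe2.fillna(0, inplace=True)   # modify dframe2 itself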

In [142]:
# Index Hierarchy
ser = Series(randn(6), index = [[1,1,1,2,2,2],['a','b','c','a','b','c']])

In [143]:
ser


Out[143]:
1  a    0.773747
   b   -0.787147
   c   -0.814421
2  a    1.011292
   b   -2.150522
   c   -1.224728
dtype: float64

In [144]:
ser.index


Out[144]:
MultiIndex(levels=[[1, 2], ['a', 'b', 'c']],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [145]:
ser[:,'a']


Out[145]:
1    0.773747
2    1.011292
dtype: float64

In [146]:
dframe = ser.unstack()

In [147]:
dframe


Out[147]:
          a         b         c
1  0.773747 -0.787147 -0.814421
2  1.011292 -2.150522 -1.224728
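Note: unstack pivots the inner index level into columns; stack reverses it, a sketch:

dframe.stack()   # back to the hierarchically indexed Series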

In [ ]: