In [1]:
# Pandas
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
In [2]:
obj = Series([3,6,9,12])
obj
Out[2]:
In [3]:
obj.values
Out[3]:
In [4]:
obj.index
Out[4]:
In [6]:
ww2_cas = Series([8700000,4300000,3000000,2100000,400000], index=['USSR','Germany','China','Japan','USA'])
ww2_cas
Out[6]:
In [7]:
ww2_cas['USA']
Out[7]:
In [8]:
## Check which countries had casualties greater than 4 million
ww2_cas[ww2_cas > 4000000]
Out[8]:
In [9]:
'USSR' in ww2_cas
Out[9]:
In [10]:
ww2_dict = ww2_cas.to_dict()
ww2_dict
Out[10]:
In [11]:
ww2_series = Series(ww2_dict)
ww2_series
Out[11]:
In [14]:
# Note: lowercase 'germany' has no matching key in ww2_dict ('Germany'), so it comes through as NaN
countries = ['China','germany','Japan','USA','USSR']
obj2 = Series(ww2_dict, index = countries)
obj2
Out[14]:
In [15]:
pd.isnull(obj2)
Out[15]:
In [16]:
pd.notnull(obj2)
Out[16]:
In [17]:
ww2_series + obj2
Out[17]:
In [18]:
obj2.name = "World War 2 Casualties"
obj2
Out[18]:
In [21]:
obj2.index.name = 'countries'
obj2
Out[21]:
In [22]:
# DataFrames
import webbrowser
website = 'http://en.wikipedia.org/wiki/NFL_win-loss_records'
webbrowser.open(website)
Out[22]:
In [23]:
nfl_frame = pd.read_clipboard()  # reads whatever table is currently on the clipboard (copy the win-loss table from the page first)
nfl_frame.columns = nfl_frame.columns.str.strip()  # strip stray whitespace from the pasted headers
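read_clipboard only works if the table has just been copied by hand. As an alternative sketch (assuming internet access and an HTML parser such as lxml), pd.read_html can pull the page's tables directly; the position of the win-loss table in the returned list is a guess, so inspect it before relying on index 0.
In [ ]:
# Hedged alternative to read_clipboard: parse the tables straight from the page
tables = pd.read_html(website)   # one DataFrame per <table> on the page
nfl_frame = tables[0]            # adjust the index if the win-loss table isn't first
nfl_frame.head()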
In [24]:
nfl_frame
Out[24]:
In [25]:
nfl_frame.columns
Out[25]:
In [29]:
nfl_frame['Rank']
Out[29]:
In [30]:
DataFrame(nfl_frame, columns=['Team','First Season','Stadium'])  # 'Stadium' isn't in the pasted data, so it comes back as all NaN
Out[30]:
In [31]:
nfl_frame.head(2)
Out[31]:
In [32]:
nfl_frame.loc[3]  # .ix was removed from pandas; use .loc for labels or .iloc for positions
Out[32]:
In [33]:
nfl_frame['Stadium'] = "Levi's Stadium"
nfl_frame
Out[33]:
In [35]:
nfl_frame['Stadium'] = np.arange(5)  # the array's length must match the number of rows
stadiums = Series(["Levi's Stadium", "AT&T Stadium"], index=[4,0])
stadiums
Out[35]:
In [36]:
nfl_frame['Stadium'] = stadiums  # assignment aligns on index; rows without a match get NaN
nfl_frame
Out[36]:
In [37]:
del nfl_frame['Stadium']
nfl_frame
Out[37]:
In [38]:
data = {'City':['SF','LA','NYC'], 'Population':[837000,3880000,8400000]}
In [39]:
city_frame = DataFrame(data)
city_frame
Out[39]:
In [40]:
# Index Objects
my_ser = Series([1,2,3,4], index=['A','B','C','D'])
my_ser
Out[40]:
In [42]:
my_index = my_ser.index
my_index[2]
Out[42]:
In [43]:
my_index[2:]
Out[43]:
In [44]:
my_index[0] = 'Z'  # raises TypeError: pandas Index objects are immutable
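Since an Index can't be changed in place, relabeling means building a new one. A minimal sketch, assuming the goal is to turn 'A' into 'Z':
In [ ]:
# rename returns a relabeled copy; the original Series and its Index are untouched
my_ser.rename(index={'A': 'Z'})
# or replace the whole index at once (length must match):
# my_ser.index = ['Z', 'B', 'C', 'D']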
In [45]:
# Reindexing
from numpy.random import randn
ser1 = Series([1,2,3,4], index=['A','B','C','D'])
ser1
Out[45]:
In [46]:
ser2 = ser1.reindex(['A','B','C','D','E','F'])
ser2
Out[46]:
In [48]:
ser2 = ser2.reindex(['A','B','C','D','E','F','G'],fill_value=0)  # fill_value only fills the new label 'G'; E and F already exist as NaN
ser2
Out[48]:
In [49]:
ser3 = Series(['USA','Mexico','Canada'],index=[0,5,10])
In [50]:
ser3
Out[50]:
In [51]:
ranger = range(15)
print(list(ranger))  # the bare range object would just print as range(0, 15)
ser3.reindex(ranger, method='ffill')  # forward-fill each value until the next known label
Out[51]:
In [54]:
dframe = DataFrame(randn(25).reshape((5,5)),index=['A','B','D','E','F'],columns=['col1','col2','col3','col4','col5'])
dframe
Out[54]:
In [58]:
dframe2 = dframe.reindex(['A','B','C','D','E','F'])
dframe2
Out[58]:
In [62]:
new_columns = ['col1','col2','col3','col4','col5','col6']
dframe2 = dframe2.reindex(columns=new_columns)  # reindex returns a new frame, so assign it back; col6 comes in as NaN
dframe2
Out[62]:
In [63]:
dframe.reindex(index=['A','B','C','D','E','F'], columns=new_columns)  # .ix was removed; reindex handles labels that don't exist yet
Out[63]:
In [64]:
# Drop Entry
ser1 = Series(np.arange(3),index=['a','b','c'])
ser1
Out[64]:
In [65]:
ser1.drop('b')
Out[65]:
In [66]:
dframe1 = DataFrame(np.arange(9).reshape(3,3),index=['sf','la','ny'],columns=['pop','size','year'])
dframe1
Out[66]:
In [70]:
dframe2 = dframe1.drop('la')  # drop returns a new frame; dframe1 is unchanged
print(dframe1)
dframe2
Out[70]:
In [71]:
dframe1.drop('year',axis=1)  # axis=1 drops a column (equivalently, drop(columns=['year']))
Out[71]:
In [72]:
# Selecting Entries
ser1 = Series(np.arange(3),index=['a','b','c'])
ser1 = 2*ser1
ser1
Out[72]:
In [74]:
ser1['b']
Out[74]:
In [75]:
ser1.iloc[1]  # by position; bare integer indexing on a labeled Series is deprecated
Out[75]:
In [76]:
ser1[0:3]
Out[76]:
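Slicing by label also works, and unlike the positional slice above, a label slice includes both endpoints. A quick sketch:
In [ ]:
ser1['a':'c']   # label slice: 'a' through 'c' inclusive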
In [77]:
ser1[:]
Out[77]:
In [78]:
ser1[['a','c']]
Out[78]:
In [79]:
ser1[ser1>3]
Out[79]:
In [80]:
ser1[ser1>3] = 10
In [81]:
ser1
Out[81]:
In [82]:
dframe = DataFrame(np.arange(25).reshape((5,5)),index=['nyc','sf','la','dc','chi'],columns=['a','b','c','d','e'])
dframe
Out[82]:
In [83]:
dframe[dframe['c']>8]
Out[83]:
In [84]:
dframe > 10
Out[84]:
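The boolean frame itself can serve as a mask: entries where the condition is False come back as NaN. A quick sketch:
In [ ]:
dframe[dframe > 10]   # values failing the condition are replaced with NaN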
In [85]:
dframe.loc['la'] # selects the row labeled 'la'; it comes back as a Series, which is why it displays vertically rather than as a transpose
Out[85]:
In [86]:
# Data Alignment
ser1 = Series([0,1,2],index=['a','b','c'])
ser1
Out[86]:
In [87]:
ser2 = Series([3,4,5,6],index=['a','b','c','d'])
ser2
Out[87]:
In [88]:
ser1+ser2
Out[88]:
In [91]:
dframe1 = DataFrame(np.arange(4).reshape((2,2)), columns=list('AB'),index=['nyc','la'])
dframe1
Out[91]:
In [92]:
dframe2 = DataFrame(np.arange(9).reshape((3,3)),columns=list('ADC'),index=['nyc','la','sf'])
dframe2
Out[92]:
In [93]:
dframe1+dframe2
Out[93]:
In [94]:
dframe1.add(dframe2,fill_value=0)  # treat values missing from one frame as 0; cells missing from both still come out NaN
Out[94]:
In [96]:
ser3 = dframe2.iloc[0]  # first row of dframe2 (.ix was removed; .iloc selects by position)
ser3
Out[96]:
In [97]:
dframe2-ser3  # ser3 is broadcast down the rows, matching on column labels
Out[97]:
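Broadcasting in the other direction (down the columns) needs the method form with an explicit axis. A small sketch reusing dframe2; ser_col is just an illustrative name for its 'A' column:
In [ ]:
# axis=0 matches ser_col on the row labels and subtracts it from every column
ser_col = dframe2['A']
dframe2.sub(ser_col, axis=0)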
In [99]:
# Ranking and Sorting
ser1 = Series(range(3),index=['c','a','e'])
ser1
Out[99]:
In [100]:
ser1.sort_index() # by index
Out[100]:
In [101]:
ser1.sort_values() # by value (Series.order() was removed from pandas)
Out[101]:
In [103]:
ser2 = Series(randn(10))
ser2
Out[103]:
In [104]:
ser2 = ser2.sort_values()  # Series.sort() was removed; sort_values() returns a sorted copy
ser2.rank()
Out[104]:
In [105]:
ser3 = Series(randn(10))
ser3.rank()
Out[105]:
In [106]:
ser3 = ser3.sort_values()
ser3.rank()  # each value keeps the same rank as before; sorting only changes the display order
Out[106]:
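Ties are where rank()'s method argument matters. A quick sketch with made-up values, comparing the default 'average' method against 'min':
In [ ]:
tied = Series([7, 7, 3, 5])
print(tied.rank())        # the two 7s share the average rank, 3.5
tied.rank(method='min')   # with 'min', both 7s get rank 3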
In [107]:
# Summary Statistics
arr = np.array([[1,2,np.nan],[np.nan,3,4]])
dframe1 = DataFrame(arr,index=['A','B'],columns=['one','two','three'])
dframe1
Out[107]:
In [108]:
dframe1.sum() # column sums; NaN values are skipped by default
Out[108]:
In [109]:
dframe1.sum(axis=1)
Out[109]:
In [110]:
dframe1.min()
Out[110]:
In [111]:
dframe1.idxmin() # index label of each column's minimum
Out[111]:
In [112]:
dframe1.cumsum() # cumulative sum down each column; only 'two' has two non-null values, so only its second row changes
Out[112]:
In [113]:
dframe1.describe() # Some basic descriptive statistics
Out[113]:
In [114]:
from IPython.display import YouTubeVideo
In [115]:
YouTubeVideo('xGbpuFNR1ME') # Covariance Video
Out[115]:
In [116]:
YouTubeVideo('4EXNedimDMs') # Correlation video
Out[116]:
In [118]:
# pandas.io.data was removed from pandas; the separate pandas-datareader package provides the same interface
from pandas_datareader import data as pdweb
import datetime
# Note: the Yahoo endpoint can be unreliable; if the download fails, see the offline sketch below
prices = pdweb.get_data_yahoo(['CVX','XOM','BP'],start=datetime.datetime(2010,1,1), end=datetime.datetime(2014,1,1))['Adj Close']
prices.head()
Out[118]:
In [119]:
volume = pdweb.get_data_yahoo(['CVX','XOM','BP'],start=datetime.datetime(2010,1,1), end=datetime.datetime(2014,1,1))['Volume']
In [120]:
volume.head()
Out[120]:
In [121]:
rets = prices.pct_change()
corr = rets.corr()  # correlation matrix of the daily returns (note the parentheses; bare .corr is the method object, not the matrix)
In [122]:
%matplotlib inline
prices.plot()
Out[122]:
In [124]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.heatmap(rets.corr(), annot=False)  # seaborn removed corrplot; a heatmap of the correlation matrix is the usual replacement
Out[124]:
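If the Yahoo download fails, the same workflow can be exercised offline. A sketch with synthetic prices; the numbers are invented purely for illustration, and it assumes the imports from the cells above have already run:
In [ ]:
# Fake, always-positive price paths for three tickers, then the same pct_change/corr/heatmap steps
fake_prices = DataFrame(100 * np.exp(np.cumsum(0.01 * np.random.randn(250, 3), axis=0)),
                        columns=['CVX','XOM','BP'])
fake_rets = fake_prices.pct_change()
sns.heatmap(fake_rets.corr(), annot=True)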
In [128]:
ser1 = Series(['w','w','x','y','z','w','a'])
ser1.unique()
Out[128]:
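Closely related to unique(): value_counts() tallies how often each value appears, most frequent first. A quick sketch on the same Series:
In [ ]:
ser1.value_counts()   # counts of each distinct value, sorted descending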
In [129]:
# Missing Data
data = Series(['one','two',np.nan,'four'])
In [130]:
data
Out[130]:
In [131]:
data.isnull()
Out[131]:
In [132]:
data.dropna() # Drops null values
Out[132]:
In [133]:
dframe = DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan,np.nan,np.nan]])
dframe
Out[133]:
In [134]:
clean_dframe = dframe.dropna()
clean_dframe
Out[134]:
In [135]:
dframe.dropna(how='all') # Drops rows with all nulls
Out[135]:
In [136]:
dframe.dropna(axis=1) # drops any column containing a null (here, every column has at least one)
Out[136]:
In [137]:
npn = np.nan
dframe2 = DataFrame([[1,2,3,npn],[2,npn,5,6],[npn,7,npn,9],[1,npn,npn,npn]])
dframe2
Out[137]:
In [138]:
dframe2.dropna(thresh=2) # keep only rows with at least 2 non-null values
Out[138]:
In [139]:
dframe2.fillna(1)
Out[139]:
In [140]:
dframe2.fillna({0:0,1:1,2:2,3:3}) # a different fill value per column (dict keys are column labels)
Out[140]:
In [141]:
dframe2 # unchanged: fillna returned a new DataFrame rather than modifying in place
Out[141]:
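Besides constant fills, missing values can be propagated forward from the last observation. A quick sketch on the same frame:
In [ ]:
dframe2.ffill()   # forward-fill: carry the last non-null value down each column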
In [142]:
# Index Hierarchy
ser = Series(randn(6), index = [[1,1,1,2,2,2],['a','b','c','a','b','c']])
In [143]:
ser
Out[143]:
In [144]:
ser.index
Out[144]:
In [145]:
ser[:,'a'] # every outer level, inner label 'a'
Out[145]:
In [146]:
dframe = ser.unstack()
In [147]:
dframe
Out[147]:
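For the round trip, stack() folds the columns back into the inner index level. A short sketch:
In [ ]:
dframe.stack()   # back to a Series with the two-level (outer, inner) index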
In [ ]: