I wrote this notebook while figuring out what tools Python offers for solving data science problems, i.e. those kinds of programming tasks where spreadsheet-like data has to be handled and analyzed with a statistical mindset. Consequently, the following code snippets mainly use packages like numpy, pandas and matplotlib. It comprises the topics:
Acknowledgment: Much of the code below is inspired by the very helpful book "Python for Data Analysis" by Wes McKinney, published by O'Reilly.
(c) Florian Hoppe 2013
In [4]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
import numpy.random as rand
from datetime import datetime, timedelta
import dateutil.parser
import misc as hlp
import scipy
In [4]:
%pylab inline
In [101]:
rand.randint(0,2,size=(5,2))
Out[101]:
In [102]:
rand.normal(0, 1, size=(10,2))
Out[102]:
In [110]:
random_data = np.concatenate([rand.normal(0,.5,(100,2)),rand.normal(5,1,(200,2))])
plt.scatter(random_data[:,0],random_data[:,1])
Out[110]:
In [33]:
np.random.randn(2,3) # normal distribution
np.random.rand(2,3) # uniform distribution
Out[33]:
In [28]:
np.random.permutation(10)
Out[28]:
In [120]:
t = np.arange(5)
np.random.shuffle(t)
t
Out[120]:
In [2]:
n = 10000
v1 = rand.normal(0,.5,n)
v2 = rand.normal(10,.75,n)
v3 = v1*3+10
v4 = rand.uniform(-10,10,n)
v5 = v1*v2
v6 = v1+v2+v3
mat = np.concatenate([v1,v2,v3,v4,v5,v6]).reshape(6,n).T
mat.shape
Out[2]:
In [59]:
v1.std()==v1.var()**.5
Out[59]:
In [3]:
np.cov(mat.T)
Out[3]:
In [4]:
np.corrcoef(mat.T)
Out[4]:
In [5]:
df = DataFrame({'v1':v1,'v2':v2,'v3':v3,'v4':v4,'v5':v5,'v6':v6})
df[:3]
Out[5]:
In [77]:
df.cov()
Out[77]:
In [6]:
df.cov()
# corr is the standardized cov: each cov entry has to be divided by the product of the standard deviations of the two variables involved
# df.cov().div(df.std()[0]*df.std()[1])
Out[6]:
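The commented-out line above divides by a single scalar, which only covers one pair of columns; to recover the full correlation matrix from the covariance matrix, every entry has to be divided by the outer product of the column standard deviations. A small sketch (not part of the original notebook) that should reproduce df.corr():
In [ ]:
df.cov() / np.outer(df.std(), df.std())  # elementwise division; the result should equal df.corr()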
In [26]:
df.corr()
Out[26]:
In [8]:
U,S,V = np.linalg.svd(df)
In [16]:
Series(S)
Out[16]:
In [14]:
V = DataFrame(V)
V[V<0.001]=0
V
Out[14]:
In [27]:
df.corr()
Out[27]:
In [29]:
stddf = df.sub(df.mean()) # demean the columns (the covariance is unaffected by this shift)
print(stddf.cov())
In [30]:
latent, coeff = np.linalg.eig(stddf.cov()) # eigenvalues/eigenvectors of the covariance matrix (the PCA recipe)
In [35]:
DataFrame(coeff)
Out[35]:
For sparse matrices:
In [6]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
import numpy.random as rand
from datetime import datetime, timedelta
import dateutil.parser
from scipy.stats import pareto
import scipy.stats
import random as rnd
In [29]:
import scipy.sparse
import scipy.sparse.linalg
vals = np.array([1,2,3], dtype=np.float64)
rowi = np.array([1,5,10])
coli = np.array([5,13,100])
sparseM = scipy.sparse.coo_matrix((vals,(rowi,coli)),shape=(max(rowi)+1,max(coli)+1))
In [40]:
sparseM.shape
Out[40]:
In [31]:
(U,d,V) = scipy.sparse.linalg.svds(sparseM) # use parameter 'k=' to limit number of computed singular vectors
In [39]:
U.shape
Out[39]:
In [35]:
d.shape
Out[35]:
In [34]:
V.shape
Out[34]:
In [ ]:
# check the reconstruction; svds returns V already transposed (shape k x n):
np.allclose(sparseM.toarray(), U.dot(np.diag(d)).dot(V))
In [195]:
# fault-tolerant value retrieval:
d = {'a':1,'b':2}
d.get('t',-1) # returns -1 if the key is not in the dictionary
Out[195]:
In [2]:
x = np.array(range(3))
print(x.dot(x))
x * x # elementwise *
Out[2]:
In [8]:
x = np.array([0, 1])
y = np.array([2, 3])
x * y
In [2]:
e = np.identity(3)
e
Out[2]:
In [3]:
m = np.arange(6).reshape(3,2)
m
Out[3]:
In [133]:
m[0,:]
Out[133]:
In [128]:
m[:,:1]
Out[128]:
In [7]:
# elementwise multiplication
np.multiply(e,np.arange(9).reshape((3,3))) # == e * np.arange(9).reshape((3,3))
Out[7]:
In [125]:
# real matrix multiplication
np.dot(e,m)
Out[125]:
In [130]:
E = np.matrix(e)
M = np.matrix(m)
E*M
Out[130]:
In [132]:
M[0,:]
Out[132]:
In [131]:
M[:,0]
Out[131]:
Use sparse matrices for big tables:
In [ ]:
sparseM = scipy.sparse.coo_matrix((values, (row_indices, column_indices)), shape=(max(row_indices)+1, max(column_indices)+1))
In [10]:
arr_1d = np.random.normal(size=3)
arr_1d
Out[10]:
In [11]:
arr_1d[:, np.newaxis]
Out[11]:
In [9]:
arr_1d[np.newaxis, :]
Out[9]:
In [12]:
arr = randn(3, 4, 5)
arr.shape
Out[12]:
In [17]:
depth_means = arr.mean(2)
print(depth_means.shape)
print(depth_means[:, :, np.newaxis].shape)
In [15]:
demean = arr - depth_means[:, :, np.newaxis]
demean.mean(2)
Out[15]:
In [22]:
arr = np.zeros((4, 3))
arr[:2] = [[-1.37], [0.509]]
arr
Out[22]:
In [24]:
arr = np.zeros((4, 3))
arr[:,:2] = [-1.37, 0.509]
arr
Out[24]:
In [6]:
ser = Series([1,4,"three",5.])
print(ser.values)
In [9]:
ser = Series([1,4,"three",5.], index=['dim1','dim2','name','val'])
print(ser.index)
In [18]:
print(ser['val'])
print(ser[(ser == 5)|(ser == 1)])
In [19]:
ser2 = Series({'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000})
print(ser2)
In [21]:
ser3 = Series({'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}, index = ['Texas','Ohio','Berlin','Oregon','Utah'])
print(ser3)
In [22]:
ser3.index = ['T','O','B','OR','U']
print(ser3)
In [80]:
ser4 = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
print(ser4)
print(ser4.reindex(['a','aa','b','c','d','e'],fill_value=99))
In [81]:
ser5 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
ser5.reindex(range(6), method='ffill')
Out[81]:
In [75]:
ser4
Out[75]:
In [45]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002],'popp': [1.5, 1.7, 3.6, 2.4, 2.9]}
df = DataFrame(data, columns=['year', 'state', 'popp', 'debt'], index=['one', 'two', 'three', 'four', 'five'])
df
Out[45]:
In [49]:
df.ix['four'] # == df.ix[3]
Out[49]:
In [71]:
print(df.ix['one','state'])
df.ix[['one','two'],['state','popp']]
Out[71]:
In [48]:
df['popp'] == df.popp
Out[48]:
In [51]:
df['newcol'] = np.arange(5)
df
Out[51]:
In [53]:
del df['debt']
df
Out[53]:
In [85]:
df.T.drop(['year','newcol'])
Out[85]:
In [83]:
df.T.drop('one',axis=1)
Out[83]:
In [56]:
df.values
Out[56]:
In [67]:
2.4 in df.values
Out[67]:
In [91]:
df.T.ix['year':'popp'] # label-based slicing: the last element, i.e. 'popp', is included
Out[91]:
In [96]:
df[df.popp > 2]
Out[96]:
In [100]:
df[df.popp>2].newcol.sum()
Out[100]:
In [5]:
dn = DataFrame(np.arange(12).reshape(4,3),columns=['col2','col1','col3'],index=['1','2','4','3'])
dn
Out[5]:
In [6]:
func_on_series = lambda x:x.max()
dn.apply(func_on_series)
Out[6]:
In [7]:
dn.apply(func_on_series,axis=1)
Out[7]:
In [8]:
dn.applymap(lambda x:x+1) # == dn + 1
Out[8]:
In [9]:
dn.sort('col1',ascending=0)
Out[9]:
In [10]:
dn.sort(['col1','col2'],ascending=[0,1])
Out[10]:
In [121]:
dn.sort_index(ascending=False).sort_index(axis=1)
Out[121]:
In [124]:
dn.col1.order()
Out[124]:
In [133]:
dn.col2 = [34,34,3,99]
dn
Out[133]:
In [134]:
dn.sort_index(by=['col2','col1'],ascending=False)
Out[134]:
In [142]:
dn2 = dn.reindex([str(x) for x in [1,2,4,3,4]])
dn2
Out[142]:
In [144]:
dn2.ix['4']
Out[144]:
In [21]:
dn
Out[21]:
In [22]:
dn.idxmax()
Out[22]:
In [153]:
dn2.count()
Out[153]:
In [23]:
Series([1,2,np.NAN,2]).count()
Out[23]:
In [24]:
dn[dn==4] = np.nan
dn
Out[24]:
In [25]:
dn.col1.isnull()
Out[25]:
In [28]:
np.where(dn.col1.isnull(),-100,dn.col1)
Out[28]:
In [162]:
type(12.8)
Out[162]:
In [15]:
data = DataFrame([[1., 6.5, 3.], [1., np.NAN, np.NAN],[np.NAN, np.NAN, np.NAN], [np.NAN, 6.5, 3.]])
data
Out[15]:
In [17]:
data.fillna(method='bfill') # or ffill
Out[17]:
In [10]:
data.dropna(how='all')
Out[10]:
In [14]:
data.fillna(0, inplace=True) # inplace=True modifies the original data
data
Out[14]:
In [141]:
data = DataFrame({'vals':rand.normal(0,1,size=10),'weights':rand.randint(2,size=10)})
data['results']=data.vals*data.weights
data
Out[141]:
In [142]:
data.vals.corr(data.results)
Out[142]:
In [ ]:
pd.read_csv('some.csv', names=['a', 'b', 'c', 'd', 'message'], header=None) # otherwise the first line would define the column names
pd.read_table('some.txt', sep='\t', index_col='message') # the column 'message' becomes the index of the table
pd.read_csv('some.csv', na_values={'message':'xxx','a':'NULL'}, converters={'message':lambda x:x[0:1]}) # per column you can define which entries represent NaN and how values are parsed by a custom function
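Since some.csv is not part of this notebook, here is a self-contained sketch of the same na_values/converters options using an in-memory CSV (the file content and column names are made up for illustration):
In [ ]:
try:
    from StringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO        # Python 3
csv_text = "a,b,message\n1,2,hello\nNULL,4,xxx\n"
pd.read_csv(StringIO(csv_text),
            na_values={'message': ['xxx'], 'a': ['NULL']},  # per-column markers that become NaN
            converters={'b': lambda x: int(x) * 10})        # per-column custom parsing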
In [ ]:
import sys
data.to_csv(sys.stdout, index=False, header=False, sep='|') # write to stdout instead of a file
In [ ]:
# JSON: read in lists of dicts to get rows where keys become columns
tweets = DataFrame(list_of_dicts, columns=list_of_dicts[0].keys())
data.save('binary.data') # pandas only format
data = pd.load('binary.data')
store = pd.HDFStore('mydata.h5') # common scientific format HDF5 good for: write once, read many
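The list_of_dicts above is a placeholder; a minimal made-up example of the pattern (each dict becomes a row, the keys become columns):
In [ ]:
list_of_dicts = [{'user': 'alice', 'retweets': 3}, {'user': 'bob', 'retweets': 7}]
DataFrame(list_of_dicts, columns=['user', 'retweets'])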
In [ ]:
from pandas.io.parsers import TextParser
from lxml.html import parse
from urllib2 import urlopen
parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
doc = parsed.getroot()
# helper from the book: return the text of all cells (td or th tags) of a table row
def _unpack(row, kind='td'):
    elts = row.findall('.//%s' % kind)
    return [val.text_content() for val in elts]
# create a DataFrame from a table tag in the html document:
def parse_options_data(doc):
    tables = doc.findall('.//table')     # select the tables
    table = tables[0]
    rows = table.findall('.//tr')        # select the rows of the table
    header = _unpack(rows[0], kind='th') # extract the header row
    data = [_unpack(r) for r in rows[1:]]
    return TextParser(data, names=header).get_chunk() # parse into a DataFrame
In [ ]:
import pandas.io.sql as sql
import sqlite3
con = sqlite3.connect(':myDB:')
sql.read_frame('select * from test', con)
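The cell above assumes a database with a table called test already exists; the following sketch first builds a throwaway in-memory database so the query actually returns something (sql.read_frame is the pandas 0.x function used here):
In [ ]:
con = sqlite3.connect(':memory:')  # temporary in-memory database
con.execute('create table test (a integer, b text)')
con.executemany('insert into test values (?, ?)', [(1, 'x'), (2, 'y')])
con.commit()
sql.read_frame('select * from test', con)  # returns a DataFrame with columns a and b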
In [ ]:
# a.combine_first(b)
# a's values prioritized, use values from b to fill holes
# data1.update(data2, overwrite=False)
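To make the two commented calls above concrete, a small made-up sketch (a, b, data1 and data2 are invented here):
In [ ]:
a = Series([1.0, np.nan, 3.0], index=['x', 'y', 'z'])
b = Series([10.0, 20.0, 30.0], index=['x', 'y', 'z'])
print(a.combine_first(b))  # a's values win, the hole at 'y' is filled from b
data1 = DataFrame({'v': [1.0, np.nan]})
data2 = DataFrame({'v': [100.0, 200.0]})
data1.update(data2, overwrite=False)  # with overwrite=False only NaNs in data1 are filled
data1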
In [ ]:
pd.merge(df1, df2, on='key', how='left') # use a list to join on multiple keys; 'inner' is the default, 'right' and 'outer' are the other options for the how parameter
df1.join(df2, on='key') # similar, but join matches the 'key' column of df1 against the index of df2 (and defaults to a left join)
df1.join([df2,df3]) # join three DataFrames
pd.merge(left1, right1, left_on='key', right_index=True) # joins tables by using column 'key' of left table to match with index of right table
left1.join(right1) # same but in a neat way
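A tiny runnable version of the merge/join calls above (df_l and df_r are invented example frames):
In [ ]:
df_l = DataFrame({'key': ['a', 'b', 'c'], 'lval': [1, 2, 3]})
df_r = DataFrame({'key': ['a', 'b', 'd'], 'rval': [10, 20, 40]})
print(pd.merge(df_l, df_r, on='key', how='left'))  # keep all rows of df_l, NaN where df_r has no match
df_l.join(df_r.set_index('key'), on='key')         # same idea: match the 'key' column of df_l against df_r's index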
In [7]:
arr = np.arange(12).reshape((3, 4))
print(arr)
print(np.concatenate([arr,arr],axis=1))
print(np.concatenate([arr,arr]))
In [39]:
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4, 5], index=['c', 'd', 'e', 'b'])
def psdf(data):
    print("/////////////////\n%s" % data)
psdf(pd.concat([s2, s1]))
psdf(pd.concat([s1, s2],axis=1))
psdf(pd.concat([s1, s2],axis=1,join='inner'))
In [31]:
# rotating between row and column indices
data = DataFrame(np.arange(6).reshape((2, 3)), index=pd.Index(['Ohio', 'Colorado'], name='state'), columns=pd.Index(['one', 'two', 'three'], name='number'))
psdf(data)
psdf(data.stack())
psdf(data.stack().unstack())
In [56]:
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4, 'k2': [1, 1, 2, 3, 3, 4, 4]})
psdf(data)
psdf(data.drop_duplicates())
In [60]:
data['k3'] = range(7)
psdf(data)
psdf(data.replace([1,2],[111,222]))
psdf(data.replace({1:111,2:222})) # should do the same but is not working
In [47]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
Out[47]:
In [50]:
cats.labels
Out[50]:
In [51]:
cats.levels
Out[51]:
In [53]:
pd.value_counts(cats)
Out[53]:
In [54]:
cats = pd.cut(ages,4) # create four equal-width bins
print(cats.levels)
pd.value_counts(cats)
Out[54]:
In [55]:
cats = pd.qcut(ages,4) # create bins that each contain (roughly) the same number of elements, i.e. quartiles
print(cats.levels)
pd.value_counts(cats)
Out[55]:
In [117]:
import re
data = Series({'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com', 'Rob': 'rob@gmail.com', 'Wes': np.nan})
data
Out[117]:
In [82]:
data.str[:2]
Out[82]:
In [83]:
data.str.join(sep=".")
Out[83]:
In [77]:
data.str.replace("c",'999') # replace every 'c' in each string
Out[77]:
In [119]:
matches = data.str.match('([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})', flags=re.IGNORECASE)
matches.str[1]
Out[119]:
In [16]:
randomwalk = rand.randn(40)
fig = plt.figure()
ax1 = fig.add_subplot(2,2,1)
ax1.plot(randomwalk.cumsum(),'o-k')
Out[16]:
In [17]:
comp1 = np.random.normal(0, 1, size=200) # N(0, 1)
comp2 = np.random.normal(10, 2, size=200) # N(10, 4)
values = Series(np.concatenate([comp1, comp2]))
values.hist(bins=100, alpha=0.3, color='k', normed=True)
values.plot(kind='kde', style='k--')
Out[17]:
In [18]:
# making some histogram:
s = Series(np.random.randn(100000))
hist = np.histogram(s,bins=100)
plt.bar(hist[1][:-1],hist[0],width=.1)
Out[18]:
In [154]:
tmpdata = DataFrame(np.random.normal(size=(100,3)))
tmpdata['cat'] = 'A'
data = DataFrame(np.random.normal(10,2,size=(150,3)))
data['cat']='B'
tmpdata2 = DataFrame(np.random.normal(50,2,size=(150,3)))
tmpdata2['cat']='C'
data = pd.concat([tmpdata,data,tmpdata2])
del tmpdata
data[:4]
Out[154]:
In [19]:
hlp.plot_categorical_data(data,[0,1])
Out[19]:
In [20]:
fig, ax = plt.subplots()
ax.scatter(data[data.cat=='A'][0],data[data.cat=='A'][1],marker='.',facecolor='b')
ax.scatter(data[data.cat=='B'][0],data[data.cat=='B'][1],marker='x',facecolor='r')
plt.show()
In [21]:
pd.scatter_matrix(data)
Out[21]:
In [18]:
df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a','a'],'key2' : ['one', 'two', 'one', 'two', 'one','three'],'data1' : np.random.randn(6),'data2' : np.random.randn(6)})
df
Out[18]:
In [35]:
df.groupby('key1').min()
Out[35]:
In [22]:
for name,group in df.groupby('key2'):
    print("Group %s:" % name)
    print(group)
In [21]:
pieces = dict(list(df.groupby('key1')))
pieces['a']
Out[21]:
In [37]:
df.groupby('key2')['data1'].count() # == ... .size()
Out[37]:
In [29]:
people = DataFrame(np.random.randn(5, 5),columns=['a', 'b', 'c', 'd', 'e'],index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people
Out[29]:
In [30]:
# Grouping by groups of keys:
people.groupby({'a':'aa','b':'bb','c':'aa','d':'aa','e':'bb'},axis=1).sum()
Out[30]:
In [40]:
# Grouping by function:
people.groupby(lambda idx:idx[0]).first()
Out[40]:
In [34]:
# call multiple and customized aggregation functions:
df.groupby('key2').agg([('Mean','mean'),'sum',('User function', lambda arr: min(arr))])
Out[34]:
In [44]:
# call different aggregation functions on different columns:
df.groupby('key2').agg({'data1':[min,max],'data2':sum})
Out[44]:
In [53]:
df.groupby('key1').transform(abs)
Out[53]:
In [54]:
def demean(arr):
    return arr - arr.mean()
df.groupby('key1').transform(demean)
Out[54]:
In [55]:
df.groupby('key1').transform(demean).mean()
Out[55]:
In [56]:
df.groupby('key1').apply(lambda dataframe:dataframe.describe())
Out[56]:
In [58]:
frame = DataFrame({'data1': np.random.randn(1000),'data2': np.random.randn(1000)})
factor = pd.cut(frame.data1, 4)
factor[:10]
Out[58]:
In [67]:
grouped = frame.data2.groupby(factor)
grouped.min()
Out[67]:
In [80]:
def get_stats(group,some_parameter):
    return {'min': group.min() + some_parameter, 'max': group.max(), 'count': group.count(), 'mean': group.mean()}
grouped.apply(get_stats, some_parameter=100).unstack()
Out[80]:
In [184]:
tips = pd.read_csv('ch08/tips.csv')
tips[:10]
Out[184]:
In [163]:
tips.pivot_table(rows=['sex', 'smoker']) # == tips.groupby(['sex','smoker']).mean()
Out[163]:
In [177]:
tips.pivot_table(['tip','total_bill'],rows=['day','smoker'],cols='sex')
Out[177]:
In [183]:
tips.pivot_table(['tip','total_bill'],rows=['day','smoker'],cols='sex',aggfunc=sum,margins=True)
Out[183]:
In [193]:
# counting how often a certain pair of values occurs:
ser1 = Series(rand.randint(0,2,10))
ser2 = Series(rand.randint(10,12,10))
ser3 = Series(rand.randint(20,22,10))
dt = DataFrame([ser1,ser2,ser3])
print(dt)
pd.crosstab(ser1,ser2,margins=True)
Out[193]:
In [194]:
pd.crosstab([ser1,ser2],ser3,margins=True)
Out[194]:
In [42]:
now = datetime.now()
now.year, now.month, now.day, now.hour, now.minute
Out[42]:
In [51]:
delta = now-datetime.now()
delta.days
Out[51]:
In [53]:
now.strftime('%d.%m.%Y')
Out[53]:
In [56]:
datetime.strptime("2012-12-15", '%Y-%m-%d')
Out[56]:
In [67]:
dateutil.parser.parse("07:56:59 / 12. Sep 1976", dayfirst=True)
Out[67]:
In [72]:
# parse arrays of date strings with pandas:
pd.to_datetime(['7/6/2011',None])
Out[72]:
In [103]:
ts = Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts[:5]
Out[103]:
In [104]:
ts['2000-02']
Out[104]:
In [105]:
ts['2000-01-05':'2000-01-08']
Out[105]:
In [ ]:
# use ts.between_time to select between hours of a day
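A short made-up example of between_time, using an hourly index so there are several timestamps per day:
In [ ]:
intraday = Series(np.arange(48), index=pd.date_range('1/1/2000', periods=48, freq='H'))
intraday.between_time('09:00', '12:00')  # keep only the rows whose time of day falls into this window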
In [84]:
pd.date_range('1/1/2000',periods=2,freq="D")
Out[84]:
In [91]:
pd.date_range('1/1/2000',periods=3,freq="WOM-3FRI") # the third Friday of every month, i.e. there are lots of frequency options!
Out[91]:
In [90]:
ts[:'2000-01-2'].resample('H')
Out[90]:
In [92]:
# to compute the (percentage) change between consecutive data points use something like: ts / ts.shift(1) - 1
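A quick check (using the ts series created above) that the shift-based formula matches pandas' built-in pct_change:
In [ ]:
manual = ts / ts.shift(1) - 1
print(manual[:3])
print(ts.pct_change()[:3])  # built-in equivalent of the formula above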
In [98]:
import pytz
ts = pd.date_range('3/9/2012 9:30', periods=3, freq='D').tz_localize('CET') # == pd.date_range('3/9/2012 9:30', periods=3, freq='D', tz='CET')
ts
Out[98]:
In [100]:
ts.tz_convert('US/Eastern')
Out[100]:
In [101]:
ts.tz_convert('Asia/Shanghai')
Out[101]:
In [102]:
# periods are time spans:
p = pd.Period(2007, freq='A-DEC') # i.e. (1.1.2007 - 31.12.2007)
p + 1 # i.e. (1.1.2008 - 31.12.2008)
Out[102]:
In [22]:
import pandas.io.data as web
data = web.get_data_yahoo('GOOG', '2013-01-01')
In [23]:
data[-10:]
Out[23]:
In [ ]: