In [2]:
import numpy as np
import matplotlib.pyplot as py
import pandas as pa
import scipy.stats as st
np.set_printoptions(precision=2)
%matplotlib inline
In this section we show a few examples of discrete random variables using Python.
The documentation for these routines can be found at:
In [3]:
X=st.bernoulli(p=0.3)
X.rvs(100)
Out[3]:
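Besides drawing samples, the frozen distribution object also exposes the pmf and the moments. As a quick check (a sketch, using the X defined above):
In [ ]:
# P(X=1) should be 0.3, the mean p=0.3, and the variance p*(1-p)=0.21.
print(X.pmf(1))
print(X.mean())
print(X.var())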
In [4]:
# Note that "high" is not included.
X=st.randint(low=1,high=5)
X.rvs(100)
Out[4]:
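As a sanity check (a sketch), the empirical frequencies from a large sample should be close to the pmf, which is 1/4 for each of the values 1 through 4:
In [ ]:
print(X.pmf(np.arange(1,5)))   # exact probabilities: 0.25 each
vals,counts = np.unique(X.rvs(100000),return_counts=True)
print(counts/100000.0)         # empirical frequencies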
The documentation for these routines can be found at:
In [5]:
XUniform=st.uniform(loc=0.7,scale=0.3);
# "bins" tells you how many bars to use
# "normed" says to turn the counts into probability densities
py.hist(XUniform.rvs(1000000),bins=20,normed=True);
x = np.linspace(-0.1,1.1,100)
py.plot(x,XUniform.pdf(x))
#py.savefig('Figures/uniformPDF.png')
Out[5]:
In [7]:
py.plot(x,XUniform.cdf(x))
#py.savefig('Figures/uniformCDF.png')
Out[7]:
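The ppf method is the inverse of the cdf. For example (a sketch), the median of this uniform distribution on [0.7, 1.0] is 0.85:
In [ ]:
print(XUniform.ppf(0.5))   # inverse CDF at 0.5, i.e. the median
print(XUniform.cdf(0.85))  # should return 0.5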
In [8]:
XNormal=st.norm(loc=0,scale=1);
# "bins" tells you how many bars to use
# "normed" says to turn the counts into probability densities
py.hist(XNormal.rvs(1000),bins=100,normed=True);
x = np.linspace(-3,3,100)
py.plot(x,XNormal.pdf(x))
#py.savefig('Figures/normalPDF.png')
Out[8]:
In [11]:
py.plot(x,XNormal.cdf(x))
#py.savefig('Figures/normalCDF.png')
Out[11]:
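The cdf is handy for computing the probability of an interval. For instance (a sketch), about 68% of the mass of a standard normal lies within one standard deviation of the mean:
In [ ]:
# P(-1 < X < 1) for the standard normal, approximately 0.68.
print(XNormal.cdf(1)-XNormal.cdf(-1))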
Now we can look at the histograms of some of our data from Case Study 2.
In [14]:
data = pa.read_hdf('data.h5','movies')
In [15]:
data
Out[15]:
In [16]:
data['title'][100000]
Out[16]:
In [17]:
X=data.pivot_table('rating',index='timestamp',aggfunc='count')
In [18]:
X.plot()
Out[18]:
In [19]:
# Warning: Some versions of Pandas use "index" and "columns", some use "rows" and "cols"
X=data.pivot_table('rating',index='title',aggfunc='sum')
#X=data.pivot_table('rating',rows='title',aggfunc='sum')
In [20]:
X
Out[20]:
In [21]:
X.hist()
Out[21]:
In [22]:
# Warning: Some versions of Pandas use "index" and "columns", some use "rows" and "cols"
X=data.pivot_table('rating',index='occupation',aggfunc='sum')
#X=data.pivot_table('rating',rows='occupation',aggfunc='sum')
In [23]:
X
Out[23]:
Here we show an example of the central limit theorem, which says that the average of many independent draws from almost any distribution is approximately Gaussian. You can play around with "numberOfDistributions" and "numberOfSamples" to see how quickly the histogram converges to something that looks Gaussian.
In [24]:
numberOfDistributions = 100
numberOfSamples = 1000
XTest = st.uniform(loc=0,scale=1);
# The same thing works with many distributions.
#XTest = st.lognorm(s=1.0);
XCLT=np.zeros([numberOfSamples])
for i in range(numberOfSamples):
    for j in range(numberOfDistributions):
        XCLT[i] += XTest.rvs()
    XCLT[i] = XCLT[i]/numberOfDistributions
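The double loop above is easy to read but slow, since it calls rvs once per draw. The same simulation can be vectorized (a sketch, using the same names):
In [ ]:
# Draw all samples at once, then average over the second axis.
XCLT = XTest.rvs(size=[numberOfSamples,numberOfDistributions]).mean(axis=1)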
In [25]:
py.hist(XCLT,density=True)
Out[25]:
In this section we cover some basic ideas in linear algebra and how you can use them in Python.
In [26]:
import numpy as np
In [27]:
a=np.array([1,2,3])
a
Out[27]:
In [28]:
A=np.matrix(np.random.randint(1,10,size=[3,3]))
A
Out[28]:
In [29]:
x=np.matrix([[1],[2],[3]])
print(x)
print(x.T)
In [30]:
a*a
Out[30]:
In [31]:
np.dot(a,a)
Out[31]:
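Note the difference: * on NumPy arrays is elementwise, while np.dot computes the inner product (on np.matrix objects, * is matrix multiplication). A quick check (a sketch):
In [ ]:
print(a*a)          # elementwise: [1 4 9]
print(np.dot(a,a))  # inner product: 1+4+9 = 14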
In [32]:
x.T*x
Out[32]:
In [33]:
A*x
Out[33]:
In [34]:
b = np.matrix([[5],[6],[7]])
b
Out[34]:
In [35]:
Ai = np.linalg.inv(A)
print(A)
print(Ai)
In [36]:
A*Ai
Out[36]:
In [37]:
Ai*A
Out[37]:
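Because of floating-point roundoff, the products above are only approximately the identity; np.allclose is the idiomatic check (a sketch):
In [ ]:
print(np.allclose(A*Ai,np.eye(3)))
print(np.allclose(Ai*A,np.eye(3)))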
In [38]:
xHat = Ai*b
xHat
Out[38]:
In [39]:
print(A*xHat)
print(b)
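In practice, explicitly forming the inverse is rarely the right way to solve a linear system; np.linalg.solve is faster and more numerically stable. A sketch (xHat2 is just a name for the alternative solution):
In [ ]:
xHat2 = np.linalg.solve(A,b)
print(np.allclose(xHat,xHat2))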
In [40]:
sizes = range(100,1000,200)
times = np.zeros(len(sizes))
for i in range(len(sizes)):
    A = np.random.random(size=[sizes[i],sizes[i]])
    x = %timeit -o np.linalg.inv(A)
    times[i] = x.best
py.plot(sizes,times)
Out[40]:
Sparse matrices (those with mostly zero entries) can often be handled much more efficiently with specialized sparse methods than with standard dense methods.
In [41]:
from scipy.sparse.linalg import spsolve
from scipy.sparse import rand,eye
mySize = 1000
# A 0.1%-dense random matrix; adding the identity keeps A nonsingular in practice.
# CSR is the format spsolve works with most efficiently.
A=(rand(mySize,mySize,density=0.001)+eye(mySize)).tocsr()
b=np.random.random(size=[mySize])
The sparsity structure of A.
In [42]:
py.spy(A,markersize=0.1)
Out[42]:
In [43]:
dense = %timeit -o np.linalg.solve(A.todense(),b)
In [44]:
sparse = %timeit -o spsolve(A,b)
In [45]:
dense.best/sparse.best
Out[45]:
NumPy and Pandas provide many routines for computing statistics.
In [59]:
XNormal=st.norm(loc=0.7,scale=2);
x = XNormal.rvs(1000)
print(np.mean(x))
print(np.std(x))
print(np.var(x))
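The same statistics are available as methods on Pandas objects (a sketch, wrapping the sample in a Series). Note that Pandas uses the unbiased (n-1) estimator for std and var by default, while np.std and np.var divide by n:
In [ ]:
s = pa.Series(x)
print(s.mean())
print(s.std())   # ddof=1 by default, unlike np.std
print(s.var())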
But empirical estimates are not always good approximations of the true properties of the distribution: the typical error in the sample mean only decays like the standard deviation divided by sqrt(n).
In [62]:
ns = 2**np.arange(16)
errors = np.zeros(16)
for i in range(16):
    x = XNormal.rvs(ns[i])
    errors[i] = np.abs(0.7-np.mean(x))
py.plot(ns,errors)
# The error should decay like sigma/sqrt(n); here sigma=2.
py.plot(ns,2/np.sqrt(ns))
py.plot(ns,2*2/np.sqrt(ns),'r')
#py.savefig('Figures/errorInMean.png')
Out[62]:
In [63]:
data.pivot_table?
In [64]:
X=data.pivot_table('rating',index='title',aggfunc='mean')
#X=data.pivot_table('rating',rows='title',aggfunc='mean')
In [65]:
X.hist()
In [66]:
X=data.pivot_table('rating',index='title',columns='gender',aggfunc='mean')
#X=data.pivot_table('rating',rows='title',cols='gender',aggfunc='mean')
In [67]:
py.subplot(1,2,1)
X['M'].hist()
py.subplot(1,2,2)
X['F'].hist()
Out[67]:
In [68]:
py.plot(X['M'],X['F'],'.')
Out[68]:
In [55]:
X.cov()
Out[55]:
In [56]:
X.corr()
Out[56]:
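The same quantity for a single pair of columns can be computed directly (a sketch):
In [ ]:
print(X['M'].corr(X['F']))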
In [57]:
X=data.pivot_table('rating',index='occupation',columns='gender',aggfunc='mean')
#X=data.pivot_table('rating',rows='occupation',cols='gender',aggfunc='mean')
In [ ]:
X