In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
#import scipy.io.wavfile
import scipy.ndimage as sp
#import calendar
In [2]:
# Sample of heights that the rest of the notebook summarises and plots.
h = [63, 66, 71, 65, 70, 66, 67, 65, 67, 74, 64, 75, 68, 67, 70, 73, 66, 70, 72, 62, 68,
     70, 62, 69, 66, 70, 70, 68, 69, 70, 71, 65, 64, 71, 64, 78, 69, 70, 65, 66, 72, 64]

# Tally how many times each distinct height occurs.
d = {}
for i in h:
    d[i] = d.get(i, 0) + 1

# (height, count) pairs ordered by height. sorted() returns a new list rather
# than sorting in place, so its result has to be bound to a name.
histlist = sorted(d.items())

# Parallel lists for plotting: distinct heights and their frequencies.
hist0 = [i for (i, j) in histlist]
hist1 = [j for (i, j) in histlist]
In [13]:
# Bar chart of the height tallies: one unit-wide bar per distinct height.
plt.bar(hist0, hist1, width=1)
plt.xticks(np.arange(62, 80))  # a tick at every integer from 62 through 79
plt.xlabel("Height")
plt.ylabel("Frequency")
plt.title("Faculty Heights Histogram")
fig = plt.gcf()  # keep a handle on the figure that was just drawn
In [5]:
# Mean and population standard deviation of the heights, computed by hand and
# cross-checked against NumPy.
# float() keeps the division a true division under Python 2 as well.
heights_mean = float(sum(h)) / len(h)
# Population form (divide by N) so that it matches np.std's default (ddof=0).
# True division is required here: floor division would truncate the variance.
heights_stdDev = np.sqrt(sum([(heights_mean - i) ** 2 for i in h]) / len(h))
heights_mean_auto = np.mean(h)
heights_stdDev_auto = np.std(h)
print([heights_mean, heights_mean_auto])  # these agree up to floating-point rounding
print([heights_stdDev, heights_stdDev_auto])  # these agree up to floating-point rounding
In [6]:
# not used any more - pandas data frames are easier
# Reads ./stateData.csv with the csv module (space-delimited, '|' as the quote
# character) and then splits the first field of each row on commas.
stringData = []
with open('./stateData.csv', 'rb') as csvfile:
    stateData = csv.reader(csvfile, delimiter=' ', quotechar='|')
    stringData.extend(stateData)

# Every row except the last one read is converted.
data = [row[0].split(',') for row in stringData[:-1]]
In [7]:
# Load ./stateData.csv into a DataFrame. As the last expression of the cell it
# is rendered as the cell output; the result is not bound to a name here.
pd.read_csv('./stateData.csv')
Out[7]:
In [8]:
#This was easier than calculating it by hand, and more useful to me.
def findCorrelation(df, test1, test2):
    """Return the Pearson correlation coefficient between two columns of df.

    Parameters:
        df: pandas DataFrame holding both columns.
        test1: label of the first column.
        test2: label of the second column.

    Returns:
        The sample correlation coefficient as a float, or NaN when df has
        fewer than two rows (the sample standard deviation is undefined there).
    """
    x = df[test1]
    y = df[test2]
    elements = len(x)
    if elements < 2:
        return float('nan')
    # Sum of products of deviations, computed on whole columns so the result
    # does not depend on the index labels being 0..n-1.
    r = ((x - x.mean()) * (y - y.mean())).sum()
    # Series.std() is the sample standard deviation (ddof=1), so the
    # covariance must use the same n-1 divisor for r to land in [-1, 1].
    rxy = r / (elements - 1) / x.std() / y.std()
    return rxy
# Load the CSV into `data` (later cells read columns from it), then evaluate
# the University/Income correlation as the cell's displayed result.
data = pd.read_csv('./stateData.csv')
findCorrelation(data,'University','Income')
Out[8]:
In [9]:
# Build standardized vectors, (value - mean) / stdDev, for three columns of
# the dataframe and print them in both orientations.
row_vectors = []
for column in ['University', 'Income', 'Infant Mort']:
    values = data[column]
    row_vectors.append((values - values.mean()) / values.std())

# One column per category.
col_vectors = np.transpose(row_vectors)
print(col_vectors)

# Transposed back: one row per category, now as a NumPy array instead of a list.
row_vectors = np.transpose(col_vectors)
print(row_vectors)
In [20]:
# Load ./correlation.png and display it in a large figure.
# plt.imread is used instead of scipy.ndimage.imread, which was deprecated and
# later removed from SciPy; matplotlib is already imported by this notebook.
correlation = plt.imread('./correlation.png')
fig = plt.figure(figsize=(15, 15))
plt.grid(False)  # no grid lines drawn over the image
plt.imshow(correlation)
Out[20]:
In [10]:
# Paired observations: t supplies the x values and c the y values used by the
# line-of-best-fit cells further down.
t = [53, 54, 58, 66, 69, 70, 71, 73, 81]
c = [19, 26, 21, 33, 31, 36, 36, 38, 45]

# Sums that make up the least-squares normal equations.
xi = sum(t)                               # sum of x
yi = sum(c)                               # sum of y
xsqr = sum(x * x for x in t)              # sum of x squared
xiyi = sum(x * y for x, y in zip(t, c))   # sum of x*y over the pairs
n = len(t)                                # number of observations

# One value per line; the parenthesised form runs on both Python 2 and 3.
for value in (xi, yi, xsqr, xiyi, n):
    print(value)
In [11]:
# Solve the 2x2 normal equations
#   [[sum(x^2), sum(x)], [sum(x), n]] . [a, b]^T = [sum(x*y), sum(y)]^T
# for the slope a and intercept b of the line y = a*x + b.
normal_matrix = [[xsqr, xi], [xi, n]]
a_inv = np.linalg.inv(normal_matrix)
v = [[xiyi], [yi]]
(a,), (b,) = a_inv.dot(v)  # each row of the 2x1 product holds one coefficient

# Fitted y values at the first and last x, used as endpoints when plotting.
y1, y2 = [a * x + b for x in (t[0], t[n - 1])]
In [12]:
# Draw the observations, then overlay the fitted line between its two endpoints.
fit_x = [t[0], t[n - 1]]
fit_y = [y1, y2]
plt.plot(t, c)
plt.plot(fit_x, fit_y)  # line of best fit
plt.show()
In [ ]: