In [1]:

    
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
#import scipy.io.wavfile
import scipy.ndimage as sp
#import calendar

Histograms, Means, and Standard Deviations



In [2]:

    
h = [63, 66, 71, 65, 70, 66, 67, 65, 67, 74, 64, 75, 68, 67, 70, 73, 66, 70, 72, 62, 68, 
     70, 62, 69, 66, 70, 70, 68, 69, 70, 71, 65, 64, 71, 64, 78, 69, 70, 65, 66, 72, 64]

d = {}
for i in h:
    d[i] = d.get(i, 0)+1

histlist = []
for i in d:
    histlist.append((i, d.get(i)))
    
sorted(histlist)
hist0 = [i for (i,j) in histlist]
hist1 = [j for (i,j) in histlist]



In [13]:

    
plt.bar(hist0, hist1, width=1)
plt.title("Faculty Heights Histogram")
plt.xlabel("Height")
plt.xticks(np.arange(78-62+2)+62)
plt.ylabel("Frequency")

fig = plt.gcf()



In [5]:

    
heights_mean = sum(h)/len(h)
heights_stdDev = sqrt(sum([(heights_mean-i)**2 for i in h])//len(h))

heights_mean_auto = np.mean(h)
heights_stdDev_auto = np.std(h)

print [heights_mean, heights_mean_auto] #these are equal
print [heights_stdDev, heights_stdDev_auto] #these are equal









    



[477/7, 68.142857142857139]
[1/7*sqrt(1831/3), 3.5292750407024345]

Correlation



In [6]:

    
#not used any more - panda data frames are easier
stringData = []
with open('./stateData.csv','rb') as csvfile:
    stateData = csv.reader(csvfile, delimiter=' ', quotechar='|')
    for line in stateData:
        stringData.append(line)
data = []
for j in range(len(stringData)-1):
    data.append([i for i in stringData[j][0].split(',')])



In [7]:

    
pd.read_csv('./stateData.csv')









    Out[7]:






  
    
      
      State Name
      Poverty
      Infant Mort
      White
      Crime
      Doctors
      Traf Deaths
      University
      Unemployed
      Income
    
  
  
    
      0
      Alabama
      15.7
      9.0
      71.0
      448
      218.2
      1.81
      22.0
      5.0
      42666
    
    
      1
      Alaska
      8.4
      6.9
      70.6
      661
      228.5
      1.63
      27.3
      6.7
      68460
    
    
      2
      Arizona
      14.7
      6.4
      86.5
      483
      209.7
      1.69
      25.1
      5.5
      50958
    
    
      3
      Arkansas
      17.3
      8.5
      80.8
      529
      203.4
      1.96
      18.8
      5.1
      38815
    
    
      4
      California
      13.3
      5.0
      76.6
      523
      268.7
      1.21
      29.6
      7.2
      61021
    
    
      5
      Colorado
      11.4
      5.7
      89.7
      348
      259.7
      1.14
      35.6
      4.9
      56993
    
    
      6
      Connecticut
      9.3
      6.2
      84.3
      256
      376.4
      0.86
      35.6
      5.7
      68595
    
    
      7
      Delaware
      10.0
      8.3
      74.3
      689
      250.9
      1.23
      27.5
      4.8
      57989
    
    
      8
      Florida
      13.2
      7.3
      79.8
      723
      247.9
      1.56
      25.8
      6.2
      47778
    
    
      9
      Georgia
      14.7
      8.1
      65.4
      493
      217.4
      1.46
      27.5
      6.2
      50861
    
    
      10
      Hawaii
      9.1
      5.6
      29.7
      273
      317.0
      1.33
      29.1
      3.9
      67214
    
    
      11
      Idaho
      12.6
      6.8
      94.6
      239
      168.8
      1.60
      24.0
      4.9
      47576



In [8]:

    
#This was easier than calculating it by hand, and more useful to me.

def findCorrelation(df, test1, test2):
    mean1 = df[test1].mean()
    mean2 = df[test2].mean()
    r=0
    elements = len(df[test1])
    for i in range(elements):
        r+=((df[test1][i])-mean1)*((df[test2][i])-mean2)
    rxy = r/elements/df[test1].std()/df[test2].std()
    return rxy

data = pd.read_csv('./stateData.csv')
findCorrelation(data,'University','Income')









    Out[8]:





0.69420837297396532



In [9]:

    
#Create three column vector of (data-mean)/stdDev for 3 given categories in dataframe

row_vectors = []
for i in ['University','Income','Infant Mort']:
    l = data[i]
    l_vector = (l-l.mean())/l.std()
    row_vectors.append(l_vector)

col_vectors = np.transpose(row_vectors)
print col_vectors
row_vectors = np.transpose(col_vectors)
print row_vectors









    



[[-1.08551776 -1.21397563  1.57928167]
 [-0.00509633  1.34335929 -0.06525957]
 [-0.45357315 -0.39186889 -0.45681701]
 [-1.73784768 -1.59578135  1.18772423]
 [ 0.4637658   0.60582287 -1.55317784]
 [ 1.68688441  0.20646856 -1.00499742]
 [ 1.68688441  1.3567438  -0.61343999]
 [ 0.03567429  0.30521654  1.03110125]
 [-0.31087598 -0.70714861  0.24798638]
 [ 0.03567429 -0.40148592  0.87447828]
 [ 0.36183925  1.21982516 -1.08330891]
 [-0.67781156 -0.72717581 -0.14357106]]
[[-1.08551776 -0.00509633 -0.45357315 -1.73784768  0.4637658   1.68688441
   1.68688441  0.03567429 -0.31087598  0.03567429  0.36183925 -0.67781156]
 [-1.21397563  1.34335929 -0.39186889 -1.59578135  0.60582287  0.20646856
   1.3567438   0.30521654 -0.70714861 -0.40148592  1.21982516 -0.72717581]
 [ 1.57928167 -0.06525957 -0.45681701  1.18772423 -1.55317784 -1.00499742
  -0.61343999  1.03110125  0.24798638  0.87447828 -1.08330891 -0.14357106]]



In [20]:

    
correlation = sp.imread('./correlation.png')

fig = plt.figure(figsize=(15,15))
plt.grid(False)
plt.imshow(correlation)









    Out[20]:





<matplotlib.image.AxesImage object at 0x7f9ee7adb590>

Linear Regression



In [10]:

    
t = [53,54,58,66,69,70,71,73,81]
c = [19,26,21,33,31,36,36,38,45]

xi = sum(t)
yi = sum(c)
xsqr = sum([i**2 for i in t])
xiyi = sum([t[i]*c[i] for i in range(len(t))])
n  = len(t)

print xi
print yi
print xsqr
print xiyi
print n



In [11]:

    
a = [[xsqr, xi],[xi,n]]
a_inv = np.linalg.inv(a)
v = [[xiyi],[yi]]

[[a],[b]] = np.dot(a_inv,v) #find a and b

#find two points on line of best fit for plotting
y1 = a*t[0]+b
y2 = a*t[n-1]+b



In [12]:

    
plt.plot(t,c)
plt.plot([t[0], t[n-1]],[y1, y2]) #line of best fit 
plt.show()



In [ ]:

	State Name	Poverty	Infant Mort	White	Crime	Doctors	Traf Deaths	University	Unemployed	Income
0	Alabama	15.7	9.0	71.0	448	218.2	1.81	22.0	5.0	42666
1	Alaska	8.4	6.9	70.6	661	228.5	1.63	27.3	6.7	68460
2	Arizona	14.7	6.4	86.5	483	209.7	1.69	25.1	5.5	50958
3	Arkansas	17.3	8.5	80.8	529	203.4	1.96	18.8	5.1	38815
4	California	13.3	5.0	76.6	523	268.7	1.21	29.6	7.2	61021
5	Colorado	11.4	5.7	89.7	348	259.7	1.14	35.6	4.9	56993
6	Connecticut	9.3	6.2	84.3	256	376.4	0.86	35.6	5.7	68595
7	Delaware	10.0	8.3	74.3	689	250.9	1.23	27.5	4.8	57989
8	Florida	13.2	7.3	79.8	723	247.9	1.56	25.8	6.2	47778
9	Georgia	14.7	8.1	65.4	493	217.4	1.46	27.5	6.2	50861
10	Hawaii	9.1	5.6	29.7	273	317.0	1.33	29.1	3.9	67214
11	Idaho	12.6	6.8	94.6	239	168.8	1.60	24.0	4.9	47576