notebook.community

Edit and run



In [7]:

    
# Environment setup: import the numeric stack and record the versions in use.
import platform

# `np` was used in this cell without a visible import (presumably supplied by
# a pylab session); importing it here lets the cell run on a fresh kernel.
import numpy as np
import pandas as pd

# Single-argument print(...) with %-formatting emits the same text under the
# Python 2 print statement and the Python 3 print function.
print('python %s' % platform.python_version())
print('numpy %s' % np.version.version)
print('pandas %s' % pd.__version__)

# Widen NumPy's line width so array reprs wrap less in the output area.
np.set_printoptions(linewidth=150)









    



2.7.3
numpy 1.7.1
pandas 0.13.0

The pandas R interface (`pandas.rpy`) does not work yet: the data set loaded below comes back as a single unparsed column.

http://pandas.pydata.org/pandas-docs/dev/r_interface.html



In [10]:

    
# Try loading the 'Boston' data set through pandas' rpy bridge.
# NOTE(review): the output below shows every field packed into one column
# ("5 rows x 1 columns"), so this frame is not usable as-is.
import pandas.rpy.common as com
bos = com.load_data('Boston')
bos.head()









    Out[10]:






  
    
      
      X.crim.zn.indus.chas.nox.rm.age.dis.rad.tax.ptratio.black.lstat.medv
    
  
  
    
      1
       1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,29...
    
    
      2
       2,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,2...
    
    
      3
       3,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,2...
    
    
      4
       4,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,2...
    
    
      5
       5,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,2...
    
  

5 rows × 1 columns

`Boston.csv` was saved from RStudio with the following R commands:

library(MASS)
write.csv(Boston, file = "Boston.csv")

We now read it back into a pandas DataFrame



In [11]:

    
# Load the CSV exported from R; the path is relative to the working directory.
boston = pd.read_csv('data/Boston.csv')



In [12]:

    
# Preview the first rows; note the extra 'Unnamed: 0' column
# (presumably the row names written by R's write.csv).
boston.head()









    Out[12]:






  
    
      
      Unnamed: 0
      crim
      zn
      indus
      chas
      nox
      rm
      age
      dis
      rad
      tax
      ptratio
      black
      lstat
      medv
    
  
  
    
      0
       1
       0.00632
       18
       2.31
       0
       0.538
       6.575
       65.2
       4.0900
       1
       296
       15.3
       396.90
       4.98
       24.0
    
    
      1
       2
       0.02731
        0
       7.07
       0
       0.469
       6.421
       78.9
       4.9671
       2
       242
       17.8
       396.90
       9.14
       21.6
    
    
      2
       3
       0.02729
        0
       7.07
       0
       0.469
       7.185
       61.1
       4.9671
       2
       242
       17.8
       392.83
       4.03
       34.7
    
    
      3
       4
       0.03237
        0
       2.18
       0
       0.458
       6.998
       45.8
       6.0622
       3
       222
       18.7
       394.63
       2.94
       33.4
    
    
      4
       5
       0.06905
        0
       2.18
       0
       0.458
       7.147
       54.2
       6.0622
       3
       222
       18.7
       396.90
       5.33
       36.2
    
  

5 rows × 15 columns



In [13]:

    
# List the column labels to confirm the exact name of the stray first column.
boston.columns









    Out[13]:





Index([u'Unnamed: 0', u'crim', u'zn', u'indus', u'chas', u'nox', u'rm', u'age', u'dis', u'rad', u'tax', u'ptratio', u'black', u'lstat', u'medv'], dtype='object')



In [14]:

    
# Drop the leftover row-number column that came in with the CSV.
# Guarded so the cell can be re-run without raising KeyError once the column
# has already been removed.
if 'Unnamed: 0' in boston.columns:
    del boston['Unnamed: 0']
boston.head()









    Out[14]:






  
    
      
      crim
      zn
      indus
      chas
      nox
      rm
      age
      dis
      rad
      tax
      ptratio
      black
      lstat
      medv
    
  
  
    
      0
       0.00632
       18
       2.31
       0
       0.538
       6.575
       65.2
       4.0900
       1
       296
       15.3
       396.90
       4.98
       24.0
    
    
      1
       0.02731
        0
       7.07
       0
       0.469
       6.421
       78.9
       4.9671
       2
       242
       17.8
       396.90
       9.14
       21.6
    
    
      2
       0.02729
        0
       7.07
       0
       0.469
       7.185
       61.1
       4.9671
       2
       242
       17.8
       392.83
       4.03
       34.7
    
    
      3
       0.03237
        0
       2.18
       0
       0.458
       6.998
       45.8
       6.0622
       3
       222
       18.7
       394.63
       2.94
       33.4
    
    
      4
       0.06905
        0
       2.18
       0
       0.458
       7.147
       54.2
       6.0622
       3
       222
       18.7
       396.90
       5.33
       36.2
    
  

5 rows × 14 columns



In [15]:

    
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
boston.describe()









    Out[15]:






  
    
      
      crim
      zn
      indus
      chas
      nox
      rm
      age
      dis
      rad
      tax
      ptratio
      black
      lstat
      medv
    
  
  
    
      count
       506.000000
       506.000000
       506.000000
       506.000000
       506.000000
       506.000000
       506.000000
       506.000000
       506.000000
       506.000000
       506.000000
       506.000000
       506.000000
       506.000000
    
    
      mean
         3.613524
        11.363636
        11.136779
         0.069170
         0.554695
         6.284634
        68.574901
         3.795043
         9.549407
       408.237154
        18.455534
       356.674032
        12.653063
        22.532806
    
    
      std
         8.601545
        23.322453
         6.860353
         0.253994
         0.115878
         0.702617
        28.148861
         2.105710
         8.707259
       168.537116
         2.164946
        91.294864
         7.141062
         9.197104
    
    
      min
         0.006320
         0.000000
         0.460000
         0.000000
         0.385000
         3.561000
         2.900000
         1.129600
         1.000000
       187.000000
        12.600000
         0.320000
         1.730000
         5.000000
    
    
      25%
         0.082045
         0.000000
         5.190000
         0.000000
         0.449000
         5.885500
        45.025000
         2.100175
         4.000000
       279.000000
        17.400000
       375.377500
         6.950000
        17.025000
    
    
      50%
         0.256510
         0.000000
         9.690000
         0.000000
         0.538000
         6.208500
        77.500000
         3.207450
         5.000000
       330.000000
        19.050000
       391.440000
        11.360000
        21.200000
    
    
      75%
         3.677082
        12.500000
        18.100000
         0.000000
         0.624000
         6.623500
        94.075000
         5.188425
        24.000000
       666.000000
        20.200000
       396.225000
        16.955000
        25.000000
    
    
      max
        88.976200
       100.000000
        27.740000
         1.000000
         0.871000
         8.780000
       100.000000
        12.126500
        24.000000
       711.000000
        22.000000
       396.900000
        37.970000
        50.000000
    
  

8 rows × 14 columns



In [16]:

    
# Pairwise scatter plots of all columns; binding the result to `_` keeps the
# returned value out of the cell output.
# NOTE(review): `pd.tools.plotting` matches the pandas 0.13 reported above;
# newer pandas is believed to expose this as `pandas.plotting.scatter_matrix`
# - confirm before upgrading.
_ = pd.tools.plotting.scatter_matrix(boston, figsize=(14, 10))



In [17]:

    
# Predictor and response column names used by the regression cells below.
# The trailing comment on `factors` keeps a wider candidate predictor list.
factors = ['lstat'] # ['crim','zn', 'indus', 'rm', 'black', 'lstat']
outcome = ['medv']
# Scatter matrix restricted to the chosen predictor(s) plus the response.
_ = pd.tools.plotting.scatter_matrix(boston[factors + outcome], figsize=(14, 10))



In [18]:

    
from sklearn import linear_model, neighbors



In [19]:

    
# Select the feature and target columns. Indexing with a list keeps both as
# two-dimensional DataFrames, as the printed types and shapes confirm.
X = boston[factors] # boston[[col for col in boston.columns if col != 'medv']]
y = boston[outcome]
print type(X), type(y)
# Unpack the row and column counts for use in later cells.
n_samples, n_features = X.shape
print X.shape, y.shape
print n_samples, n_features









    



<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>
(506, 1) (506, 1)
506 1



In [20]:

    
# Convert both frames to plain NumPy arrays for scikit-learn.
# `.values` yields the same ndarray as `.as_matrix()` and is the accessor that
# later pandas releases keep, so the cell stays runnable after an upgrade.
X2 = X.values
y2 = y.values



In [21]:

    
# Hold out the first 20% of rows as the test set; the rest is the training set.
# NOTE(review): the split is positional (no shuffling) - confirm the row order
# carries no structure that would bias the held-out rows.
n_test = int(n_samples * 0.2)
X_test, X_train = X2[:n_test, :], X2[n_test:, :]
y_test, y_train = y2[:n_test], y2[n_test:]
# Report the shape and type of each piece of the split.
for x in X_test, y_test, X_train, y_train:
    print x.shape, type(x)
# Training features and targets must have the same number of rows.
assert X_train.shape[0] == y_train.shape[0]









    



(101L, 1L) <type 'numpy.ndarray'>
(101L, 1L) <type 'numpy.ndarray'>
(405L, 1L) <type 'numpy.ndarray'>
(405L, 1L) <type 'numpy.ndarray'>



In [22]:

    
# Fit an ordinary least squares model on the training split.
clf = linear_model.LinearRegression()
clf.fit(X_train, y_train)









    Out[22]:





LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

Print out the equivalent of R's summary()

http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html#example-linear-model-plot-ols-py



In [23]:

    
# Report the fitted line and its goodness of fit.
# Single-argument print(...) with %-formatting emits the same text under the
# Python 2 print statement and the Python 3 print function.
print('intercept=%s, coef=%s' % (clf.intercept_, clf.coef_))
# np.mean of the squared residuals is the mean squared error, not the residual
# sum of squares that the previous "RSS" label claimed.
print('MSE = %f' % np.mean((clf.predict(X_test) - y_test) ** 2))
print('R^2 train=%f, test=%f' % (clf.score(X_train, y_train), clf.score(X_test, y_test)))

# End points of the fitted line across the observed training range; these are
# reused by the plotting cell.
x0, x1 = X_train.min(), X_train.max()
print('%s %s' % (x0, x1))
# Give predict() an explicit (1, 1) input - one sample with one feature -
# instead of a bare scalar.
y0 = clf.predict([[x0]])[0][0]
y1 = clf.predict([[x1]])[0][0]
print('%s %s' % (y0, y1))
print('%s %s' % ((x0, y0), (x1, y1)))









    



intercept=[ 35.65963715], coef=[[-0.99684203]]
RSS = 23.739423
R^2 train=0.559654, test=0.316047
1.73 37.97
33.9351004407 -2.19045468127
(1.73, 33.935100440662595) (37.969999999999999, -2.1904546812728185)



In [24]:

    
# Training data with the fitted regression line overlaid.
# Uses the explicit `plt.` namespace (already used elsewhere in this notebook)
# rather than bare pylab names; the trailing semicolon hides the Text repr.
plt.scatter(X_train, y_train, color='b', marker='x')
plt.plot([x0, x1], [y0, y1], 'r-')
# The closing parenthesis was missing from this label.
plt.xlabel('lstat (Percentage of lower status people)')
plt.ylabel('Median Value in $1000')
plt.title('Median Value vs. lstat');









    Out[24]:





<matplotlib.text.Text at 0x24e66710>

Plot predicted vs actual for test data



In [ ]:

    
# Predicted vs. actual target on the held-out rows, with a y = x reference line.
y_lo, y_hi = y_test.min(), y_test.max()
plt.scatter(y_test, clf.predict(X_test))
# Echo the two end points of the reference line.
print [y_lo, y_lo], [y_hi, y_hi]
plt.plot([y_lo, y_hi], [y_lo, y_hi], 'r-')
plt.xlabel('Actual median value / $1000')
plt.ylabel('Predicted median value / $1000')
plt.title('Predicted vs. actual for OLS regression on lstat')



In [30]:

    
# 5-fold cross-validation of the linear model over the full data set.
# NOTE(review): the Out[30] shown below is a parameter dict, which this cell
# (ending in an assignment) would not display - it looks like stale output
# from an earlier version of the cell.
from sklearn import cross_validation
scores = cross_validation.cross_val_score(clf, X2, y2, cv=5)









    Out[30]:





{'copy_X': True, 'fit_intercept': True, 'normalize': False}



In [35]:

    
# Per-fold scores together with their mean.
scores, scores.mean()









    Out[35]:





(array([ 0.31784807,  0.5406078 ,  0.07608699,  0.42423767,  0.1267687 ]),
 0.29710984600668555)



In [36]:

    
# Same cross-validation with scoring='r2' requested explicitly; the displayed
# values are identical to the default-scoring run in Out[35].
scores2 = cross_validation.cross_val_score(clf, X2, y2, cv=5, scoring='r2')
scores2, scores2.mean()









    Out[36]:





(array([ 0.31784807,  0.5406078 ,  0.07608699,  0.42423767,  0.1267687 ]),
 0.29710984600668555)



In [ ]:

    
def summary(clf, X, y):
    """Print and return the spread of a fitted model's residuals.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict(X)``.
    X : array of features, passed straight to ``clf.predict``.
    y : array of observed targets; must have the same shape as
        ``clf.predict(X)`` so the subtraction is element-wise.

    Returns
    -------
    float
        Standard deviation of the residuals ``clf.predict(X) - y``.

    NOTE(review): this is the plain (population) standard deviation; R's
    "residual standard error" divides by the residual degrees of freedom
    instead - confirm which one is wanted.
    """
    y_pred = clf.predict(X)
    diff = y_pred - y
    # Take the square root: the earlier draft stored the variance itself under
    # the name `std_error`, and its print statement was left unterminated.
    std_error = diff.var() ** 0.5
    print('std err: %f' % std_error)
    return std_error

http://nbviewer.ipython.org/github/mwaskom/Psych216/blob/master/week5_tutorial.ipynb



In [ ]:

    
# Scratch setup for a bootstrap-resampling experiment.
# NOTE(review): this rebinds `clf`, replacing the fitted LinearRegression used
# by the earlier cells.
from sklearn.lda import LDA
clf = LDA()
from sklearn.cross_validation import Bootstrap
# Bootstrap takes the total number of samples as its first argument; calling
# it with no arguments raises TypeError.
boot = Bootstrap(n_samples)

	X.crim.zn.indus.chas.nox.rm.age.dis.rad.tax.ptratio.black.lstat.medv
1	1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,29...
2	2,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,2...
3	3,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,2...
4	4,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,2...
5	5,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,2...

	Unnamed: 0	crim	zn	indus	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv
0	1	0.00632	18	2.31	0.538	6.575	65.2	4.0900	1	296	15.3	396.90	4.98	24.0
1	2	0.02731	0	7.07	0.469	6.421	78.9	4.9671	2	242	17.8	396.90	9.14	21.6
2	3	0.02729	0	7.07	0.469	7.185	61.1	4.9671	2	242	17.8	392.83	4.03	34.7
3	4	0.03237	0	2.18	0.458	6.998	45.8	6.0622	3	222	18.7	394.63	2.94	33.4
4	5	0.06905	0	2.18	0.458	7.147	54.2	6.0622	3	222	18.7	396.90	5.33	36.2

	crim	zn	indus	chas	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv
count	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000
mean	3.613524	11.363636	11.136779	0.069170	0.554695	6.284634	68.574901	3.795043	9.549407	408.237154	18.455534	356.674032	12.653063	22.532806
std	8.601545	23.322453	6.860353	0.253994	0.115878	0.702617	28.148861	2.105710	8.707259	168.537116	2.164946	91.294864	7.141062	9.197104
min	0.006320	0.000000	0.460000	0.000000	0.385000	3.561000	2.900000	1.129600	1.000000	187.000000	12.600000	0.320000	1.730000	5.000000
25%	0.082045	0.000000	5.190000	0.000000	0.449000	5.885500	45.025000	2.100175	4.000000	279.000000	17.400000	375.377500	6.950000	17.025000
50%	0.256510	0.000000	9.690000	0.000000	0.538000	6.208500	77.500000	3.207450	5.000000	330.000000	19.050000	391.440000	11.360000	21.200000
75%	3.677082	12.500000	18.100000	0.000000	0.624000	6.623500	94.075000	5.188425	24.000000	666.000000	20.200000	396.225000	16.955000	25.000000
max	88.976200	100.000000	27.740000	1.000000	0.871000	8.780000	100.000000	12.126500	24.000000	711.000000	22.000000	396.900000	37.970000	50.000000