In [7]:
# Environment setup. numpy is imported explicitly so this cell also runs on a
# fresh kernel that was not started in pylab mode (where `np` is pre-injected).
import platform

import numpy as np
import pandas as pd

# Record the interpreter and library versions used for this run.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('python %s' % platform.python_version())
print('numpy %s' % np.__version__)
print('pandas %s' % pd.__version__)

# Widen numpy's print line width so printed arrays wrap less often.
np.set_printoptions(linewidth=150)


2.7.3
numpy 1.7.1
pandas 0.13.0

The pandas R interface (`pandas.rpy`) does not work yet: as the next cell shows, the dataset comes back as a single unparsed column.

http://pandas.pydata.org/pandas-docs/dev/r_interface.html


In [10]:
# Try to load the 'Boston' dataset through the pandas R bridge.
import pandas.rpy.common as com
bos = com.load_data('Boston')
# NOTE(review): the saved output for this cell reports "5 rows x 1 columns",
# i.e. every field ends up inside one column, so `bos` is not usable as-is.
bos.head()


Out[10]:
X.crim.zn.indus.chas.nox.rm.age.dis.rad.tax.ptratio.black.lstat.medv
1 1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,29...
2 2,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,2...
3 3,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,2...
4 4,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,2...
5 5,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,2...

5 rows × 1 columns

Instead, `Boston.csv` was saved from RStudio with the following R code:

library(MASS)
write.csv("Boston.csv", x=Boston)

We now read it back into a pandas DataFrame


In [11]:
# Load the CSV exported from R; the relative path is resolved against the
# notebook's working directory.
boston = pd.read_csv('data/Boston.csv')

In [12]:
# Peek at the first rows to check how the columns were parsed.
boston.head()


Out[12]:
Unnamed: 0 crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
0 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2

5 rows × 15 columns


In [13]:
# Inspect the column labels produced by read_csv.
boston.columns


Out[13]:
Index([u'Unnamed: 0', u'crim', u'zn', u'indus', u'chas', u'nox', u'rm', u'age', u'dis', u'rad', u'tax', u'ptratio', u'black', u'lstat', u'medv'], dtype='object')

In [14]:
# Drop the 'Unnamed: 0' column (it only repeats the row number). The
# membership check keeps this cell safe to re-run: a bare `del` raises
# KeyError once the column has already been removed.
if 'Unnamed: 0' in boston.columns:
    del boston['Unnamed: 0']
boston.head()


Out[14]:
crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
0 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2

5 rows × 14 columns


In [15]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
boston.describe()


Out[15]:
crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 3.613524 11.363636 11.136779 0.069170 0.554695 6.284634 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 12.653063 22.532806
std 8.601545 23.322453 6.860353 0.253994 0.115878 0.702617 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 7.141062 9.197104
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000 5.000000
25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 6.950000 17.025000
50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 11.360000 21.200000
75% 3.677082 12.500000 18.100000 0.000000 0.624000 6.623500 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000 25.000000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000 50.000000

8 rows × 14 columns


In [16]:
# Pairwise scatter plots of every column. Binding the return value to `_`
# keeps it from being echoed as the cell's output.
_ = pd.tools.plotting.scatter_matrix(boston, figsize=(14, 10))



In [17]:
# Response column and the predictor column(s) used for the regression.
# Wider candidate predictor set:
#   ['crim', 'zn', 'indus', 'rm', 'black', 'lstat']
outcome = ['medv']
factors = ['lstat']

# Scatter matrix limited to the chosen predictor(s) plus the response.
_ = pd.tools.plotting.scatter_matrix(
    boston[factors + outcome],
    figsize=(14, 10),
)



In [18]:
from sklearn import linear_model, neighbors

In [19]:
# Design matrix and response, both selected with a list of column names so
# each stays a DataFrame. To use every predictor instead:
#   boston[[col for col in boston.columns if col != 'medv']]
X, y = boston[factors], boston[outcome]
n_samples, n_features = X.shape

# Report types, shapes and the unpacked dimensions, one line each.
print('%s %s' % (type(X), type(y)))
print('%s %s' % (X.shape, y.shape))
print('%s %s' % (n_samples, n_features))


<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>
(506, 1) (506, 1)
506 1

In [20]:
# Convert to plain numpy arrays for scikit-learn. `.values` replaces
# `.as_matrix()`, which later pandas releases deprecated and then removed;
# both give the same ndarray.
X2 = X.values
y2 = y.values

In [21]:
# Hold out the first 20% of rows (file order, no shuffling) as the test set
# and train on the remaining rows.
n_test = int(n_samples * 0.2)
X_test, y_test = X2[:n_test, :], y2[:n_test]
X_train, y_train = X2[n_test:, :], y2[n_test:]

# Report the shape and type of each piece.
for x in (X_test, y_test, X_train, y_train):
    print('%s %s' % (x.shape, type(x)))

# Training features and targets must have the same number of rows.
assert len(X_train) == len(y_train)


(101L, 1L) <type 'numpy.ndarray'>
(101L, 1L) <type 'numpy.ndarray'>
(405L, 1L) <type 'numpy.ndarray'>
(405L, 1L) <type 'numpy.ndarray'>

In [22]:
# Fit a LinearRegression estimator on the training split.
clf = linear_model.LinearRegression()
clf.fit(X_train, y_train)


Out[22]:
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

In [23]:
# Fitted parameters.
print('intercept=%s, coef=%s' % (clf.intercept_, clf.coef_))

# np.mean of the squared residuals is the mean squared error (MSE), not the
# residual sum of squares, so the line is labelled accordingly.
print('MSE = %f' % np.mean((clf.predict(X_test) - y_test) ** 2))
print('R^2 train=%f, test=%f' % (clf.score(X_train, y_train), clf.score(X_test, y_test)))

# End points of the fitted line over the observed training range.
x0, x1 = X_train.min(), X_train.max()
print('%s %s' % (x0, x1))

# predict() gets an explicit (n_samples, n_features) array instead of bare
# scalars; np.ravel flattens the result whether it comes back 1-D or 2-D.
y0, y1 = np.ravel(clf.predict(np.array([[x0], [x1]])))
print('%s %s' % (y0, y1))
print('%s %s' % ((x0, y0), (x1, y1)))


intercept=[ 35.65963715], coef=[[-0.99684203]]
RSS = 23.739423
R^2 train=0.559654, test=0.316047
1.73 37.97
33.9351004407 -2.19045468127
(1.73, 33.935100440662595) (37.969999999999999, -2.1904546812728185)

In [24]:
# Training points with the fitted regression line drawn between its two
# end points (x0, y0) and (x1, y1).
scatter(X_train, y_train, color='b', marker='x')
plot([x0, x1], [y0, y1], 'r-')
xlabel('lstat (Percentage of lower status people)')
ylabel('Median Value in $1000')
title('Median Value vs. lstat')


Out[24]:
<matplotlib.text.Text at 0x24e66710>

Plot predicted vs actual for test data


In [ ]:
# Predicted vs. actual response on the held-out rows.
y_lo, y_hi = y_test.min(), y_test.max()
scatter(y_test, clf.predict(X_test))
print('%s %s' % ([y_lo, y_lo], [y_hi, y_hi]))

# Red diagonal: the line on which perfect predictions would fall.
plt.plot([y_lo, y_hi], [y_lo, y_hi], 'r-')
xlabel('Actual median value / $1000')
ylabel('Predicted median value / $1000')
title('Predicted vs. actual for OLS regression on lstat')

In [30]:
# 5-fold cross-validation of `clf` on the full arrays. No `scoring` argument
# is given here, so the estimator's default scorer applies.
from sklearn import cross_validation
scores = cross_validation.cross_val_score(clf, X2, y2, cv=5)


Out[30]:
{'copy_X': True, 'fit_intercept': True, 'normalize': False}

In [35]:
# Per-fold scores together with their mean.
scores, scores.mean()


Out[35]:
(array([ 0.31784807,  0.5406078 ,  0.07608699,  0.42423767,  0.1267687 ]),
 0.29710984600668555)

In [36]:
# Repeat the 5-fold cross-validation with R^2 requested explicitly, then
# show the per-fold scores and their mean.
scores2 = cross_validation.cross_val_score(clf, X2, y2, cv=5, scoring='r2')
scores2, scores2.mean()


Out[36]:
(array([ 0.31784807,  0.5406078 ,  0.07608699,  0.42423767,  0.1267687 ]),
 0.29710984600668555)

In [ ]:
def summary(clf, X, y):
    """Print and return the standard deviation of ``clf``'s residuals.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like
        Features, passed unchanged to ``clf.predict``.
    y : array-like
        Observed targets; must hold as many values as there are predictions.

    Returns
    -------
    float
        Standard deviation of ``clf.predict(X) - y``.
    """
    # Flatten both sides so an (n,) prediction and an (n, 1) target subtract
    # element-wise instead of broadcasting into an (n, n) matrix.
    y_pred = np.asarray(clf.predict(X), dtype=float).ravel()
    y_true = np.asarray(y, dtype=float).ravel()
    diff = y_pred - y_true
    # An error estimate should be on the scale of y, so use the standard
    # deviation of the residuals rather than their variance.
    std_error = diff.std()
    print('std err: %f' % std_error)
    return std_error

In [ ]:
from sklearn.cross_validation import Bootstrap
from sklearn.lda import LDA

# NOTE(review): this rebinds `clf`, replacing the LinearRegression estimator
# used in the earlier cells - confirm that is intended.
clf = LDA()

# Bootstrap needs the total number of samples as its first argument; calling
# it with no arguments raises TypeError. The fixed seed makes the resampling
# repeatable across runs.
boot = Bootstrap(n_samples, random_state=0)