In [7]:
import platform

import numpy as np  # explicit import: `np` is used below but was not imported in this cell
import pandas as pd

# Record the interpreter and library versions this notebook was run with.
# Single-argument print(...) with %-formatting prints the same text on
# Python 2 and Python 3.
print('python %s' % platform.python_version())
print('numpy %s' % np.version.version)
print('pandas %s' % pd.__version__)

# Let printed NumPy arrays use up to 150 characters per line before wrapping.
np.set_printoptions(linewidth=150)
# NOTE(review): later cells use `plt` and other plotting names that are never
# imported in the visible notebook, so it was presumably run under %pylab —
# confirm, or import matplotlib.pyplot explicitly.
The pandas R interface does not work yet
In [10]:
# Try to load R's Boston dataset directly through pandas' rpy bridge and
# preview the result.
# NOTE(review): the text above this cell says the R interface does not work
# yet, and the notebook goes on to read a CSV exported from R instead. The R
# snippet there loads Boston via library(MASS), so load_data may need the
# package named explicitly — confirm.
import pandas.rpy.common as com
bos = com.load_data('Boston')
bos.head()
Out[10]:
Boston.csv was saved from RStudio using the following R commands:
library(MASS)
write.csv("Boston.csv", x=Boston)
We now read it back into a pandas DataFrame
In [11]:
# Load the Boston data exported from R. The path is relative to the directory
# the notebook is run from.
boston_csv_path = 'data/Boston.csv'
boston = pd.read_csv(boston_csv_path)
In [12]:
# Preview the first rows of the freshly loaded frame.
boston.head()
Out[12]:
In [13]:
# List the column names that read_csv produced.
boston.columns
Out[13]:
In [14]:
# Remove the 'Unnamed: 0' column by selecting every other column into a new
# frame. Unlike `del boston['Unnamed: 0']`, this does not raise KeyError when
# the cell is re-run after the column is already gone.
# NOTE(review): 'Unnamed: 0' is presumably the row-name column written by R's
# write.csv — confirm against the CSV.
boston = boston[[col for col in boston.columns if col != 'Unnamed: 0']]
boston.head()
Out[14]:
In [15]:
# Per-column summary statistics for the cleaned frame.
boston.describe()
Out[15]:
In [16]:
# Scatter-plot matrix over all columns of the frame.
# The trailing semicolon suppresses the returned value's repr. The original
# `_ = ...` did the same but rebinds `_`, which IPython uses to hold the last
# cell output.
pd.tools.plotting.scatter_matrix(boston, figsize=(14, 10));
In [17]:
# Predictor column(s) and response column used by the regression cells below.
# Only 'lstat' is active; a wider candidate list is kept here for reference:
#   ['crim', 'zn', 'indus', 'rm', 'black', 'lstat']
factors = ['lstat']
outcome = ['medv']

# Scatter-plot matrix restricted to the chosen columns. The trailing semicolon
# suppresses the returned value's repr without rebinding `_`, which IPython
# uses to hold the last cell output.
pd.tools.plotting.scatter_matrix(boston[factors + outcome], figsize=(14, 10));
In [18]:
from sklearn import linear_model, neighbors
In [19]:
# Split the frame into predictor columns (X) and the response column (y).
# Both selections index with a list of column names.
# To use every column except the response as predictors instead:
#   boston[[col for col in boston.columns if col != 'medv']]
X = boston[factors]
y = boston[outcome]
print('%s %s' % (type(X), type(y)))

# Row and column counts of the predictor frame, reused by later cells.
n_samples, n_features = X.shape
print('%s %s' % (X.shape, y.shape))
print('%s %s' % (n_samples, n_features))
In [20]:
# Convert to plain NumPy arrays for scikit-learn. `.values` yields the same
# ndarray as `.as_matrix()` and, unlike it, is still available in current
# pandas releases (as_matrix was deprecated and later removed).
X2 = X.values
y2 = y.values
In [21]:
# Hold out the leading fraction of rows as the test set; train on the rest.
# NOTE(review): the split is positional, not shuffled — confirm that the row
# order of the CSV carries no structure.
TEST_FRACTION = 0.2
n_test = int(n_samples * TEST_FRACTION)
X_test, X_train = X2[:n_test, :], X2[n_test:, :]
y_test, y_train = y2[:n_test], y2[n_test:]

# Show the shape and type of each piece of the split.
for part in (X_test, y_test, X_train, y_train):
    print('%s %s' % (part.shape, type(part)))

# Predictors and targets must stay row-aligned in both splits, and the two
# splits together must account for every sample.
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]
assert X_train.shape[0] + X_test.shape[0] == n_samples
In [22]:
# Fit a linear regression on the training split.
# fit() is deliberately the last expression, so its return value is what the
# cell displays as output.
clf = linear_model.LinearRegression()
clf.fit(X_train, y_train)
Out[22]:
Print out the equivalent of R's summary()
In [23]:
# Fitted parameters of the linear model.
print('intercept=%s, coef=%s' % (clf.intercept_, clf.coef_))

# Mean squared prediction error on the held-out rows. This was labelled "RSS",
# but np.mean gives the mean, not the sum, of the squared residuals.
print('MSE = %f' % np.mean((clf.predict(X_test) - y_test) ** 2))
print('R^2 train=%f, test=%f' % (clf.score(X_train, y_train), clf.score(X_test, y_test)))

# End points of the fitted line across the range of the training predictor.
# min()/max() run over the whole array, so this assumes a single predictor column.
x0, x1 = X_train.min(), X_train.max()
print('%s %s' % (x0, x1))

# predict() takes a 2-D (n_samples, n_features) input, so wrap each scalar as a
# one-row, one-column array instead of passing it bare.
y0 = clf.predict([[x0]])[0][0]
y1 = clf.predict([[x1]])[0][0]
print('%s %s' % (y0, y1))
print('%s %s' % ((x0, y0), (x1, y1)))
In [24]:
# Training data with the fitted regression line drawn over it.
# Calls go through `plt` explicitly, matching the plt.plot call used in the
# predicted-vs-actual cell, rather than relying on bare pylab names.
plt.scatter(X_train, y_train, color='b', marker='x')
plt.plot([x0, x1], [y0, y1], 'r-')
plt.xlabel('lstat (Percentage of lower status people)')  # closing parenthesis was missing
plt.ylabel('Median Value in $1000')
plt.title('Median Value vs. lstat')
Out[24]:
Plot predicted vs actual for test data
In [ ]:
# Predicted vs. actual response on the held-out rows. Points lying on the red
# diagonal are predicted exactly.
# All plotting calls use the `plt` namespace (the original mixed a bare
# scatter() with plt.plot), and the range is computed once.
lo, hi = y_test.min(), y_test.max()
plt.scatter(y_test, clf.predict(X_test))
print('%s %s' % ([lo, lo], [hi, hi]))  # end points of the diagonal
plt.plot([lo, hi], [lo, hi], 'r-')
plt.xlabel('Actual median value / $1000')
plt.ylabel('Predicted median value / $1000')
plt.title('Predicted vs. actual for OLS regression on lstat')
In [30]:
# 5-fold cross-validation of the linear model over the full data set, using
# cross_val_score's default scoring.
# NOTE(review): sklearn.cross_validation is the legacy module name; newer
# scikit-learn releases provide cross_val_score in sklearn.model_selection —
# confirm against the installed version.
from sklearn import cross_validation
scores = cross_validation.cross_val_score(clf, X2, y2, cv=5)
Out[30]:
In [35]:
# Display the per-fold scores together with their mean.
scores, scores.mean()
Out[35]:
In [36]:
# Repeat the cross-validation with the scoring metric named explicitly ('r2'),
# then display the per-fold scores together with their mean.
scores2 = cross_validation.cross_val_score(clf, X2, y2, cv=5, scoring='r2')
scores2, scores2.mean()
Out[36]:
In [ ]:
def summary(clf, X, y):
    """Print and return the standard deviation of a fitted model's residuals.

    The original cell broke off inside an unterminated string literal and
    reported the residual *variance* under the label "std err". This version
    completes the statement and reports the square root of that variance.

    Parameters
    ----------
    clf : object exposing ``predict(X)``
        The fitted model to evaluate.
    X : array-like
        Predictors, passed unchanged to ``clf.predict``.
    y : array-like
        Observed targets; must support ``clf.predict(X) - y``.

    Returns
    -------
    The value of ``(clf.predict(X) - y).std()``, using whatever ``std``
    the residual object itself provides.
    """
    y_pred = clf.predict(X)
    diff = y_pred - y
    std_error = diff.std()
    print('std err: %s' % std_error)
    return std_error
In [ ]:
# Scratch set-up for a classifier and a bootstrap resampler.
# NOTE(review): this rebinds `clf`, which earlier cells use for the fitted
# LinearRegression; re-running those cells afterwards will see the LDA
# instance instead — confirm that is intended.
from sklearn.lda import LDA
clf = LDA()

# Bootstrap has to be told how many samples to draw indices from; calling it
# with no arguments raises TypeError. A fixed random_state keeps the
# resampling reproducible between runs.
from sklearn.cross_validation import Bootstrap
boot = Bootstrap(n_samples, random_state=0)