This notebook contains solutions to the Exploring Machine Learning Breakout
In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# use seaborn plotting defaults
# If this causes an error, you can comment it out.
import seaborn as sns
sns.set()
In [2]:
from astroML.datasets import fetch_rrlyrae_combined
from sklearn.model_selection import train_test_split
X, y = fetch_rrlyrae_combined()
# For now, we'll only fit the first two colors (u-g and g-r)
X = X[:, :2]
X_train, X_test, y_train, y_test = train_test_split(X, y)
# For the sake of speed, truncate the training data
X_train = X_train[::5]
y_train = y_train[::5]
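Before fitting anything, it's worth checking how unbalanced the two classes are. A quick check (assuming, as in the plot below, that label 0 marks main sequence stars and label 1 marks RR Lyrae):
# count the members of each class in the training set
print("main sequence stars:", np.sum(y_train == 0))
print("RR Lyrae stars:", np.sum(y_train == 1))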
In [3]:
N_plot = 5000
plt.scatter(X[-N_plot:, 0], X[-N_plot:, 1], c=y[-N_plot:],
edgecolors='none', cmap='RdBu')
plt.xlabel('u-g color')
plt.ylabel('g-r color')
plt.xlim(0.7, 1.4)
plt.ylim(-0.2, 0.4);
Blue points are RR Lyrae stars; red points are main sequence stars.
In [4]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
# Fit a linear support vector classifier with the default parameters
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred,
                            target_names=['MS star', 'RR Lyrae']))
In [5]:
y_test.shape
OK... this is not too good. With the default parameters, SVC labels everything as a main sequence star. Because main sequence stars vastly outnumber RR Lyrae in this sample, that strategy is 99% accurate, but we don't identify a single RR Lyrae star: zero contamination, but also zero completeness.
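To see this explicitly, we can look at the confusion matrix of the default classifier and compare it with the accuracy of always predicting "MS star" (a quick sketch; y_pred here is still the prediction from the default-parameter fit above):
from sklearn.metrics import confusion_matrix
# rows are the true classes, columns the predicted classes
print(confusion_matrix(y_test, y_pred))
# accuracy of the trivial strategy "label everything a main sequence star"
print(np.mean(y_test == 0))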
After reading around on the scikit-learn website, we might notice that there is a class_weight parameter to SVC. Let's try setting this to "balanced", which weights each class inversely to its frequency in the training data:
In [6]:
clf = SVC(kernel='linear', class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred,
                            target_names=['MS star', 'RR Lyrae']))
Better! We find a recall of 99% for RR Lyrae stars: we now identify almost all of them, but at the expense of a lot of contamination (the precision tells us that only 11% of the objects identified as RR Lyrae really are RR Lyrae stars).
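In astronomical language, the recall is the completeness and one minus the precision is the contamination. A minimal sketch of computing both directly, assuming label 1 marks the RR Lyrae stars:
# true positives, false positives, and false negatives for the RR Lyrae class
TP = np.sum((y_pred == 1) & (y_test == 1))
FP = np.sum((y_pred == 1) & (y_test == 0))
FN = np.sum((y_pred == 0) & (y_test == 1))
completeness = TP / float(TP + FN)    # the recall quoted above
contamination = FP / float(TP + FP)   # one minus the precision
print("completeness = {0:.2f}".format(completeness))
print("contamination = {0:.2f}".format(contamination))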
We could adjust the C parameter to try to fine-tune this (gamma would also matter if we switched to a non-linear kernel such as RBF):
In [7]:
C_values = 10.0 ** np.linspace(-2, 2, 5)
for C in C_values:
    print("C = {0:.2e}".format(C))
    clf = SVC(kernel='linear', class_weight='balanced', C=C)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred,
                                target_names=['MS star', 'RR Lyrae']))
We see that the fit is very sensitive to the value of $\log(C)$ – this is a general trait of support vector machines. We'll see later in the cross-validation section how to more rigorously analyze what's going on with these sorts of hyperparameters.
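For example, a cross-validated grid search over C lets us choose the hyperparameter from the training data alone, rather than by eyeballing test-set reports. A minimal sketch, assuming we score each C by the F1 score (which balances completeness against contamination):
from sklearn.model_selection import GridSearchCV
# coarse grid of C values; each is scored by 3-fold cross-validated F1
param_grid = {'C': 10.0 ** np.linspace(-2, 2, 5)}
grid = GridSearchCV(SVC(kernel='linear', class_weight='balanced'),
                    param_grid, scoring='f1', cv=3)
grid.fit(X_train, y_train)
print(grid.best_params_)
With the classification example in hand, we now turn to a regression problem: estimating galaxy redshifts from SDSS photometry (a simple "photometric redshift" estimator).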
In [8]:
from astroML.datasets import fetch_sdss_specgals
In [9]:
data = fetch_sdss_specgals()
# put magnitudes in a matrix
X = np.vstack([data['modelMag_%s' % f] for f in 'ugriz']).T
y = data['z']
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y)
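Before fitting, it's worth a quick look at the size of the sample and the range of redshifts we're trying to predict (nothing here beyond the arrays defined above):
print(X.shape)
print("redshift range: {0:.3f} to {1:.3f}".format(y.min(), y.max()))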
In [10]:
from sklearn.linear_model import LinearRegression
est = LinearRegression()
est.fit(X_train, y_train)
y_pred = est.predict(X_test)
In [11]:
plt.plot(y_test, y_pred, ',k')
plt.plot([0, 1], [0, 1], ':k')
plt.xlim(0, 0.6)
plt.ylim(0, 0.6)
print("rms = {0}".format(np.sqrt(np.mean((y_test - y_pred) ** 2))))
Let's see whether a more flexible, non-parametric model does better: a random forest regressor.
In [12]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
# for the sake of speed, train on every tenth object
model.fit(X_train[::10], y_train[::10])
y_pred = model.predict(X_test)
In [13]:
plt.plot(y_test, y_pred, ',k')
plt.plot([0, 1], [0, 1], ':k')
plt.xlim(0, 0.6)
plt.ylim(0, 0.6)
print("rms = {0}".format(np.sqrt(np.mean((y_test - y_pred) ** 2))))
This is a better rms (and, by eye, a much better fit) than we saw above with the simple linear regression.
Let's see how the rms changes as we change the depth of the trees:
In [14]:
max_depths = np.arange(1, 20)
rms_values = []
for max_depth in max_depths:
    model = RandomForestRegressor(random_state=0, max_depth=max_depth)
    model.fit(X_train[::10], y_train[::10])
    y_pred = model.predict(X_test)
    rms_values.append(np.sqrt(np.mean((y_test - y_pred) ** 2)))
In [15]:
plt.plot(max_depths, rms_values);
Apparently using a large max_depth leads to a better rms! But we should be careful here: as max_depth gets larger, our tendency to over-fit the data also becomes larger.
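One way to check for over-fitting is to compare the error on the training data with the error on the held-out test data as max_depth grows. A quick sketch reusing the loop above; a growing gap between the two curves is the signature of over-fitting:
train_rms = []
test_rms = []
for max_depth in max_depths:
    model = RandomForestRegressor(random_state=0, max_depth=max_depth)
    model.fit(X_train[::10], y_train[::10])
    # error on the data the model was trained on
    y_fit = model.predict(X_train[::10])
    train_rms.append(np.sqrt(np.mean((y_train[::10] - y_fit) ** 2)))
    # error on the held-out test data
    y_pred = model.predict(X_test)
    test_rms.append(np.sqrt(np.mean((y_test - y_pred) ** 2)))
plt.plot(max_depths, train_rms, label='training')
plt.plot(max_depths, test_rms, label='test')
plt.xlabel('max_depth')
plt.ylabel('rms')
plt.legend();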