[AstroHackWeek 2014, 2016- J. S. Bloom @profjsb]
scikit-learn on Sloan Digital Sky Survey (SDSS) data
This should work in both Python 2 and Python 3.
It's easiest to grab data from the SDSS SkyServer SQL server.
For example, here is a basic query to get two kinds of photometry (PSF and dereddened model magnitudes, both corrected for extinction) for 1000 QSO sources with spectroscopic redshifts:
SELECT *,
       dered_u - mag_u AS diff_u, dered_g - mag_g AS diff_g,
       dered_r - mag_r AS diff_g, dered_i - mag_i AS diff_i,
       dered_z - mag_z AS diff_z
FROM (
    SELECT TOP 1000 objid, ra, dec,
           dered_u, dered_g, dered_r, dered_i, dered_z,
           psfmag_u - extinction_u AS mag_u, psfmag_g - extinction_g AS mag_g,
           psfmag_r - extinction_r AS mag_r, psfmag_i - extinction_i AS mag_i,
           psfmag_z - extinction_z AS mag_z,
           z AS spec_z,
           dered_u - dered_g AS u_g_color, dered_g - dered_r AS g_r_color,
           dered_r - dered_i AS r_i_color, dered_i - dered_z AS i_z_color,
           class
    FROM SpecPhoto
    WHERE (class = 'QSO')
) AS sp
(Note that the r-band difference is accidentally aliased AS diff_g a second time; it ends up as the diff_g1 column in the saved CSV used below.)
Saving this and others like it as a csv, we can then start to make our data set for classification/regression.
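If you would rather submit the query from Python instead of the SkyServer web form, something like the following should work; this is a sketch that assumes the astroquery package (not used elsewhere in this notebook) is installed:
## sketch: fetch a small sample directly from SkyServer (assumes astroquery is installed)
from astroquery.sdss import SDSS
sql = "SELECT TOP 10 objid, ra, dec, z FROM SpecPhoto WHERE class = 'QSO'"
tab = SDSS.query_sql(sql)        # returns an astropy Table
tab.to_pandas().to_csv("qso_sample.csv")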
In [1]:
## get the data locally ... I put this on a gist
!curl -k -O https://gist.githubusercontent.com/anonymous/53781fe86383c435ff10/raw/4cc80a638e8e083775caec3005ae2feaf92b8d5b/qso10000.csv
!curl -k -O https://gist.githubusercontent.com/anonymous/2984cf01a2485afd2c3e/raw/964d4f52c989428628d42eb6faad5e212e79b665/star1000.csv
!curl -k -O https://gist.githubusercontent.com/anonymous/2984cf01a2485afd2c3e/raw/335cd1953e72f6c7cafa9ebb81b43c47cb757a9d/galaxy1000.csv
In [4]:
## Python 2/3 compatibility
from __future__ import absolute_import, division, print_function, unicode_literals
In [11]:
# For pretty plotting, pandas, sklearn
!conda install pandas seaborn matplotlib scikit-learn==0.17.1 -y
In [6]:
import copy
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['axes.labelsize'] = 20
import pandas as pd
pd.set_option('display.max_columns', None)
import seaborn as sns
sns.set()
In [7]:
pd.read_csv("qso10000.csv",index_col=0).head()
Out[7]:
Notice a few things about this dataset. First, RA and Dec are probably not something we want to use in making predictions: they just encode the object's position on the sky. Second, the magnitudes are highly covariant with the colors. So dumping all but one of the magnitudes is probably a good idea to avoid overfitting.
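A quick, optional way to see that covariance is to look at the correlation matrix of the magnitude and color columns; this check is not part of the original notebook:
## sanity check (optional): correlation between magnitudes and colors
qsos_raw = pd.read_csv("qso10000.csv", index_col=0)
mag_color_cols = ["dered_u", "dered_g", "dered_r", "dered_i", "dered_z",
                  "u_g_color", "g_r_color", "r_i_color", "i_z_color"]
sns.heatmap(qsos_raw[mag_color_cols].corr(), annot=True, fmt=".2f")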
In [14]:
usecols = [str(x) for x in ["objid","dered_r","spec_z","u_g_color","g_r_color","r_i_color",
"i_z_color","diff_u",\
"diff_g1","diff_i","diff_z"]]
qsos = pd.read_csv("qso10000.csv",index_col=0,
usecols=usecols)
qso_features = copy.copy(qsos)
qso_redshifts = qsos["spec_z"]
del qso_features["spec_z"]
qso_features.head()
Out[14]:
In [15]:
f, ax = plt.subplots()
bins = ax.hist(qso_redshifts.values)
ax.set_xlabel("redshift", fontsize=18)
ax.set_ylabel("N",fontsize=18)
Out[15]:
Pretty clearly a big cut at around $z=2$.
In [16]:
import matplotlib as mpl
import matplotlib.cm as cm
## truncate the color at z=2.5 just to keep some contrast.
norm = mpl.colors.Normalize(vmin=min(qso_redshifts.values), vmax=2.5)
cmap = cm.jet
m = cm.ScalarMappable(norm=norm, cmap=cmap)
rez = pd.scatter_matrix(qso_features[0:2000],
alpha=0.2,figsize=[15,15],color=m.to_rgba(qso_redshifts.values))
Egad. Some pretty crazy values for dered_r and g_r_color. Let's figure out why.
In [17]:
min(qso_features["dered_r"].values)
Out[17]:
Looks like there are some missing values in the catalog which are set at -9999. Let's zoink those from the dataset for now.
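Before cutting, it's worth counting how many rows are affected; a quick check (not in the original notebook):
## how many rows carry the -9999 missing-value sentinel in dered_r?
print((qso_features["dered_r"] <= -9999).sum(), "rows flagged")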
In [18]:
qsos = pd.read_csv("qso10000.csv",index_col=0,
usecols=usecols)
qsos = qsos[(qsos["dered_r"] > -9999) & (qsos["g_r_color"] > -10) & (qsos["g_r_color"] < 10)]
qso_features = copy.copy(qsos)
qso_redshifts = qsos["spec_z"]
del qso_features["spec_z"]
In [19]:
rez = pd.scatter_matrix(qso_features[0:2000], alpha=0.2,figsize=[15,15],\
color=m.to_rgba(qso_redshifts.values))
Ok. This looks pretty clean. Let's save this for future use.
In [20]:
qsos.to_csv("qsos.clean.csv")
In [21]:
X = qso_features.values # 9-d feature space
Y = qso_redshifts.values # redshifts
In [22]:
print("feature vector shape=", X.shape)
print("class shape=", Y.shape)
In [23]:
# split the data in half: first half for training, second half for testing
half = len(Y) // 2
train_X = X[:half]
train_Y = Y[:half]
test_X = X[half:]
test_Y = Y[half:]
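As an aside, the same kind of split can be done (shuffled) with scikit-learn's train_test_split, which we use again later for the classifier. A sketch for reference; the cells below keep the simple first-half/second-half split:
## equivalent shuffled 50/50 split (for reference only)
from sklearn.cross_validation import train_test_split
train_X2, test_X2, train_Y2, test_Y2 = train_test_split(X, Y, test_size=0.5, random_state=0)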
Linear Regression
In [24]:
from sklearn import linear_model
clf = linear_model.LinearRegression()
In [ ]:
clf.   # put the cursor after the dot and hit <TAB> to explore the estimator's methods
In [25]:
# fit the model
clf.fit(train_X, train_Y)
Out[25]:
In [26]:
# now do the prediction
Y_lr_pred = clf.predict(test_X)
# how well did we do?
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(test_Y, Y_lr_pred) ; print("MSE", mse)
In [27]:
plt.plot(test_Y,Y_lr_pred - test_Y,'o',alpha=0.1)
plt.title("Linear Regression Residuals - MSE = %.1f" % mse)
plt.xlabel("Spectroscopic Redshift")
plt.ylabel("Residual")
plt.hlines(0,min(test_Y),max(test_Y),color="red")
Out[27]:
In [28]:
# for comparison: the MSE you'd get by always guessing the AVERAGE value (i.e., the variance of train_Y)
print("naive mse", ((1./len(train_Y))*(train_Y - train_Y.mean())**2).sum())
In [29]:
mean_squared_error?
k-Nearest Neighbor (KNN) Regression
In [30]:
from sklearn import neighbors
from sklearn import preprocessing
X_scaled = preprocessing.scale(X) # many methods work better on scaled X
clf1 = neighbors.KNeighborsRegressor(10)
train_X = X_scaled[:half]
test_X = X_scaled[half:]
clf1.fit(train_X,train_Y)
Out[30]:
In [31]:
Y_knn_pred = clf1.predict(test_X)
mse = mean_squared_error(test_Y,Y_knn_pred) ; print("MSE (KNN)", mse)
plt.plot(test_Y, Y_knn_pred - test_Y,'o',alpha=0.2)
plt.title("k-NN Residuals - MSE = %.1f" % mse)
plt.xlabel("Spectroscopic Redshift")
plt.ylabel("Residual")
plt.hlines(0,min(test_Y),max(test_Y),color="red")
Out[31]:
In [32]:
from sklearn import neighbors
from sklearn import preprocessing
X_scaled = preprocessing.scale(X) # many methods work better on scaled X
train_X = X_scaled[:half]
train_Y = Y[:half]
test_X = X_scaled[half:]
test_Y = Y[half:]
clf1 = neighbors.KNeighborsRegressor(5)
clf1.fit(train_X,train_Y)
Out[32]:
In [33]:
Y_knn_pred = clf1.predict(test_X)
mse = mean_squared_error(test_Y,Y_knn_pred) ; print("MSE=",mse)
plt.scatter(test_Y, Y_knn_pred - test_Y,alpha=0.2)
plt.title("k-NN Residuals - MSE = %.1f" % mse)
plt.xlabel("Spectroscopic Redshift")
plt.ylabel("Residual")
plt.hlines(0,min(test_Y),max(test_Y),color="red")
Out[33]:
Random Forest Regression
A pretty good intro: http://blog.yhathq.com/posts/random-forests-in-python.html
In [34]:
from sklearn.ensemble import RandomForestRegressor
clf2 = RandomForestRegressor(n_estimators=100,
criterion='mse', max_depth=None,
min_samples_split=2, min_samples_leaf=1,
max_features='auto', max_leaf_nodes=None,
bootstrap=True, oob_score=False, n_jobs=1,
random_state=None, verbose=0, warm_start=False)
clf2.fit(train_X,train_Y)
Out[34]:
In [35]:
Y_rf_pred = clf2.predict(test_X)
mse = mean_squared_error(test_Y,Y_rf_pred) ; print("MSE",mse)
plt.scatter(test_Y, Y_rf_pred - test_Y,alpha=0.2)
plt.title("RF Residuals - MSE = %.1f" % mse)
plt.xlabel("Spectroscopic Redshift")
plt.ylabel("Residual")
plt.hlines(0,min(test_Y),max(test_Y),color="red")
Out[35]:
In [36]:
from sklearn import cross_validation
In [37]:
from sklearn import linear_model
clf = linear_model.LinearRegression()
from sklearn.cross_validation import cross_val_score
def print_cv_score_summary(model, xx, yy, cv):
    scores = cross_val_score(model, xx, yy, cv=cv, n_jobs=1)
    print("mean: {:.3f}, stdev: {:.3f}".format(
        np.mean(scores), np.std(scores)))
In [38]:
print_cv_score_summary(clf,X,Y,cv=cross_validation.KFold(len(Y), 5))
In [39]:
print_cv_score_summary(clf,X,Y,
cv=cross_validation.KFold(len(Y),10,shuffle=True,random_state=1))
In [40]:
print_cv_score_summary(clf2,X,Y,
cv=cross_validation.KFold(len(Y),3,shuffle=True,random_state=1))
In [43]:
usecols = [str(x) for x in ["objid","dered_r","u_g_color","g_r_color","r_i_color","i_z_color","diff_u",\
"diff_g1","diff_i","diff_z","class"]]
all_sources = pd.read_csv("qso10000.csv",index_col=0,usecols=usecols)[:1000]
all_sources = all_sources.append(pd.read_csv("star1000.csv",index_col=0,usecols=usecols))
all_sources = all_sources.append(pd.read_csv("galaxy1000.csv",index_col=0,usecols=usecols))
all_sources = all_sources[(all_sources["dered_r"] > -9999) & (all_sources["g_r_color"] > -10) & (all_sources["g_r_color"] < 10)]
all_features = copy.copy(all_sources)
all_label = all_sources["class"]
del all_features["class"]
X = copy.copy(all_features.values)
Y = copy.copy(all_label.values)
In [44]:
all_sources.tail()
Out[44]:
In [45]:
print("feature vector shape=", X.shape)
print("class shape=", Y.shape)
In [46]:
Y[Y=="QSO"] = 0
Y[Y=="STAR"] = 1
Y[Y=="GALAXY"] = 2
Y = list(Y)
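The same encoding can be produced with scikit-learn's LabelEncoder; note it assigns the integers alphabetically (GALAXY=0, QSO=1, STAR=2), so the codes differ from the manual mapping above. A sketch for reference:
## equivalent (alphabetical) label encoding, for reference
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Y_int = le.fit_transform(all_label.values)
print(le.classes_)   # ['GALAXY' 'QSO' 'STAR']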
Let's look at a random forest classifier.
In [47]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=200,oob_score=True)
clf.fit(X,Y)
Out[47]:
What are the important features in the data?
In [48]:
sorted(zip(all_features.columns.values, clf.feature_importances_), key=lambda q: q[1], reverse=True)
Out[48]:
In [49]:
clf.oob_score_  ## "out-of-bag" (OOB) accuracy estimate
Out[49]:
In [52]:
import numpy as np
from sklearn import svm, datasets
cmap = cm.jet_r
# import some data to play with
plt.figure(figsize=(10,10))
X = all_features.values[:, 1:3] # use only two features for training and plotting purposes
h = 0.02 # step size in the mesh
# we create an instance of SVM and fit our data. We do not scale our
# data since we want to plot the support vectors
C = 1.0 # SVM regularization parameter
svc = svm.SVC(kernel=str('linear'), C=C).fit(X, Y)
rbf_svc = svm.SVC(kernel=str('rbf'), gamma=0.7, C=C).fit(X, Y)
poly_svc = svm.SVC(kernel=str('poly'), degree=3, C=C).fit(X, Y)
lin_svc = svm.LinearSVC(C=C).fit(X, Y)
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# title for the plots
titles = ['SVC with linear kernel',
'SVC with RBF kernel',
'SVC with polynomial (degree 3) kernel',
'LinearSVC (linear kernel)']
norm = mpl.colors.Normalize(vmin=min(Y), vmax=max(Y))
m = cm.ScalarMappable(norm=norm, cmap=cmap)
for i, clf in enumerate((svc, rbf_svc, poly_svc, lin_svc)):
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max] x [y_min, y_max].
    plt.subplot(2, 2, i + 1)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=cm.Paired)
    plt.axis('off')
    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=m.to_rgba(Y), cmap=cm.Paired)
    plt.title(titles[i])
In [53]:
# fit a support vector machine classifier
from sklearn import grid_search
from sklearn import svm
from sklearn import metrics
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(levelname)s %(message)s')
# instantiate the SVM object
sdss_svm = svm.SVC()
X = all_features.values
Y = all_label.values
# parameter values over which we will search
parameters = {'kernel':(str('linear'), str('rbf')), \
'gamma':[0.5, 0.3, 0.1, 0.01],
'C':[0.1, 2, 4, 5, 10, 20,30]}
#parameters = {'kernel':('linear', 'rbf')}
# do a grid search to find the highest 3-fold CV zero-one score
svm_tune = grid_search.GridSearchCV(sdss_svm, parameters,\
n_jobs = -1, cv = 3,verbose=1)
svm_opt = svm_tune.fit(X, Y)
# print the best score and estimator
print(svm_opt.best_score_)
print(svm_opt.best_estimator_)
In [54]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0)
classifier = svm.SVC(**svm_opt.best_estimator_.get_params())
y_pred = classifier.fit(X_train, y_train).predict(X_test)
# Compute confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)
# Plot the confusion matrix (named conf_mat so we don't shadow matplotlib.cm)
plt.matshow(conf_mat)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
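A per-class precision/recall summary of the same hold-out predictions can make the confusion matrix easier to read; for example:
## per-class precision, recall, and F1 on the hold-out set
print(metrics.classification_report(y_test, y_pred))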
In [55]:
# instantiate the RF learning object
sdss_rf = RandomForestClassifier()
X = all_features.values
Y = all_label.values
# parameter values over which we will search
parameters = {'n_estimators':(10,50,200),"max_features": ["auto",3,5],
'criterion':[str("gini"),str("entropy")],"min_samples_leaf": [1,2]}
#parameters = {'kernel':('linear', 'rbf')}
# do a grid search to find the highest 3-fold CV zero-one score
rf_tune = grid_search.GridSearchCV(sdss_rf, parameters,\
n_jobs = -1, cv = 3,verbose=1)
rf_opt = rf_tune.fit(X, Y)
# print the best score and estimator
print(rf_opt.best_score_)
print(rf_opt.best_estimator_)
In [56]:
clf.get_params()
Out[56]:
In [57]:
svm_opt.best_estimator_.get_params()
Out[57]:
In [96]:
grid_search.GridSearchCV?
In [58]:
import time
start = time.time()
## this takes about 30 seconds
# instantiate the RF learning object
sdss_rf = RandomForestClassifier()
X = all_features.values
Y = all_label.values
# parameter values over which we will search
parameters = {'n_estimators':(10,50,200),"max_features": ["auto",3,5],
'criterion':["gini","entropy"],"min_samples_leaf": [1,2]}
#parameters = {'kernel':('linear', 'rbf')}
# do a grid search to find the highest 3-fold CV zero-one score
rf_tune = grid_search.GridSearchCV(sdss_rf, parameters,\
n_jobs = -1, cv = 3,verbose=1)
rf_opt = rf_tune.fit(X, Y)
# print the best score and estimator
print(rf_opt.best_score_)
print(rf_opt.best_estimator_)
print("total time in seconds",time.time()- start)
Let's do this without a full grid search, using a randomized search over the same parameters...
In [59]:
import time
start = time.time()
# instantiate the RF learning object
sdss_rf = RandomForestClassifier()
X = all_features.values
Y = all_label.values
# parameter values over which we will search
parameters = {'n_estimators':(10,50,200),"max_features": ["auto",3,5],
'criterion':["gini","entropy"],"min_samples_leaf": [1,2]}
#parameters = {'kernel':('linear', 'rbf')}
# do a grid search to find the highest 3-fold CV zero-one score
rf_tune = grid_search.RandomizedSearchCV(sdss_rf, parameters,\
n_jobs = -1, cv = 3,verbose=1)
rf_opt = rf_tune.fit(X, Y)
# print the best score and estimator
print(rf_opt.best_score_)
print(rf_opt.best_estimator_)
print("total time in seconds",time.time()- start)
In [14]:
!conda install dask distributed -y
In [60]:
import os
myhome = os.getcwd()
In [61]:
os.environ["PYTHONPATH"] = myhome + "/dask-learn"
In [62]:
!git clone https://github.com/dask/dask-learn.git
%cd dask-learn
!git pull
!python setup.py install
## return to the notebook's original directory so the relative CSV paths below still work
%cd $myhome
In [63]:
from dklearn.grid_search import GridSearchCV as DaskGridSearchCV
In [64]:
import time
start = time.time()
# instantiate the RF learning object
sdss_rf = RandomForestClassifier()
X = all_features.values
Y = all_label.values
# parameter values over which we will search
parameters = {'n_estimators':(10,50,200),"max_features": ["auto",3,5],
'criterion':["gini","entropy"],"min_samples_leaf": [1,2]}
#parameters = {'kernel':('linear', 'rbf')}
# do a grid search to find the highest 3-fold CV zero-one score
rf_tune = DaskGridSearchCV(sdss_rf, parameters,\
cv = 3)
rf_opt = rf_tune.fit(X, Y)
# print the best score and estimator
print(rf_opt.best_score_)
print(rf_opt.best_estimator_)
print("total time in seconds",time.time()- start)
In [17]:
#To do distributed:
#from distributed import Executor
#executor = Executor()
#executor
Out[17]:
It's often of interest to find patterns in the data that you didn't know were there, either as an end in itself or as a starting point for further exploration.
One approach is to look at individual sources that are misclassified.
In [176]:
usecols = [str(x) for x in ["objid","dered_r","u_g_color","g_r_color","r_i_color","i_z_color","diff_u",\
"diff_g1","diff_i","diff_z","class"]]
all_sources = pd.read_csv("qso10000.csv",index_col=0,usecols=usecols)[:1000]
all_sources = all_sources.append(pd.read_csv("star1000.csv",index_col=0,usecols=usecols))
all_sources = all_sources.append(pd.read_csv("galaxy1000.csv",index_col=0,usecols=usecols))
all_sources = all_sources[(all_sources["dered_r"] > -9999) & (all_sources["g_r_color"] > -10) & (all_sources["g_r_color"] < 10)]
all_features = copy.copy(all_sources)
all_label = all_sources["class"]
del all_features["class"]
X = copy.copy(all_features.values)
Y = copy.copy(all_label.values)
# instantiate the RF learning object
sdss_rf = RandomForestClassifier()
X = all_features.values
Y = all_label.values
# parameter values over which we will search
parameters = {'n_estimators':(100,),"max_features": ["auto",3,4],
'criterion':["entropy"],"min_samples_leaf": [1,2]}
# do a grid search to find the highest 5-fold CV zero-one score
rf_tune = grid_search.GridSearchCV(sdss_rf, parameters,\
n_jobs = -1, cv = 5,verbose=1)
rf_opt = rf_tune.fit(X, Y)
# print the best score and estimator
print(rf_opt.best_score_)
print(rf_opt.best_estimator_)
In [177]:
probs = rf_opt.best_estimator_.predict_proba(X)
print(rf_opt.best_estimator_.classes_)
for i in range(probs.shape[0]):
    if rf_opt.best_estimator_.classes_[np.argmax(probs[i,:])] != Y[i]:
        print("Label={0:6s}".format(Y[i]), end=" ")
        print("Pgal={0:0.3f} Pqso={1:0.3f} Pstar={2:0.3f}".format(probs[i,0], probs[i,1], probs[i,2]), end=" ")
        print("http://skyserver.sdss.org/dr12/en/tools/quicklook/summary.aspx?id=" + str(all_sources.index[i]))
We can also use manifold learning to project the structure onto lower dimensions.
In [178]:
from sklearn import (manifold, datasets, decomposition, ensemble,
discriminant_analysis, random_projection)
In [179]:
rp = random_projection.SparseRandomProjection(n_components=2, density=0.3, random_state=1)
X_projected = rp.fit_transform(X)
In [180]:
Y[Y=="QSO"] = 0
Y[Y=="STAR"] = 1
Y[Y=="GALAXY"] = 2
In [181]:
Yi = Y.astype(np.int64)
In [182]:
plt.title("Manifold Sparse Random Projection")
plt.scatter(X_projected[:, 0], X_projected[:, 1],c=plt.cm.Set1(Yi / 3.),alpha=0.2,
edgecolor='none',s=5*(X[:,0] - np.min(X[:,0])))
Out[182]:
In [183]:
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
X_mds = clf.fit_transform(X)
In [184]:
plt.title("MDS Projection")
plt.scatter(X_mds[:, 0], X_mds[:, 1],c=plt.cm.Set1(Yi / 3.),alpha=0.3,
s=5*(X[:,0] - np.min(X[:,0])))
Out[184]:
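Other manifold learners in scikit-learn follow the same fit_transform pattern. For instance, here is a t-SNE projection of the same features; a sketch only (not run in the original notebook, and it can be slow on a few thousand sources):
## t-SNE projection of the combined QSO/star/galaxy feature set
tsne = manifold.TSNE(n_components=2, random_state=1)
X_tsne = tsne.fit_transform(X)
plt.title("t-SNE Projection")
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=plt.cm.Set1(Yi / 3.), alpha=0.3, edgecolor='none')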
In [ ]: