In this notebook we will look at building machine learning models to predict pulsar candidates. The data comes from Rob Lyon at the University of Manchester and is publicly available. For more information check out https://figshare.com/articles/HTRU2/3080389/1
In [1]:
# Data handling
import pandas as pd
# Plotting
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
plt.rcParams['figure.figsize'] = (7.0, 7.0)
# Some preprocessing utilities
from sklearn.model_selection import train_test_split # Data splitting
from sklearn.utils import shuffle
# The different classifiers
from sklearn.neighbors import KNeighborsClassifier # Nearest Neighbor - Analogizer
from sklearn.naive_bayes import GaussianNB # Bayesian Classifier - Bayesian
from sklearn.neural_network import MLPClassifier # Neural Network - Connectionist
# Model result function
from sklearn.metrics import classification_report,accuracy_score
In [2]:
data = pd.read_csv('Data/pulsar.csv')
# Show some information
print ('Dataset has %d rows and %d columns including features and labels'%(data.shape[0],data.shape[1]))
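Before fitting anything, it is worth a quick look at the raw features. A minimal, optional sketch using pandas' built-in summaries (assuming `data` was loaded as above):
In [ ]:
# Peek at the first rows and the summary statistics of each feature
print(data.head())
print(data.describe())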
In [3]:
# Feature column names (the last column is the class label)
print(data.columns.values[0:-1])
In [4]:
ax = plt.figure().add_subplot(projection='3d')
ax.scatter3D(data['std_pf'], data['mean_dm'], data['mean_int_pf'],c=data['class'],alpha=.25)
ax.set_xlabel('std_pf')
ax.set_ylabel('mean_dm')
ax.set_zlabel('mean_int_pf')
Out[4]:
[3D scatter plot of std_pf vs mean_dm vs mean_int_pf, colored by class]
In [5]:
# Let's shuffle the rows of the data 10 times
for i in range(10):
    data = shuffle(data)
# Now split the dataset into separate variables for features and labels
features = data.loc[:, data.columns != 'class'].values # All columns except class
labels = data['class'].values # Class labels
In [6]:
# Do a 70 - 30 split of the whole data for training and testing
# The last argument specifies the fraction of samples for testing
train_data,test_data,train_labels,test_labels = train_test_split(features,labels,test_size=.3)
#Print some info
print ('Number of training data points : %d'%(train_data.shape[0]))
print ('Number of testing data points : %d'%(test_data.shape[0]))
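The classes in this dataset are heavily imbalanced (far fewer pulsars than non-pulsars), so a stratified split keeps the class proportions the same in both sets. An optional variant of the split above, using the standard `stratify` and `random_state` arguments of `train_test_split`:
In [ ]:
# Optional: stratified, reproducible 70 - 30 split
train_data, test_data, train_labels, test_labels = train_test_split(
    features, labels, test_size=.3, stratify=labels, random_state=42)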
We will be using the following algorithms: K-nearest neighbors (an analogizer), Gaussian naive Bayes (a Bayesian learner), and a multilayer perceptron neural network (a connectionist learner).
In [7]:
# K nearest neighbor
knn = KNeighborsClassifier()
knn.fit(train_data,train_labels)
Out[7]:
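The classifier above uses the default n_neighbors=5. If you want to check whether another k works better, a small grid search is one option; a hedged sketch using `GridSearchCV` from `sklearn.model_selection`:
In [ ]:
# Optional: 5-fold cross-validated search over k
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(KNeighborsClassifier(), {'n_neighbors': [3, 5, 7, 9, 11]}, cv=5)
grid.fit(train_data, train_labels)
print('Best k: %d' % grid.best_params_['n_neighbors'])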
In [8]:
# Naive Bayes
nb = GaussianNB()
nb.fit(train_data,train_labels)
Out[8]:
In [9]:
# MLP
mlp = MLPClassifier(solver='sgd',hidden_layer_sizes=(5, 1))
mlp.fit(train_data,train_labels)
Out[9]:
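MLPs trained with SGD are sensitive to feature scale, and these features span very different ranges, so convergence may suffer without scaling. One optional remedy (not used above) is to standardize the features inside a pipeline; a minimal sketch using `StandardScaler` and `make_pipeline`:
In [ ]:
# Optional: standardize the features before the MLP sees them
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
scaled_mlp = make_pipeline(StandardScaler(),
                           MLPClassifier(solver='sgd', hidden_layer_sizes=(5, 1)))
scaled_mlp.fit(train_data, train_labels)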
In [10]:
# Pretty function to test a model and print accuracy score
def evaluate(model, modelname, test_data, test_labels):
    predictions = model.predict(test_data) # Do the actual prediction
    print('====================================================')
    print('Classification Report for %s' % modelname)
    print('====================================================')
    print(classification_report(test_labels, predictions, target_names=['Non Pulsar', 'Pulsar']))
    print('\nThe model is %.2f%% accurate' % (accuracy_score(test_labels, predictions)*100))
    print('====================================================\n\n')
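Accuracy can look flattering on imbalanced data, so it also helps to check the confusion matrix, which separates false positives from false negatives. A small optional addition using `sklearn.metrics.confusion_matrix`, shown here for the KNN model:
In [ ]:
from sklearn.metrics import confusion_matrix
# Rows are true classes, columns are predicted classes
print(confusion_matrix(test_labels, knn.predict(test_data)))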
In [11]:
# Collect the models and their names so we can evaluate them in a loop
models = [knn, nb, mlp]
model_names = ['KNN', 'Naive Bayes', 'Neural Network']
In [12]:
for model, name in zip(models, model_names):
    evaluate(model, name, test_data, test_labels)
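Once a model is chosen, scoring a new candidate is a single predict call. A hedged example using the trained KNN on one held-out row (assuming classes are labelled 0 for non-pulsar and 1 for pulsar):
In [ ]:
# Score one held-out candidate with the trained KNN
candidate = test_data[0].reshape(1, -1) # predict expects a 2D array
print('Predicted class: %d' % knn.predict(candidate)[0])
print('P(pulsar): %.3f' % knn.predict_proba(candidate)[0, 1])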