In [3]:
import numpy as np

In [4]:
def _load_shuttle(path):
    """Read a whitespace-delimited data file into a feature matrix and labels.

    Each non-blank line is parsed as a sequence of floats. The last value on
    the line is the class label (cast to ``int``); all preceding values form
    the feature vector for that sample.

    Parameters
    ----------
    path : str
        Location of the data file to read.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The feature rows and the matching integer labels, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a token on a line cannot be parsed as a float.
    """
    features = []
    labels = []
    # The context manager closes the handle even if a line fails to parse;
    # a bare open(...) in the for-statement would leave it to the GC.
    with open(path) as handle:
        for raw_line in handle:
            # split() with no argument already ignores surrounding whitespace.
            values = [float(token) for token in raw_line.split()]
            if not values:
                # Blank line: there is no label to read, and values[-1]
                # would raise IndexError.
                continue
            features.append(values[:-1])
            labels.append(int(values[-1]))
    return np.array(features), np.array(labels)


X_train, Y_train = _load_shuttle('data/shuttle.trn')
X_test, Y_test = _load_shuttle('data/shuttle.tst')

In [5]:
# Report the size of the training matrix: one row per sample, one column
# per input dimension.
sample_count = X_train.shape[0]
print("Number of samples", sample_count)
dimension_count = X_train.shape[1]
print("Number of dimensions", dimension_count)


Number of samples 43500
Number of dimensions 9

In [6]:
import matplotlib.pyplot as plt

# Box plot of a single training feature to inspect its spread and outliers.
# The zero-based column index is named so the title stays in sync with the
# data actually plotted.
feature_idx = 8

fig, ax = plt.subplots()
ax.boxplot(X_train[:, feature_idx])
# Label the figure so it can be understood without reading the code.
ax.set_title("Training set: distribution of feature column %d" % feature_idx)
ax.set_ylabel("Feature value")
plt.show()



In [7]:
# The labels are discrete integers, so count each class exactly instead of
# letting plt.hist spread them over its default 10 equal-width bins, whose
# edges do not line up with the class values and leave spurious gaps.
class_labels, class_counts = np.unique(Y_train, return_counts=True)

fig, ax = plt.subplots()
ax.bar(class_labels, class_counts)
# One tick per class so every bar is identifiable.
ax.set_xticks(class_labels)
ax.set_xlabel("Class label")
ax.set_ylabel("Number of training samples")
ax.set_title("Class distribution in the training set")
plt.show()



In [8]:
# Fraction of training samples labelled as class 1, i.e. the accuracy a
# classifier would get on the training set by always predicting class 1.
class_one_count = int(np.sum(Y_train == 1))
total_count = len(Y_train)
print(class_one_count / total_count)


0.7840919540229885

In [9]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier on the training split and label the
# test split with it. fit() returns the estimator itself, so the calls chain.
clf = GaussianNB().fit(X_train, Y_train)
preds = clf.predict(X_test)

In [10]:
# Peek at the first ten predicted labels as a quick sanity check.
first_predictions = preds[0:10]
print(first_predictions)


[2 1 1 2 4 1 7 1 1 1]

In [11]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the current predictions, measured
# against the true test labels.
nb_report = classification_report(Y_test, preds)
print(nb_report)


             precision    recall  f1-score   support

          1       0.95      0.88      0.92     11478
          2       0.01      0.92      0.02        13
          3       0.11      0.59      0.19        39
          4       0.89      0.54      0.67      2155
          5       0.99      0.82      0.90       809
          6       0.40      1.00      0.57         4
          7       0.00      1.00      0.01         2

avg / total       0.94      0.83      0.88     14500


In [12]:
from sklearn.linear_model import LogisticRegression

# Fit an L2-regularised logistic regression with the newton-cg solver on the
# training split, then label the test split. Note that this rebinds `clf`
# and `preds`, replacing whatever model and predictions they held before.
logreg_settings = {"penalty": 'l2', "solver": "newton-cg"}
clf = LogisticRegression(**logreg_settings)
clf.fit(X_train, Y_train)
preds = clf.predict(X_test)

In [13]:
# Per-class precision, recall and F1 of the current predictions, measured
# against the true test labels.
logreg_report = classification_report(Y_test, preds)
print(logreg_report)


             precision    recall  f1-score   support

          1       0.94      0.99      0.96     11478
          2       0.00      0.00      0.00        13
          3       0.00      0.00      0.00        39
          4       0.90      0.65      0.75      2155
          5       1.00      1.00      1.00       809
          6       0.00      0.00      0.00         4
          7       0.00      0.00      0.00         2

avg / total       0.93      0.94      0.93     14500

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [ ]: