In [3]:
import numpy as np
In [4]:
def _load_shuttle_split(path):
    """Read one whitespace-delimited data file into features and labels.

    Every non-blank line is parsed as a sequence of floats. All values but
    the last form one feature row; the last value is converted to an int
    and used as that row's label.

    Parameters
    ----------
    path : str
        Location of the text file to read.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The feature rows and the integer labels, both in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a token on a line cannot be parsed as a float.
    """
    features = []
    labels = []
    # The context manager closes the handle as soon as parsing finishes
    # instead of leaving it open for the lifetime of the kernel.
    with open(path) as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if not fields:
                # A blank (e.g. trailing) line has no label to index.
                continue
            values = [float(field) for field in fields]
            features.append(values[:-1])
            labels.append(int(values[-1]))
    return np.array(features), np.array(labels)


# Both splits share the same on-disk layout, so one helper loads them both.
X_train, Y_train = _load_shuttle_split('data/shuttle.trn')
X_test, Y_test = _load_shuttle_split('data/shuttle.tst')
In [5]:
# Report the size of the training matrix along each axis: rows, then columns.
for caption, axis in (("Number of samples", 0), ("Number of dimensions", 1)):
    print(caption, X_train.shape[axis])
In [6]:
import matplotlib.pyplot as plt
# Box plot of the training column at index 8, to eyeball its spread.
feature_8 = X_train[:, 8]
plt.figure()
plt.boxplot(feature_8)
plt.show()
In [7]:
# Y_train holds integer class labels, so count each label exactly and draw
# one bar per class. A histogram with the default 10 equal-width bins puts
# its bin edges between the labels, which makes the per-class frequencies
# hard to read off.
class_labels, class_counts = np.unique(Y_train, return_counts=True)

fig, ax = plt.subplots()
ax.bar(class_labels, class_counts)
ax.set_xticks(class_labels)  # one tick per class actually present
ax.set_xlabel("Class label")
ax.set_ylabel("Number of training samples")
ax.set_title("Class distribution in the training set")
plt.show()
In [8]:
# Share of training samples whose label is 1.
class_one_count = np.count_nonzero(Y_train == 1)
print(float(class_one_count) / len(Y_train))
In [9]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian naive Bayes classifier on the training split, then label
# every row of the test split. fit() returns the estimator itself, so the
# construction and fitting can be chained.
clf = GaussianNB().fit(X_train, Y_train)
preds = clf.predict(X_test)
In [10]:
# Peek at the first ten predicted labels as a quick sanity check.
first_ten_preds = preds[:10]
print(first_ten_preds)
In [11]:
from sklearn.metrics import classification_report
# Compare the current predictions against the true test labels.
report = classification_report(Y_test, preds)
print(report)
In [12]:
from sklearn.linear_model import LogisticRegression
# Replace the previous model with an L2-penalised logistic regression fitted
# by the newton-cg solver; `clf` and `preds` are overwritten so that the
# following report cell evaluates this model.
clf = LogisticRegression(solver="newton-cg", penalty='l2').fit(X_train, Y_train)
preds = clf.predict(X_test)
In [13]:
# Compare the current predictions against the true test labels.
report = classification_report(Y_test, preds)
print(report)
In [ ]: