In [1]:
import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn
import sys
from matplotlib import pyplot as plt
%matplotlib inline
In [2]:
from sklearn import linear_model
from sklearn.metrics import precision_score, recall_score, f1_score
In [3]:
# Linear SVM (hinge loss, L2 penalty) trained by stochastic gradient descent.
# NOTE(review): `n_iter` was renamed `max_iter` in scikit-learn >= 0.19 and
# removed in 0.21 — confirm the installed version still accepts it.  With
# partial_fit (used below) each call does a single pass over the given chunk,
# so this setting presumably has no effect there.
clf = linear_model.SGDClassifier(
    loss='hinge', penalty='l2',
    alpha=0.01, l1_ratio=0.15,
    fit_intercept=True, n_iter=100000,
    shuffle=False, verbose=0, epsilon=0.1,
    n_jobs=1, random_state=None, learning_rate='optimal',
    eta0=0.0, power_t=0.5, class_weight=None)
In [3]:
# Load the training labels: one integer class id per row, space-separated,
# no header row.  (read_csv with an explicit sep behaves like read_table.)
y_all = pandas.read_csv('../data/label_tr.txt', sep=' ', header=None)
y_all.head()
Out[3]:
In [4]:
# Stream the training matrix in chunks and accumulate sum(x) and sum(x^2)
# per feature, so mean and standard deviation can be computed without
# loading the whole file into memory.
ndim = pandas.read_table('../data/data_tr.txt', sep=' ', header=None, nrows=3).shape[1]
print('ndim = %d'%ndim)
sx = np.zeros(shape=ndim, dtype=float)    # running sum of x per feature
ssx = np.zeros(shape=ndim, dtype=float)   # running sum of x**2 per feature
chunks = 50000
nt = 0                                    # total number of rows seen
for df in pandas.read_table('../data/data_tr.txt', sep=' ', header=None, iterator=True, chunksize=chunks):
    nt += df.shape[0]
    sx += np.sum(df, axis=0)
    ssx += np.sum(df**2, axis=0)
mean_x = sx / float(nt)
# Standard deviation via Var[x] = E[x^2] - E[x]^2.
# BUG FIX: the old expression (ssx - 2*sx*mean_x + mean_x**2)/nt dropped a
# factor of nt on the mean_x**2 term (the correct expansion is
# (ssx - 2*sx*mean_x + nt*mean_x**2)/nt), so the stored "sigma" was wrong.
# Note the name var_x actually holds the standard deviation (sqrt applied),
# consistent with its use as the 'sigma' column and as a divisor below.
var_x = np.sqrt(ssx / float(nt) - mean_x**2)
In [8]:
# Persist the per-feature statistics so later sessions can standardize
# the data without re-scanning the training file.
featstat = pandas.DataFrame(data={'mean': mean_x, 'sigma': var_x})
print(featstat.shape)
featstat.to_csv('../data/feat_stats.csv')   # ',' is the default separator
featstat.head()
Out[8]:
In [10]:
# Train a one-vs-rest SGD classifier for class `ic`, streaming the training
# matrix in chunks and standardizing each chunk with the statistics computed
# above (mean_x / var_x).
featstat = pandas.read_csv('../data/feat_stats.csv')
print(featstat.head())
ic = 136
# Binary target: +1 for class `ic`, -1 for every other class.
y = np.empty(shape=y_all.shape[0], dtype=int)
y[np.where(y_all[0] != ic)[0]] = -1
y[np.where(y_all[0] == ic)[0]] = 1
print(y.shape, np.sum(y==1), np.sum(y==-1))
chunks=100000
for i in range(1):  # number of epochs over the full file
    sys.stdout.write('%d '%(i))
    n = 0
    for df in pandas.read_table('../data/data_tr.txt', sep=' ', header=None, iterator=True, chunksize=chunks):
        # Label slice aligned with this chunk of rows.
        n0, n1 = n*chunks, (n+1)*chunks
        if n1 > y.shape[0]:
            # BUG FIX: clamp the end index to len(y).  The old code set
            # n1 = y.shape[0] - n0 (a length, not an index), so the final
            # ysub was the wrong slice and misaligned with the last chunk.
            n1 = y.shape[0]
        ysub = y[n0:n1]
        df = (df - mean_x) / var_x   # standardize features
        clf.partial_fit(df, ysub, classes=[1,-1], sample_weight=None)
        n += 1
In [33]:
### Reading cross-validation set
# Binarize the CV labels the same way as the training labels:
# +1 for class `ic`, -1 for everything else.
Xcv = pandas.read_table('../data/data_cv.txt', sep=' ', header=None)
print(Xcv.shape)
ycv = pandas.read_table('../data/label_cv.txt', sep=' ', header=None)[0].values
ycv = np.where(ycv == ic, 1, -1)
print(Xcv.shape, ycv.shape, np.sum(ycv == 1))
In [8]:
# BUG FIX: the classifier was trained on standardized chunks
# ((df - mean_x) / var_x), so the CV matrix must be standardized with the
# same statistics before predicting; feeding raw Xcv gives meaningless scores.
Xcv_norm = (Xcv - mean_x) / var_x
ypred_cv = clf.predict(Xcv_norm)
np.sum(ypred_cv == 1)
Out[8]:
In [9]:
# Precision / recall / F1 on the CV set (positive class is +1 by default).
from sklearn.metrics import precision_score, recall_score, f1_score
prec, rec, f1score = (metric(ycv, ypred_cv)
                      for metric in (precision_score, recall_score, f1_score))
print(prec, rec, f1score)
In [26]:
# L2-regularized logistic regression (liblinear solver) with strong
# regularization (C = 0.001) and a tight convergence tolerance.
# This rebinds `clf`, replacing the SGD classifier defined earlier.
clf = linear_model.LogisticRegression(
    penalty='l2', dual=False, tol=0.00001, C=0.001,
    fit_intercept=True, intercept_scaling=1, class_weight=None,
    random_state=None, solver='liblinear', max_iter=10000,
    multi_class='ovr')
In [27]:
# One-vs-rest logistic regression for class `ic`.
# NOTE: LogisticRegression has no partial_fit, so the chunk loop is broken
# out of after the first clf.fit — the model is trained on the FIRST chunk
# only and never sees the rest of the training file.
#featstat = pandas.read_csv('../data/feat_stats.csv')
print(featstat.head())
ic = 39
# Binary target: +1 for class `ic`, -1 for every other class.
y = np.empty(shape=y_all.shape[0], dtype=int)
y[np.where(y_all[0] != ic)[0]] = -1
y[np.where(y_all[0] == ic)[0]] = 1
print(y.shape, np.sum(y==1), np.sum(y==-1))
chunks=100000
for i in range(1):
    sys.stdout.write('%d '%(i))
    n = 0
    for df in pandas.read_table('../data/data_tr.txt', sep=' ', header=None, iterator=True, chunksize=chunks):
        n0, n1 = n*chunks, (n+1)*chunks
        if n1 > y.shape[0]:
            # BUG FIX: clamp the end index to len(y).  The old
            # n1 = y.shape[0] - n0 was a length, not an index, so the
            # last label slice would have been misaligned.
            n1 = y.shape[0]
        ysub = y[n0:n1]
        # Standardize with the saved per-feature statistics.
        df = (df - featstat['mean']) / featstat['sigma']
        clf.fit(df, ysub)
        n += 1
        break
In [16]:
### Reading cross-validation set
# Binarize the CV labels: +1 for class `ic`, -1 for everything else.
Xcv = pandas.read_table('../data/data_cv.txt', sep=' ', header=None)
print(Xcv.shape)
ycv = pandas.read_table('../data/label_cv.txt', sep=' ', header=None)[0]
# BUG FIX: the first mask must select the NEGATIVE class (ycv != ic).
# The old code used `== ic` in BOTH lines, so the true positives were set
# to -1 first and the second assignment then matched nothing — every CV
# label ended up non-positive.  (Compare the correct version used for the
# SGD evaluation above.)
ycv[np.where(ycv != ic)[0]] = -1
ycv[np.where(ycv == ic)[0]] = 1
print(Xcv.shape, ycv.shape, np.sum(ycv == 1))
In [28]:
# BUG FIX: the model was fit on standardized chunks
# ((df - featstat['mean']) / featstat['sigma']), so the CV matrix must be
# standardized with the same statistics before predicting.
Xcv_norm = (Xcv - featstat['mean']) / featstat['sigma']
ypred_cv = clf.predict(Xcv_norm)
prec = precision_score(ycv, ypred_cv)
rec = recall_score(ycv, ypred_cv)
f1score = f1_score(ycv, ypred_cv)
print(prec, rec, f1score)
print(np.sum(ypred_cv == 1), np.sum(ycv == 1))
In [ ]: