In [1]:
import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn
import sys

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
from sklearn import linear_model

from sklearn.metrics import precision_score, recall_score, f1_score

In [3]:
clf = linear_model.SGDClassifier(loss='hinge', penalty='l2', \
                                 alpha=0.01, l1_ratio=0.15, \
                                 fit_intercept=True, n_iter=100000, \
                                 shuffle=False, verbose=0, epsilon=0.1, \
                                 n_jobs=1, random_state=None, learning_rate='optimal', \
                                 eta0=0.0, power_t=0.5, class_weight=None)

In [3]:
y_all = pandas.read_table('../data/label_tr.txt', header=None, sep=' ')

y_all.head()


Out[3]:
0
0 161
1 56
2 119
3 138
4 163

In [4]:
ndim = pandas.read_table('../data/data_tr.txt', sep=' ', header=None, nrows=3).shape[1]

print('ndim = %d'%ndim)
sx = np.zeros(shape=ndim, dtype=float)
ssx = np.zeros(shape=ndim, dtype=float)

chunks = 50000
nt = 0
for df in pandas.read_table('../data/data_tr.txt', sep=' ', header=None, iterator=True, chunksize=chunks):
    nt += df.shape[0]
    sx += np.sum(df, axis=0)
    ssx += np.sum(df**2, axis=0)

mean_x = sx / float(nt)
var_x  = np.sqrt((ssx - 2*sx*mean_x + mean_x**2) / float(nt))


ndim = 900

In [8]:
featstat = pandas.DataFrame({'mean':mean_x, 'sigma':var_x})
print(featstat.shape)
featstat.to_csv('../data/feat_stats.csv', sep=',')
featstat.head()


(900, 2)
Out[8]:
mean sigma
0 1.172847 1.883212
1 0.453921 1.437891
2 0.746769 1.410678
3 1.557798 1.884574
4 1.215836 1.525495

In [10]:
featstat = pandas.read_csv('../data/feat_stats.csv')
print(featstat.head())

ic = 136

y = np.empty(shape=y_all.shape[0], dtype=int)

y[np.where(y_all[0] != ic)[0]] = -1
y[np.where(y_all[0] == ic)[0]] = 1

print(y.shape, np.sum(y==1), np.sum(y==-1))

chunks=100000

for i in range(1):
  sys.stdout.write('%d '%(i))
  n = 0
  for df in pandas.read_table('../data/data_tr.txt', sep=' ', header=None, iterator=True, chunksize=chunks):
    n0, n1 = n*chunks, (n+1)*chunks
    if n1 > y.shape[0]:
        n1 = y.shape[0] - n0
    ysub = y[n0:n1]
    #sys.stdout.write('%d (%d-%d) %d\t'%(n, n0, n1, ysub.shape[0]))
    df = (df - mean_x) / var_x
    
    clf.partial_fit(df, ysub, classes=[1,-1], sample_weight=None)
    n += 1


   Unnamed: 0      mean     sigma
0           0  1.172847  1.883212
1           1  0.453921  1.437891
2           2  0.746769  1.410678
3           3  1.557798  1.884574
4           4  1.215836  1.525495
((900000,), 4440, 895560)
0 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-10da60a6322c> in <module>()
     24     df = (df - mean_x) / var_x
     25 
---> 26     clf.partial_fit(df, ysub, classes=[1,-1], sample_weight=None)
     27     n += 1

NameError: name 'clf' is not defined

In [33]:
### Reading cross-validation set

Xcv = pandas.read_table('../data/data_cv.txt', sep=' ', header=None)
print(Xcv.shape)

ycv = pandas.read_table('../data/label_cv.txt', sep=' ', header=None)[0].values

ycv[np.where(ycv != ic)[0]] = -1
ycv[np.where(ycv == ic)[0]] = 1

print(Xcv.shape, ycv.shape, np.sum(ycv == 1))


(100000, 900)
((100000, 900), (100000,), 517)

In [8]:
ypred_cv = clf.predict(Xcv)

np.sum(ypred_cv == 1)


Out[8]:
2989

In [9]:
from sklearn.metrics import precision_score, recall_score, f1_score

prec = precision_score(ycv, ypred_cv)
rec  = recall_score(ycv, ypred_cv)
f1score = f1_score(ycv, ypred_cv)

print(prec, rec, f1score)


/usr/local/lib/python2.7/dist-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
(2.8782601417734394e-05, 0.0040299999999999997, 5.6602729016966184e-05)

Logistic Regression


In [26]:
clf = linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.00001, C=0.001, \
                                      fit_intercept=True, intercept_scaling=1, class_weight=None, \
                                      random_state=None, solver='liblinear', max_iter=10000, \
                                      multi_class='ovr')

In [27]:
#featstat = pandas.read_csv('../data/feat_stats.csv')
print(featstat.head())

ic = 39

y = np.empty(shape=y_all.shape[0], dtype=int)

y[np.where(y_all[0] != ic)[0]] = -1
y[np.where(y_all[0] == ic)[0]] = 1

print(y.shape, np.sum(y==1), np.sum(y==-1))

chunks=100000

for i in range(1):
  sys.stdout.write('%d '%(i))
  n = 0
  for df in pandas.read_table('../data/data_tr.txt', sep=' ', header=None, iterator=True, chunksize=chunks):
    n0, n1 = n*chunks, (n+1)*chunks
    if n1 > y.shape[0]:
        n1 = y.shape[0] - n0
    ysub = y[n0:n1]
    #sys.stdout.write('%d (%d-%d) %d\t'%(n, n0, n1, ysub.shape[0]))
    df = (df - featstat['mean']) / featstat['sigma']
    
    clf.fit(df, ysub)
    n += 1
    break


   Unnamed: 0      mean     sigma
0           0  1.172847  1.883212
1           1  0.453921  1.437891
2           2  0.746769  1.410678
3           3  1.557798  1.884574
4           4  1.215836  1.525495
((900000,), 1261, 898739)
0 

In [16]:
### Reading cross-validation set

Xcv = pandas.read_table('../data/data_cv.txt', sep=' ', header=None)
print(Xcv.shape)

ycv = pandas.read_table('../data/label_cv.txt', sep=' ', header=None)[0]

ycv[np.where(ycv==ic)[0]] = -1
ycv[np.where(ycv==ic)[0]] = 1

print(Xcv.shape, ycv.shape, np.sum(ycv == 1))


(100000, 900)
((100000, 900), (100000,), 130)

In [28]:
ypred_cv = clf.predict(Xcv)


prec = precision_score(ycv, ypred_cv)
rec  = recall_score(ycv, ypred_cv)
f1score = f1_score(ycv, ypred_cv)

print(prec, rec, f1score)

print(np.sum(ypred_cv == 1), np.sum(ycv == 1))


/usr/local/lib/python2.7/dist-packages/sklearn/metrics/classification.py:1082: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/classification.py:1172: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/classification.py:676: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
(2.4855171034206842e-05, 0.0049800000000000001, 4.9463469978801547e-05)
(20, 130)

In [ ]: