In [1]:
%matplotlib inline
import matplotlib as plt
import numpy as np
from autotagger.stackoverflow.preprocess import load_pickle_sklearn_format
from sklearn import cross_validation,linear_model
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.multiclass import OneVsRestClassifier
import pickle
In [2]:
# Load the "1GB_100_features" dataset through the project's loader. The two
# returned objects are used below as the feature matrix (X) and the target
# matrix (Y) -- presumably one indicator column per tag; confirm with the loader.
X, Y = load_pickle_sklearn_format("1GB_100_features")
In [4]:
# Hold out 80% of the rows for evaluation and fit on the remaining 20%;
# random_state is fixed so the split is reproducible across runs.
# NOTE(review): an 80% test share is unusually large -- confirm it is deliberate.
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    X,
    Y,
    test_size=0.80,
    random_state=42,
)
# Show the shape of each of the four splits as a quick sanity check.
tuple(split.shape for split in (X_train, X_test, Y_train, Y_test))
Out[4]:
In [6]:
# Base estimator: ordinary least-squares linear regression, wrapped in a
# one-vs-rest meta-estimator so that a separate copy is fit for every label.
# NOTE(review): LinearRegression is a regressor rather than a classifier --
# confirm it is the intended base estimator for this experiment.
clf = linear_model.LinearRegression()
meta_clf = OneVsRestClassifier(estimator=clf)
In [7]:
# Train the one-vs-rest model on the training split (the fitted estimator's
# repr is displayed as the cell output).
meta_clf.fit(X_train, Y_train)
Out[7]:
In [8]:
# Predict the labels of the held-out rows; Y_pred is scored against Y_test below.
Y_pred = meta_clf.predict(X_test)
In [9]:
# Macro average: compute the F1 score of every label separately, then take
# their unweighted mean (each label counts equally, however rare it is).
f1_score(Y_test, Y_pred, average="macro")
Out[9]:
In [11]:
# Restrict the macro average to the labels whose individual F1 score is
# non-zero, i.e. labels with at least one correct positive prediction. The
# score goes up because every zero-scoring label is dropped from the mean.
# NOTE: the labels are selected using the test-set scores themselves, so this
# is an optimistic figure rather than an unbiased estimate.
label_scores = f1_score(Y_test, Y_pred, average=None)
valid_label_indices = np.flatnonzero(label_scores)
f1_score(Y_test, Y_pred, labels=valid_label_indices, average="macro")
Out[11]:
In [12]:
# Micro average: pool the true positives, false positives and false negatives
# of all labels and compute a single global F1 score from those totals.
# (Averaging the score per instance would be average='samples' instead.)
f1_score(Y_test, Y_pred, average="micro")
Out[12]:
In [ ]: