In [8]:
import cPickle as pickle
from itertools import chain, izip
import os
import re
import discoursegraphs as dg
import discourseinfostat as di
In [2]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
# Vectorizer used further down to turn the unpickled feature samples into the matrix X.
vec = DictVectorizer()
In [3]:
# from sklearn import svm
# clf = svm.SVC(gamma=0.001, C=100.)
In [37]:
def pickles2list(pickle_filepaths):
    """
    Lazily unpickle one object per file.

    Despite its name, this is a generator: nothing is read until the
    result is iterated over.

    Parameters
    ----------
    pickle_filepaths : iterable(str)
        paths of the pickle files to load, in the order given

    Yields
    ------
    object
        the object stored in the corresponding pickle file
    """
    for pickle_filepath in pickle_filepaths:
        # Pickles are binary data: text mode can corrupt them on platforms
        # that translate line endings and is rejected by Python 3's pickle.
        # NOTE: unpickling can execute arbitrary code -- only load files
        # from a trusted source.
        with open(pickle_filepath, 'rb') as pickle_file:
            yield pickle.load(pickle_file)
def corpus_stats2sklearn_input(corpus_stats):
    """
    Flatten per-file feature and label pickles into two parallel iterators.

    Parameters
    ----------
    corpus_stats : list(tuple(str, str))
        list of (features pickle path, labels pickle path) tuples

    Returns
    -------
    samples : itertools.chain
        iterator over the elements of every unpickled features object,
        in the order of ``corpus_stats``
    labels : itertools.chain
        iterator over the elements of every unpickled labels object,
        in the same order

    Both iterators are one-shot: once consumed they stay empty. For an
    empty ``corpus_stats`` two empty iterators are returned.
    """
    path_pairs = list(corpus_stats)
    if not path_pairs:
        # zip(*[]) produces nothing to unpack into the two path tuples.
        return chain(), chain()

    feat_files, label_files = zip(*path_pairs)
    samples = pickles2list(feat_files)
    labels = pickles2list(label_files)
    # from_iterable pulls one pickle at a time; chain(*generator) would
    # unpack the generator and load every file before returning.
    return chain.from_iterable(samples), chain.from_iterable(labels)
In [5]:
%%time
# Read the corpus file configured in discourseinfostat
# (presumably the TueBa-D/Z 8 ExportXML release, judging by the names -- TODO confirm).
corpus = dg.read_exportxml(di.infostat.TUEBADZ8_FILE)
# Wrap the corpus in the object that produces the corpus statistics used below.
infostat = di.Infostat(corpus)
In [40]:
# infostat._reset_corpus_iterator()
In [39]:
%%time
# The result is passed to corpus_stats2sklearn_input, which expects
# (features pickle path, labels pickle path) tuples.
corpus_stats = infostat.corpus_stats()
In [41]:
%%time
# Both results are itertools.chain iterators: each can be consumed only once.
samples, y = corpus_stats2sklearn_input(corpus_stats)
In [42]:
%%time
# Fit the vectorizer and build the feature matrix in one pass; this consumes `samples`.
X = vec.fit_transform(samples)
In [43]:
# Materialise the labels as a boolean array; this exhausts the `y` iterator.
y_bool_array = np.array(list(y), dtype=bool)
In [44]:
# Hold out 10% of the data for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_bool_array,
    test_size=0.1,
    random_state=23,
)
In [45]:
# Linear support vector classifier with the library's default settings.
clf = LinearSVC()
In [46]:
%%time
# Train on the training split only; the test split stays unseen for scoring.
clf.fit(X_train, y_train)
Out[46]:
In [47]:
# clf.predict(X_test)
In [48]:
# Evaluate the trained classifier on the held-out test split.
clf.score(X_test, y_test)
Out[48]:
In [49]:
from sklearn.externals import joblib
In [52]:
# Directory that receives the trained classifier and the data dumps below.
classifier_path = os.path.expanduser('~/repos/discourseinfostat/results')
# joblib.dump does not create missing directories, so make sure the target
# exists before writing (keeps a fresh checkout / fresh kernel runnable).
if not os.path.isdir(classifier_path):
    os.makedirs(classifier_path)
joblib.dump(clf, os.path.join(classifier_path, 'linear_svc_clf.pkl'))
Out[52]:
In [53]:
# Persist the training feature matrix in the same directory as the classifier.
joblib.dump(X_train, os.path.join(classifier_path, 'X_train.pkl'))
Out[53]:
In [54]:
# Persist the held-out feature matrix so the score can be recomputed later.
joblib.dump(X_test, os.path.join(classifier_path, 'X_test.pkl'))
Out[54]:
In [55]:
# Persist the training labels that belong to X_train.
joblib.dump(y_train, os.path.join(classifier_path, 'y_train.pkl'))
Out[55]:
In [56]:
# Persist the held-out labels that belong to X_test.
joblib.dump(y_test, os.path.join(classifier_path, 'y_test.pkl'))
Out[56]:
In [58]:
# Persist the complete, unsplit feature matrix.
joblib.dump(X, os.path.join(classifier_path, 'X.pkl'))
Out[58]:
In [59]:
# `y` is a one-shot iterator that was already exhausted when y_bool_array
# was built from it, so dumping `y` would store no labels at all.
# Persist the materialised label array instead.
joblib.dump(y_bool_array, os.path.join(classifier_path, 'y.pkl'))
Out[59]:
In [ ]: