In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.datasets.base import Bunch
import time
import nltk
from learner.adaptive_lr import LogisticRegressionAdaptive
from datautil.textutils import StemTokenizer
from datautil.load_data import load_from_file, split_data
from experiment.experiment_utils import split_data_sentences, clean_html
def get_data(clf, train, cats, fixk, min_size, vct, raw, limit=2):
    """Load a dataset, split it into pool/test plus two sentence-level expert
    sets, and fit an oracle and a sentence classifier on the expert sets.

    Parameters
    ----------
    clf : estimator with fit(); shallow-copied for each expert so the
        passed-in instance itself is left unfitted.
    train : dataset name forwarded to load_from_file (e.g. "imdb").
    cats, fixk : forwarded to load_from_file.
    min_size : ignored -- unconditionally overridden to 10 below.
    vct : vectorizer; fit on the pool documents, then reused to transform
        the test and expert sentence sets.
    raw : forwarded to load_from_file.
    limit : forwarded to split_data_sentences.

    Returns
    -------
    (exp_clf, data, vct, sent_clf, expert_data)
    """
    import copy
    # NOTE(review): this silently overrides the caller's min_size argument;
    # the caller below passes None and relies on this default -- confirm
    # intent before removing.
    min_size = 10
    data, _ = load_from_file(train, cats, fixk, min_size, vct, raw=raw)
    print("Data %s" % train)
    print("Data size %s" % len(data.train.data))

    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    # Rewrite <br> markup so the punkt detector can recognize sentence ends.
    data.train.data = clean_html(data.train.data)
    data.test.data = clean_html(data.test.data)
    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))

    # Create splits of data: pool, test, oracle, sentences.
    expert_data = Bunch()
    train_test_data = Bunch()
    expert_data.sentence, train_test_data.pool = split_data(data.train)
    expert_data.oracle, train_test_data.test = split_data(data.test)
    data.train.data = train_test_data.pool.train.data
    data.train.target = train_test_data.pool.train.target
    data.test.data = train_test_data.test.train.data
    data.test.target = train_test_data.test.train.target

    # Convert documents to bag-of-words matrices.
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")
    labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=limit)
    print(len(sent_train))
    expert_data.oracle.train.data = sent_train
    expert_data.oracle.train.target = np.array(labels)
    expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
    print(expert_data.oracle.train.bow.shape)
    exp_clf = copy.copy(clf)
    exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector, vct, limit=limit)
    expert_data.sentence.train.data = sent_train
    expert_data.sentence.train.target = np.array(labels)
    expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)
    # (removed a dead `sent_clf = None` that was immediately overwritten)
    sent_clf = copy.copy(clf)
    sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    return exp_clf, data, vct, sent_clf, expert_data
# Configuration: vectorizer + adaptive logistic regression, then build all
# splits/experts from the IMDB dataset. These globals (vct, clf, exp_clf,
# data, sent_clf, expert_data, test) are used by the cells below.
vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1),
                      token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
clf = LogisticRegressionAdaptive(penalty='l1', C=1)
# cats/fixk/min_size are passed as None; get_data overrides min_size internally.
exp_clf, data, vct, sent_clf, expert_data = get_data(clf, "imdb", None, None, None, vct, raw=True)
# Evaluation set for all cells below: the sentence expert's training split.
test = expert_data.sentence.train
In [63]:
def score_confusion_matrix(predicted, true_labels, labels):
    """Print a tab-separated confusion matrix (rows = true, cols = predicted).

    Parameters
    ----------
    predicted : predicted labels.
    true_labels : ground-truth labels.
    labels : label order for the matrix rows/columns, e.g. [0, 1].
    """
    cm = confusion_matrix(true_labels, predicted, labels=labels)
    print("Predicted -->")
    print("\t" + "\t".join(str(l) for l in labels))
    # Index rows positionally: confusion_matrix orders rows by `labels`,
    # so cm[label_value] is only correct when labels happen to be 0..n-1.
    for i, l in enumerate(labels):
        print("{}\t{}".format(l, "\t".join("{}".format(r) for r in cm[i])))
# Testing the expert (oracle) classifier on the sentence set.
ora_pred = exp_clf.predict(test.bow)
ora_prob = exp_clf.predict_proba(test.bow)
ora_scores = ora_prob.max(axis=1)  # confidence = max class probability per row
labels = [0,1]
score_confusion_matrix(ora_pred, test.target, labels)
In [87]:
import matplotlib.pyplot as plt
import matplotlib as mpl
from collections import Counter
# mpl.style.use('fivethirtyeight')
#plot the experts score distribution
def normalized(scores):
    """Return the distinct values in `scores` and their relative frequencies.

    Returns a (keys, weights) pair where weights[i] is the fraction of
    entries equal to the i-th key; keys and weights share the Counter's
    iteration order.
    """
    counts = Counter(scores)
    total = sum(counts.values())
    weights = [1. * n_obs / total for n_obs in counts.values()]
    return counts.keys(), weights
def plot_histogram_normed(scores, target, title, range_x=np.arange(0,1.01,.1)):
    """Plot a stacked histogram of `scores` split by binary `target`, with
    each class's bars weighted by within-class relative frequency.

    Parameters
    ----------
    scores : 1-d array of per-example scores (e.g. P(y=0|x)).
    target : array of 0/1 labels aligned with `scores`.
    title : prefix for the figure title.
    range_x : x-axis tick positions (default evaluated once at definition
        time; safe here because it is only read, never mutated).
    """
    plt.figure()  # fresh figure so repeated calls do not draw on top of each other
    c0 = scores[target==0]
    c1 = scores[target==1]
    x0, w0 = normalized(c0)
    x1, w1 = normalized(c1)
    # (removed leftover debug `print scores.shape` and unused n/bins/patches unpacking)
    plt.hist([x0, x1], weights=[w0, w1], stacked=True, bins=10, align='mid',
             label=['y=0', 'y=1'])
    plt.title(title + ' Distribution $P_{L}(y=0|x)$ $y=0$ (mean=%.2f, N=%d)' % (np.mean(scores), len(target)), fontsize=14)
    plt.xlabel("Score $P_{L}(\hat{y}|x)$")
    plt.ylabel("Frequency")
    plt.xticks(range_x)
    plt.legend()
# Distribution of the oracle's P(y=0|x) on the sentence set, split by true label.
plot_histogram_normed(ora_prob[:,0], test.target, "Oracle")
Note: The graph above shows the distribution of $P_L(y=0|x)$ from the oracle. The bin with the largest weight indicates the score range into which most sentences fall.
In [88]:
# Plotting the oracle's confidence scores (max class probability per example).
plot_histogram_normed(ora_scores, test.target, "Oracle Scores", range_x=np.arange(.5,1.01,.1))
In [90]:
# Testing the student sentence classifier at increasing labeled-set sizes |L|.
# range(500, 501) runs a single size; widen the range to sweep several sizes.
for s in range(500, 501):
    print("Size: %s" % s)
    # Fit the student on the first s pool documents only.
    clf.fit(data.train.bow[:s], data.train.target[:s])
    stu_pred = clf.predict(test.bow)
    stu_prob = clf.predict_proba(test.bow)
    scores = stu_prob.max(axis=1)  # reused by the following cell
    score_confusion_matrix(stu_pred, test.target, labels)
    # (fixed "Stundent" -> "Student" typo in the plot title)
    plot_histogram_normed(stu_prob[:,0], test.target, "Student - |L|="+str(s))
In [92]:
# Student confidence-score distribution for the last |L| from the loop above.
# (fixed "Stundent" -> "Student" typo in the plot title)
plot_histogram_normed(scores, test.target, "Student - |L|="+str(s), range_x=np.arange(.5,1.01,.1))
In [ ]: