In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.datasets.base import Bunch
import time
import nltk
from learner.adaptive_lr import LogisticRegressionAdaptive
from datautil.textutils import StemTokenizer
from datautil.load_data import load_from_file, split_data
from experiment.experiment_utils import split_data_sentences, clean_html

def get_data(clf, train, cats, fixk, min_size, vct, raw, limit=2):
    """Load a document dataset and train the oracle and sentence experts.

    Parameters
    ----------
    clf : classifier template; shallow-copied (copy.copy) before fitting each
        expert so the caller's instance is left untouched.
    train : dataset name forwarded to load_from_file (e.g. "imdb").
    cats, fixk : forwarded to load_from_file.
    min_size : ignored -- overridden to 10 below (kept in the signature for
        interface compatibility with existing callers).
    vct : vectorizer; fit on the pool split and reused for every transform.
    raw : forwarded to load_from_file.
    limit : sentence limit forwarded to split_data_sentences.

    Returns
    -------
    (exp_clf, data, vct, sent_clf, expert_data)
    """
    import copy
    # NOTE(review): the min_size argument is deliberately clobbered here;
    # every load below uses a fixed minimum document size of 10.
    min_size = 10

    # Second return value is load_from_file's vectorizer; unused because
    # `vct` itself is (re)fit on the pool split further down.
    data, vct2 = load_from_file(train, cats, fixk, min_size, vct, raw=raw)

    print("Data %s" % train)
    print("Data size %s" % len(data.train.data))

    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## replace <br> with "." so the ends of sentences are recognized
    data.train.data = clean_html(data.train.data)
    data.test.data = clean_html(data.test.data)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))

    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    train_test_data = Bunch()

    expert_data.sentence, train_test_data.pool = split_data(data.train)
    expert_data.oracle, train_test_data.test = split_data(data.test)

    data.train.data = train_test_data.pool.train.data
    data.train.target = train_test_data.pool.train.target

    data.test.data = train_test_data.test.train.data
    data.test.target = train_test_data.test.train.target

    ## convert documents to matrices; vocabulary is fit on the pool only
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")

    labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=limit)
    print(len(sent_train))
    expert_data.oracle.train.data = sent_train
    expert_data.oracle.train.target = np.array(labels)
    expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
    print(expert_data.oracle.train.bow.shape)

    # Fit a copy so the caller's clf keeps its original (unfitted) state.
    exp_clf = copy.copy(clf)
    exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector, vct, limit=limit)

    expert_data.sentence.train.data = sent_train
    expert_data.sentence.train.target = np.array(labels)
    expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)

    # (dead `sent_clf = None` removed -- it was immediately overwritten)
    sent_clf = copy.copy(clf)
    sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    return exp_clf, data, vct, sent_clf, expert_data

# TF-IDF over unigrams with a stemming tokenizer; latin1 matches the corpus
# encoding used by load_from_file for the IMDB data.
vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1),
                  token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())

# L1-regularized adaptive logistic regression (project-local learner).
clf = LogisticRegressionAdaptive(penalty='l1', C=1)

# cats/fixk/min_size are passed as None; get_data hard-codes min_size = 10.
exp_clf, data, vct, sent_clf, expert_data = get_data(clf, "imdb", None, None, None, vct, raw=True)  

# The sentence expert's training split is reused as the evaluation set in
# the cells below.
test = expert_data.sentence.train


Loading existing file... imdb 
Data imdb
Data size 25000
Cleaning text ... 
Cleaning text ... 
Train:25000, Test:25000, 25000
Training Oracle expert
('Spliting into sentences... Limit:', 2)
161480
(161480, 13753)
Training sentence expert
('Spliting into sentences... Limit:', 2)

Oracle Score Distribution

  • Classifier trained on sentences (161k)
  • Tested on sentences (163k)
  • Distribution of the scores $score(s_i) = \max_y \big[ P_L(y \mid s_i)\big]$
  • Distribution of $P_L (y=0|s_i )$
  • Oracle produces scores for unseen test sentences

In [63]:
def score_confusion_matrix(predicted, true_labels, labels):
    cm = confusion_matrix(true_labels, predicted, labels=labels)
    print "Predicted -->"
    print "\t" + "\t".join(str(l) for l in np.unique(true_labels))
    for l in np.unique(true_labels):
        print "{}\t{}".format(l,"\t".join(["{}".format(r) for r in cm[l]]))

# Testing the expert: evaluate the oracle classifier on the sentence split
# (`test` = expert_data.sentence.train, set two cells above).
ora_pred = exp_clf.predict(test.bow)
ora_prob = exp_clf.predict_proba(test.bow)
# score(s_i) = max_y P_L(y|s_i): confidence of the predicted class
ora_scores = ora_prob.max(axis=1)
labels = [0,1]
score_confusion_matrix(ora_pred, test.target, labels)


Predicted -->
	0	1
0	59137	24909
1	27443	52294

In [87]:
import matplotlib.pyplot as plt
import matplotlib as mpl
from collections import Counter
# mpl.style.use('fivethirtyeight')
#plot the experts score distribution
def normalized(scores):
    """Return the distinct values in `scores` and their relative frequencies.

    Returns (keys, weights) where keys are the unique score values (Counter
    key order) and weights[i] is the fraction of entries equal to keys[i].
    Fix: an empty input previously raised ZeroDivisionError; it now returns
    a pair of empty sequences.
    """
    c = Counter(scores)
    t = sum(c.values())
    if t == 0:
        return [], []
    w = [1. * x / t for x in c.values()]
    return c.keys(), w

def plot_histogram_normed(scores, target, title, range_x=np.arange(0, 1.01, .1)):
    """Plot per-class normalized, stacked histograms of classifier scores.

    Parameters
    ----------
    scores : array of per-example scores (e.g. a probability column).
    target : array of true labels in {0, 1}; used to split `scores`.
    title : prefix for the figure title.
    range_x : tick positions for the x axis.

    Returns the created Figure (fix: it was previously created and discarded).
    """
    fig = plt.figure()
    c0 = scores[target == 0]
    c1 = scores[target == 1]
    print(scores.shape)  # sanity check on the number of scored examples
    x0, w0 = normalized(c0)
    x1, w1 = normalized(c1)
    plt.hist([x0, x1], weights=[w0, w1], stacked=True, bins=10, align='mid', label=['y=0', 'y=1'])
    plt.title(title + ' Distribution $P_{L}(y=0|x)$ $y=0$ (mean=%.2f, N=%d)' % (np.mean(scores), len(target)), fontsize=14)
    # raw string: "\h" is not a valid escape; value is unchanged
    plt.xlabel(r"Score $P_{L}(\hat{y}|x)$")
    plt.ylabel("Frequency")
    plt.xticks(range_x)
    plt.legend()
    return fig

# Distribution of P_L(y=0|x) produced by the oracle on the sentence split.
plot_histogram_normed(ora_prob[:,0], test.target, "Oracle")


(163783L,)

Note: The graph above shows the distribution of $P_L(y=0|x)$ from the oracle. The bin with


In [88]:
# Plotting the oracle's confidence scores max_y P_L(y|x); the x range starts
# at 0.5 because the max of two class probabilities is always >= 0.5.
plot_histogram_normed(ora_scores, test.target, "Oracle Scores", range_x=np.arange(.5,1.01,.1))


(163783L,)

Student scores per training size

  • Student sentence classifier is trained on labeled sets $L$ with sizes ranging over [50, 2000]
  • Student is a LR-adaptive
  • Student produces scores for sentences in an unseen test set

In [90]:
# Testing the student sentence classifier
for s in range(500, 501):
    print "Size: %s" % s
    clf.fit(data.train.bow[:s], data.train.target[:s])
    stu_pred = clf.predict(test.bow)
    stu_prob = clf.predict_proba(test.bow)
    scores = stu_prob.max(axis=1)
    
    score_confusion_matrix(stu_pred, test.target, labels)
    plot_histogram_normed(stu_prob[:,0], test.target, "Stundent - |L|="+str(s))


Size: 500
Predicted -->
	0	1
0	40366	43680
1	23338	56399
(163783L,)

In [92]:
# Confidence-score distribution for the last trained student classifier
# (typo fixed in the plot title: "Stundent" -> "Student").
plot_histogram_normed(scores, test.target, "Student - |L|="+str(s), range_x=np.arange(.5,1.01,.1))


(163783L,)

In [ ]: