In [12]:
    
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.utils import Bunch
import time
import nltk
from learner.adaptive_lr import LogisticRegressionAdaptive
from datautil.textutils import StemTokenizer
from datautil.load_data import load_from_file, split_data
from experiment.experiment_utils import split_data_sentences, clean_html
def get_data(clf, train, cats, fixk, min_size, vct, raw, limit=2):
    import copy
    min_size = 10  # note: overrides the min_size argument passed in
    data, vct2 = load_from_file(train, cats, fixk, min_size, vct, raw=raw)
    print("Data %s" % train)
    print("Data size %s" % len(data.train.data))
    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    ## replace "<br>" tags with "." so sentence boundaries are recognized
    data.train.data = clean_html(data.train.data)
    data.test.data = clean_html(data.test.data)
    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
    ## Get the features of the sentence dataset
    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    train_test_data = Bunch()
    expert_data.sentence, train_test_data.pool = split_data(data.train)
    expert_data.oracle, train_test_data.test = split_data(data.test)
    data.train.data = train_test_data.pool.train.data
    data.train.target = train_test_data.pool.train.target
    data.test.data = train_test_data.test.train.data
    data.test.target = train_test_data.test.train.target
    ## convert document to matrix
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)
    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")
    labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=limit)
    print(len(sent_train))
    expert_data.oracle.train.data = sent_train
    expert_data.oracle.train.target = np.array(labels)
    expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
    print(expert_data.oracle.train.bow.shape)
    
    exp_clf = copy.copy(clf)
    exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)
    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector, vct, limit=limit)
    expert_data.sentence.train.data = sent_train
    expert_data.sentence.train.target = np.array(labels)
    expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)
    sent_clf = copy.copy(clf)
    sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)
    return exp_clf, data, vct, sent_clf, expert_data
vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1),
                      token_pattern=r'\b\w+\b', tokenizer=StemTokenizer())  # token_pattern is ignored when a custom tokenizer is given
clf = LogisticRegressionAdaptive(penalty='l1', C=1)
exp_clf, data, vct, sent_clf, expert_data = get_data(clf, "imdb", None, None, None, vct, raw=True)  
test = expert_data.sentence.train
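
Note: split_data_sentences comes from the project-specific experiment.experiment_utils module. A minimal sketch of what it plausibly does, assuming it tokenizes each document with the Punkt detector, keeps at most `limit` sentences per document, and copies the document label to every sentence (hypothetical reconstruction; the real helper may also use vct to drop sentences that vectorize to empty rows):

def split_data_sentences_sketch(bunch, sent_detector, vct, limit=2):
    labels, sentences = [], []
    for doc, target in zip(bunch.data, bunch.target):
        sents = sent_detector.tokenize(doc)[:limit]  # first `limit` sentences only
        sentences.extend(sents)
        labels.extend([target] * len(sents))  # propagate the document label
    return labels, sentences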
    
    
In [63]:
    
def score_confusion_matrix(predicted, true_labels, labels):
    cm = confusion_matrix(true_labels, predicted, labels=labels)
    print "Predicted -->"
    print "\t" + "\t".join(str(l) for l in np.unique(true_labels))
    for l in np.unique(true_labels):
        print "{}\t{}".format(l,"\t".join(["{}".format(r) for r in cm[l]]))
# Testing the expert
ora_pred = exp_clf.predict(test.bow)
ora_prob = exp_clf.predict_proba(test.bow)
ora_scores = ora_prob.max(axis=1)
labels = [0,1]
score_confusion_matrix(ora_pred, test.target, labels)
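
Note: For a single summary number alongside the confusion matrix, sklearn's standard metrics can be applied to the same predictions (ora_pred and test.target from the cell above):

from sklearn.metrics import accuracy_score, f1_score
print("Accuracy: %.3f" % accuracy_score(test.target, ora_pred))
print("F1 (y=1): %.3f" % f1_score(test.target, ora_pred, pos_label=1))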
    
    
In [87]:
    
import matplotlib.pyplot as plt
import matplotlib as mpl
from collections import Counter
# mpl.style.use('fivethirtyeight')
# plot the expert's score distribution
def normalized(scores):
    c = Counter(scores)
    t = sum(c.values())
    w = [float(x) / t for x in c.values()]
    return list(c.keys()), w
def plot_histogram_normed(scores, target, title, range_x=np.arange(0,1.01,.1)):
    fig = plt.figure()
    c0 = scores[target==0]
    c1 = scores[target==1]
    print(scores.shape)
    x0, w0 = normalized(c0)
    x1, w1 = normalized(c1)
    n, bins, patches = plt.hist([x0, x1], weights=[w0, w1], stacked=True, bins=10, align='mid', label=['y=0', 'y=1'])
    plt.title(title + ' distribution of $P_{L}(y=0|x)$ (mean=%.2f, N=%d)' % (np.mean(scores), len(target)), fontsize=14)
    plt.xlabel(r"Score $P_{L}(\hat{y}|x)$")
    plt.ylabel("Frequency")
    plt.xticks(range_x)
    plt.legend()
plot_histogram_normed(ora_prob[:,0], test.target, "Oracle")
    
    
    
Note: The graph above shows the distribution of $P_L(y=0|x)$ from the oracle. Bins near 0 and 1 correspond to sentences the oracle classifies with high confidence, while bins around 0.5 correspond to uncertain predictions.
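
Note: The same per-bin masses can be read off numerically with np.histogram, using ora_prob and test.target from the cells above:

for y in (0, 1):
    mass, edges = np.histogram(ora_prob[test.target == y, 0], bins=10, range=(0, 1))
    print("y=%d:" % y, mass / float(mass.sum()))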
In [88]:
    
# Plotting the oracle's maximum posterior scores
plot_histogram_normed(ora_scores, test.target, "Oracle Scores", range_x=np.arange(.5,1.01,.1))
    
    
    
In [90]:
    
# Testing the student classifier on the sentence test set
for s in range(500, 501):  # a single labeled-set size; widen the range to sweep |L|
    print("Size: %s" % s)
    clf.fit(data.train.bow[:s], data.train.target[:s])
    stu_pred = clf.predict(test.bow)
    stu_prob = clf.predict_proba(test.bow)
    scores = stu_prob.max(axis=1)
    
    score_confusion_matrix(stu_pred, test.target, labels)
    plot_histogram_normed(stu_prob[:,0], test.target, "Student - |L|="+str(s))
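
Note: The loop above trains the student at a single labeled-set size (|L|=500). A minimal sketch of a learning-curve sweep over several sizes, reusing the same fit/predict pattern (the sizes here are illustrative):

from sklearn.metrics import accuracy_score
for s in (100, 250, 500, 1000, 2000):
    clf.fit(data.train.bow[:s], data.train.target[:s])
    acc = accuracy_score(test.target, clf.predict(test.bow))
    print("|L|=%d  accuracy=%.3f" % (s, acc))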
    
    
    
In [92]:
    
plot_histogram_normed(scores, test.target, "Student - |L|="+str(s), range_x=np.arange(.5,1.01,.1))
    
    
    