In [ ]:
__author__ = 'mramire8'
__copyright__ = "Copyright 2013, ML Lab"
__version__ = "0.2"
__status__ = "Development"

import sys
import os

sys.path.append(os.path.abspath("."))
sys.path.append(os.path.abspath("../"))
sys.path.append(os.path.abspath("../experiment/"))


from experiment.experiment_utils import split_data_sentences, parse_parameters_mat, clean_html, set_cost_model
import argparse
import numpy as np
from sklearn.datasets.base import Bunch
from datautil.load_data import load_from_file, split_data
from sklearn import linear_model
import time

from collections import defaultdict
from strategy import structured
from expert import baseexpert
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import random
import nltk
from scipy.sparse import vstack
from sklearn import metrics
from learner.adaptive_lr import LogisticRegressionAdaptive
import matplotlib.pyplot as plt


## `args` must exist before it is used below. A minimal placeholder parser is
## sketched here: the flag names are inferred from the `args.*` attributes this
## script accesses, and the defaults are assumptions, not the original settings.
ap = argparse.ArgumentParser()
ap.add_argument('--train', default='imdb')            # dataset name (placeholder)
ap.add_argument('--fixk', type=int, default=25)       # document truncation size (placeholder)
ap.add_argument('--seed', type=int, default=1234567)  # random seed (placeholder)
ap.add_argument('--budget', type=int, default=200)    # labeling budget (placeholder)
ap.add_argument('--expert-penalty', type=float, default=1.0)
ap.add_argument('--cost-function', default='uniform')
ap.add_argument('--cost-model', default='[1]')        # string parsed by parse_parameters_mat
args = ap.parse_args([])  # empty list: use the defaults when run inside a notebook

rand = np.random.mtrand.RandomState(args.seed)
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
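
## Illustrative only: the Punkt model splits raw text into sentences, which is
## how documents are decomposed throughout this script, e.g.
##   sent_detector.tokenize("Great cast. Weak ending.")
##   -> ['Great cast.', 'Weak ending.']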



def print_features(coef, names):
    """ Print sorted list of non-zero features/weights. """
    print "\n".join('%s/%.2f' % (names[j], coef[j]) for j in np.argsort(coef)[::-1] if coef[j] != 0)


def get_data(clf, train, cats, fixk, min_size, vct, raw):
    import copy
    min_size = 10  # note: overrides the min_size argument

    args.fixk = None  # note: overrides the caller's fixk setting

    data, vct2 = load_from_file(train, cats, fixk, min_size, vct, raw=raw)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = parse_parameters_mat(args.cost_model)

    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## replace <br> with "." so the tag is recognized as an end of sentence
    data.train.data = clean_html(data.train.data)
    data.test.data = clean_html(data.test.data)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
    ## Get the features of the sentence dataset

    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    train_test_data = Bunch()

    expert_data.sentence, train_test_data.pool = split_data(data.train)
    expert_data.oracle, train_test_data.test = split_data(data.test)

    data.train.data = train_test_data.pool.train.data
    data.train.target = train_test_data.pool.train.target

    data.test.data = train_test_data.test.train.data
    data.test.target = train_test_data.test.train.target

    ## convert document to matrix
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")

    labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=2)
    print len(sent_train)  # number of oracle training sentences
    expert_data.oracle.train.data = sent_train
    expert_data.oracle.train.target = np.array(labels)
    expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
    print expert_data.oracle.train.bow.shape  # (sentences, features)
    # exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf = copy.copy(clf)
    exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector, vct, limit=2)

    expert_data.sentence.train.data = sent_train
    expert_data.sentence.train.target = np.array(labels)
    expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)

    sent_clf = None
    # if args.cheating:
    sent_clf = copy.copy(clf)
    # sent_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    return exp_clf, data, vct, cost_model, sent_clf,  expert_data


####################### MAIN ####################
def get_sentences_by_method(pool, student, test_sent):
    test_sent = []

    list_pool = list(pool.remaining)
    # indices = rand.permutation(len(pool.remaining))
    # remaining = [list_pool[index] for index in indices]
    target_sent = []
    text_sent = []
    all_scores = []
    order_sent = []
    for i in list_pool:
        scores, sent_bow, sent_txt, order = student.x_utility(pool.data[i], pool.text[i])
        if isinstance(test_sent, list):
            test_sent = sent_bow
        else:
            test_sent = vstack([test_sent, sent_bow], format='csr')
        text_sent.append(sent_txt)
        target_sent.append(pool.target[i])
        order_sent.append(order)
        all_scores.append(scores)
    return test_sent, target_sent, text_sent, all_scores, order_sent


def calibrate_scores(n_scores, bounds=(.5, 1)):
    delta = 1. * (bounds[1] - bounds[0]) / (n_scores - 1)
    calibrated = np.ones(n_scores) * bounds[1] - np.arange(n_scores) * delta
    return calibrated
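
## Worked example (hand-computed): n_scores points spaced linearly from the
## upper bound down to the lower bound.
##   calibrate_scores(5, bounds=(.5, 1))
##   -> array([1., 0.875, 0.75, 0.625, 0.5])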


def reshape_scores(scores, sent_mat):
    sr = []
    i = 0
    for row in sent_mat:
        sc = []
        for col in row:
            sc.append(scores[i])
            i = i+1
        sr.append(sc)
    return np.array(sr)
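
## Worked example (hand-computed): regroup a flat score list to match a
## per-document sentence structure (ragged rows become an object array on the
## NumPy versions this script targets).
##   reshape_scores([.1, .2, .3, .4, .5], [['a', 'b'], ['c', 'd', 'e']])
##   -> array([[.1, .2], [.3, .4, .5]], dtype=object)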


def get_sentences_by_method_cal(pool, student, test_sent):
    test_sent = []

    list_pool = list(pool.remaining)
    # indices = rand.permutation(len(pool.remaining))
    # remaining = [list_pool[index] for index in indices]
    target_sent = []
    text_sent = []
    all_scores = []
    docs = []
    for i in list_pool:
        utilities, sent_bow, sent_txt = student.x_utility_cal(pool.data[i], pool.text[i])  # utility of every sentence in the document
        all_scores.extend(utilities) ## every score
        docs.append(sent_bow)  ## sentences for each document
        text_sent.append(sent_txt)  ## text sentences for each document
        target_sent.append(pool.target[i])   # target of every document, ground truth

    ## Calibrate scores
    n = len(all_scores)
    all_scores = np.array(all_scores)
    break_point = 2
    order = all_scores.argsort()[::-1] ## descending order
    ## generate scores equivalent to max prob
    a = calibrate_scores(n/break_point, bounds=(.5,1))
    a = np.append(a, calibrate_scores(n - n/break_point, bounds=(1,.5)))

    ## new scores assigned to original sentence order
    new_scores = np.zeros(n)
    new_scores[order] = a
    cal_scores = reshape_scores(new_scores, docs)

    selected_sent = [np.argmax(row) for row in cal_scores]  ## index of the highest-scoring sentence per document
    selected = [docs[i][k] for i, k in enumerate(selected_sent)]  ## get the bow
    selected_score = [np.max(row) for row in cal_scores]  ## get the calibrated score
    test_sent = list_to_sparse(selected)

    return test_sent, np.array(selected_score), selected_sent
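
## Sketch of the calibration above (hand-computed, n=6, break_point=2): the top
## half of the ranking maps onto [1 .. .5] descending and the bottom half onto
## [.5 .. 1] ascending, so both extremes of the ranking become high-confidence
## (for opposite classes) while the middle sits near .5:
##   np.append(calibrate_scores(3, bounds=(.5, 1)), calibrate_scores(3, bounds=(1, .5)))
##   -> array([1., 0.75, 0.5, 0.5, 0.75, 1.])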


def list_to_sparse(selected):
    test_sent = []
    for s in selected:
        if isinstance(test_sent, list):
            test_sent = s
        else:
            test_sent = vstack([test_sent, s], format='csr')
    return test_sent
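
## Note: for a non-empty list the loop above is equivalent to a single
## vstack(selected, format='csr') call; the incremental form also covers the
## degenerate case where `selected` is empty (it then returns []).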


def get_sentences_by_method_cal_scale(pool, student, test_sent, class_sensitive=True):
    from sklearn import preprocessing 
    test_sent = []

    list_pool = list(pool.remaining)
    # indices = rand.permutation(len(pool.remaining))
    # remaining = [list_pool[index] for index in indices]
    target_sent = []
    text_sent = []
    all_scores = []
    all_p0 = []
    docs = []
    for i in list_pool:
        utilities, sent_bow, sent_txt = student.x_utility_cal(pool.data[i], pool.text[i])  # utility of every sentence in the document
        all_scores.extend(utilities[::-1])  ## every score (reversed)
        docs.append(sent_bow[::-1])  ## sentences of each document (reversed)
        text_sent.append(sent_txt[::-1])  ## sentence text of each document (reversed)
        target_sent.append(pool.target[i])   # target of every document, ground truth
        all_p0.extend([student.sent_model.predict_proba(s)[0][0] for s in sent_bow])
    ## Calibrate scores

    n = len(all_scores)
    if n != len(all_p0):
        raise Exception("Size mismatch: %d utility scores vs. %d class-0 probabilities" % (n, len(all_p0)))

    all_p0 = np.array(all_p0)

    order = all_p0.argsort()[::-1] ## descending order
    ## generate scores equivalent to max prob
    ordered_p0 = all_p0[order]
    # class_sensitive = True
    if class_sensitive:
        c0_scores = preprocessing.scale(ordered_p0[ordered_p0 > .5])
        c1_scores = -1. * preprocessing.scale(ordered_p0[ordered_p0 <= .5])
        a = np.concatenate((c0_scores, c1_scores))
    else:
        a = preprocessing.scale(ordered_p0)

    new_scores = np.zeros(n)
    new_scores[order] = a
    cal_scores = reshape_scores(new_scores, docs)
    p0 = reshape_scores(all_p0, docs)

    selected_sent = [np.argmax(row) for row in cal_scores]  ## index of the highest-scoring sentence per document
    selected = [docs[i][k] for i, k in enumerate(selected_sent)]  ## get the bow
    selected_score = [np.max(row) for row in cal_scores]  ## get the calibrated score
    selected_cl = [p0[i][k] for i, k in enumerate(selected_sent)]
    test_sent = list_to_sparse(selected)

    return test_sent, np.array(selected_score), selected_cl, selected_sent
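
## Sketch of the class-sensitive scaling above (illustrative, rounded): within
## each predicted class the P(y=0) values are z-scored, and the class-1 side is
## negated, so more extreme probabilities yield larger calibrated scores, e.g.
##   preprocessing.scale([.9, .7, .6])
##   -> approximately array([ 1.34, -0.27, -1.07])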

from scipy.sparse import diags


def sentence2values(doc_text, sent_detector, score_model, vcn):
    np.set_printoptions(precision=4)
    sents = sent_detector.tokenize(doc_text)
    sents_feat = vcn.transform(sents)
    coef = score_model.coef_[0]
    dc = diags(coef, 0)

    mm = sents_feat * dc  # sentence feature vectors times the diagonal of coefficients: sentences by features
    return mm, sents, sents_feat
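
## Illustrative only: with coef = [2, -1] and a sentence whose feature vector
## is [1, 1], mm = [1, 1] * diag([2, -1]) = [2, -1]; each column of mm holds a
## feature's (value x weight) contribution to that sentence's score.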


def score_top_feat(pool, sent_detector, score_model, vcn):
    test_sent = []

    list_pool = list(pool.remaining)
    # indices = rand.permutation(len(pool.remaining))
    # remaining = [list_pool[index] for index in indices]
    target_sent = []
    for i in list_pool:
        mm, _, sent_bow = sentence2values(pool.text[i], sent_detector, score_model, vcn)
        max_vals = np.argmax(mm.max(axis=1))  # index of the sentence with the largest single-feature contribution

        if isinstance(test_sent, list):
            test_sent = sent_bow[max_vals]
        else:
            test_sent = vstack([test_sent, sent_bow[max_vals]], format='csr')
        target_sent.append(pool.target[i])
    return test_sent, target_sent


def score_distribution(calibrated, pool, sent_data, student, sizes, cheating=False, show=False):

    print "Testing size: %s" % len(pool.target)
    print "Class distribution: %s" % (1. * pool.target.sum() / len(pool.target))
    # # Sentence dataset
    train_sent = sent_data.oracle.train
    import copy
    ## don't care utility of document
    student.fn_utility = student.utility_one
    ## only testing distribution with max score of the student sentence model
    fns = [student.score_max]
    # fns = [student.score_rnd]
    results = defaultdict(list)
    for size in sizes:

        # train underlying sentence classifier of the student
        if not cheating:
            clf_test = copy.copy(student.sent_model)
            clf_test.fit(train_sent.bow[:size], train_sent.target[:size])
            student.set_sentence_model(clf_test)

        clf_test = student.sent_model

        for fn in fns:
            test_sent = []
            student.score = fn

            ## for every document pick a sentence
            if calibrated == 'zscore':
                test_sent, scores, ori_scores, sel_sent = get_sentences_by_method_cal_scale(pool, student, test_sent)
                if show:
                    plot_histogram(sel_sent, "Zcores", show=True)
            elif calibrated == 'uniform':
                test_sent, scores, sel_sent = get_sentences_by_method_cal(pool, student, test_sent)
                if show:
                    plot_histogram(sel_sent, "Uniform", show=True)
            else:
                test_sent, _, _, scores, sel_sent = get_sentences_by_method(pool, student, test_sent)
                if show:
                    plot_histogram(sel_sent, calibrated, show=True)
                # pred_prob = clf_test.predict_proba(test_sent)
                # scores = pred_prob[:,0]

            predict = clf_test.predict(test_sent)
            mname = fn.__name__
            print "-" * 40
            test_name = "caliball-{}-size-{}".format(mname, size)
            print test_name
            if show:
                plot_histogram([np.asarray(scores)[pool.target == 0], np.asarray(scores)[pool.target == 1]], test_name, show=False)
            score_confusion_matrix(pool.target, predict, [0, 1])
            accu = metrics.accuracy_score(pool.target, predict)
            print "Accu %s \t%s" % (test_name, accu)
            results[size].append(sel_sent)
    return results


def other_distribution(exp_clf, fns, pool, sent_clf, student, vct):
    # # create data for testing method
    # select the first sentence always
    print args.train
    print "Testing size: %s" % len(pool.target)
    print "Class distribution: %s" % (1. * pool.target.sum() / len(pool.target))
    student.fn_utility = student.utility_one
    # clf_test = clf
    # clf_test.fit(pool.data, pool.target)
    # student.set_sentence_model(clf_test)
    clf_test = sent_clf
    offset = 0
    for fn in fns:
        ## firstk
        test_sent = []
        student.score = fn
        test_sent, target_sent, text_sent, _, _ = get_sentences_by_method(pool, student, test_sent)
        predict = clf_test.predict(test_sent)
        pred_prob = clf_test.predict_proba(test_sent)
        mname = fn.__name__
        plot_histogram(pred_prob[:, 0], mname)
        # print "METHOD: %s" % fn.__name__

        if False:  ## debug toggle: print the selected sentences
            print_document(text_sent, offset, method_name=mname, top=500, truth=pool.target,
                           prediction=predict)  #, org_doc=pool.text)
        offset += 500
        # accu = metrics.accuracy_score(pool.target, predict)
        print mname
        score_confusion_matrix(pool.target, predict, [0, 1])
        #print "Accu %s \t%s" % (student.score.__name__, accu)
    if False:  ## show top feature method
        test_sent, target_sent = score_top_feat(pool, sent_detector, exp_clf, vct)
        predict = clf_test.predict(test_sent)

        accu = metrics.accuracy_score(pool.target, predict)
        print "Accu %s \t%s" % (score_top_feat.__name__, accu)


def main():
    test_methods = False
    test_distribution = True
    sent_average = False
    sizes = range(1000, 20000, 5000)

    # vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
    #                       token_pattern='\\b\\w+\\b')#, tokenizer=StemTokenizer())

    vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1),
                      token_pattern='\\b\\w+\\b')  #, tokenizer=StemTokenizer())

    print("Start loading ...")

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(10, args.fixk)

    if args.fixk < 0:
        args.fixk = None

    # clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    clf = LogisticRegressionAdaptive(penalty='l1', C=1)

    exp_clf, data, vct, cost_model, sent_clf, sent_data = get_data(clf, args.train, [categories[0]], args.fixk, min_size, vct, raw=True)  # expert: classifier, data contains train and test
    print "\nExpert: %s " % exp_clf

    print ("Sentences scoring")
    t0 = time.time()
    ### experiment starts

    student = structured.AALStructuredReading(model=clf, accuracy_model=None, budget=args.budget, seed=args.seed, vcn=vct,
                                              subpool=250, cost_model=cost_model)
    student.set_score_model(exp_clf)  # expert model
    student.set_sentence_model(sent_clf)  # expert sentence model
    student.limit = 2
    print "Expert: :", exp_clf
    print "Sentence:", sent_clf

    coef = exp_clf.coef_[0]
    feats = vct.get_feature_names()
    print "*" * 60
    # print_features(coef, feats)
    print "*" * 60

    pool = Bunch()
    pool.data = data.train.bow.tocsr()   # full words, for training
    pool.text = data.train.data
    pool.target = data.train.target
    pool.predicted = []
    pool.remaining = range(pool.data.shape[0])  # indices of the pool

    if sent_average:
        print sentences_average(pool, vct)

    fns = [student.score_fk, student.score_max, student.score_rnd, student.score_max_feat, student.score_max_sim]
    fns = [student.score_max]

    if test_methods:
        other_distribution(exp_clf, fns, pool, sent_clf, student, vct)  ## get prob. distribution without calibration

    calibrated = 'random'
    results = []
    if test_distribution:
        from collections import Counter
        ## create data for testing method
        # select the first sentence always
        # for i in range(5):
        for cal in ['uniform', 'zscore']:
            results.append(score_distribution(cal, pool, sent_data, student, [1], cheating=True))
        avg = Counter()
        for r in results:
            c = Counter(r[1][0])
            for k,v in c.iteritems():
                avg[k] += v/10.

        plt.hist(avg.keys(), weights=avg.values(), bins=100, align='mid', alpha=.65)
        plt.title("Random Distributions t=10", fontsize=12)
        plt.xlabel("Sentence Location")
        plt.ylabel("Frequency")
        plt.legend()
        plt.show()


    print "Elapsed time %.3f" % (time.time() - t0)


def score_confusion_matrix(true_labels, predicted, labels):
    cm = metrics.confusion_matrix(true_labels, predicted, labels=labels)
    print "Predicted -->"
    print "\t" + "\t".join(str(l) for l in np.unique(true_labels))
    for l in np.unique(true_labels):
        print "{}\t{}".format(l,"\t".join(["{}".format(r) for r in cm[l]]))
    print "\n{}\t{}".format(cm[0][0]+cm[1][0],cm[0][1]+cm[1][1],)


def plot_histogram(values, title, show=False):

    n, bins, patches = plt.hist(values, stacked=True, bins=100, align='mid',label=['y=0', 'y=1'], alpha=.65)

    # plt.title(title + ' Distribution $P_{L}(y=0|x)$ $y=0$ (mean=%.2f, N=%d)' % (np.mean(values), len(values)), fontsize=12)
    plt.xlabel(r"$P_{L}(\hat{y}=0|x)$")
    plt.ylabel("Frequency")
    plt.legend()
    plt.savefig(title+".png", bbox_inches="tight", dpi=200, transparent=True)
    if show:
        plt.show()
    else:
        plt.clf()