notebook.community

Edit and run



In [2]:

    
import json
import pickle as cPickle
import numpy as np

from sklearn import svm
import sklearn.utils
from scipy.sparse import csr_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report
import random
import matplotlib.pyplot as plt
from scipy.stats.stats import pearsonr


from collections import defaultdict
import math
from sklearn import preprocessing

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.stats import spearmanr
from sklearn import linear_model
import re
import copy

import seaborn as sns
import pandas as pd
import scipy.stats
import statsmodels.stats.proportion
from sklearn.cross_validation import LeaveOneOut
from prediction_utils.show_examples import update, generate_snapshots, clean
from prediction_utils.features2vec import _get_term_features, _get_last_n_action_features, \
            _get_action_features, _get_repeatition_features, _get_balance_features, documents2feature_vectors



In [3]:

    
import matplotlib
from matplotlib.ticker import FuncFormatter

def to_percent(y, position):
    """Format a fractional tick value as a whole-number percentage label.

    Intended for use with ``matplotlib.ticker.FuncFormatter``, which calls the
    function with the tick value and the tick position.

    Args:
        y: Tick value expressed as a fraction (0.5 means 50%).
        position: Tick position supplied by the formatter; ignored, so the
            default tick locations are simply relabelled.

    Returns:
        The percentage as a string, suffixed with ``%`` (or with the escaped
        LaTeX form when ``text.usetex`` is enabled).
    """
    # Round to the nearest integer instead of truncating: 100 * 0.57 evaluates
    # to 56.99999999999999, which int() alone would label as "56%".
    s = str(int(round(100 * y)))

    # The percent symbol needs escaping in latex
    if matplotlib.rcParams['text.usetex']:
        return s + r'$\%$'
    return s + '%'



In [14]:

    
# Three-colour palette (hex RGB strings); plotting code picks entries by index.
COLOR = ["#bb5f4c", "#8e5db0", "#729b57"]



In [18]:

    
import matplotlib.pyplot as plt
import numpy as np

# Horizontal bar chart of accuracy per method, with error bars and
# percentage annotations. Relies on COLOR, to_percent and FuncFormatter
# defined/imported in earlier cells.
plt.rcdefaults()
fig, ax = plt.subplots(figsize=(8, 4))

# Example data
# C value noted for each setting (presumably the SVM regularisation
# constant -- TODO confirm):
# conv+user C = 0.007
# BOW C= 0.00007
# Human 1 C = 0.0007
# FULL C = 0.0007
# User only C = 0.005
# conv only C = 0.005

# methods, performance and err are parallel sequences (one entry per bar).
methods = ('Human', 'Human Perception', 'BOW',
           'Conversational + Question', 'Conversational only',
           'Participant Features')
y_pos = np.arange(len(methods))
performance = (0.595, 0.551, 0.554, 0.578, 0.564, 0.530)
err = [0.017, 0.011, 0.011, 0.011, 0.011, 0.01]

barwidth = 0.5
gap = 0.1
# Vertical centre of every bar; computed once and shared by the bars and
# both text annotations so they cannot drift apart.
bar_centers = y_pos * (barwidth + gap)
ax.barh(bar_centers, performance, barwidth, xerr=err, align='center',
        color=[COLOR[c] for c in [0, 0, 1, 1, 1, 2]], ecolor='black')
ax.set_xlim(right=0.80)
upperbound = 0.759  # x position of the dash-dot reference line (presumably an upper bound on accuracy -- TODO confirm)

fontsize = 13
# Marker appended after the percentage for selected methods. The dagger is a
# raw string: '\d' is not a valid escape sequence in a normal string literal.
markers = {'Human Perception': r'$\dag$', 'BOW': '*', 'Participant Features': '***'}
for i, method in enumerate(methods):
    text = '%.1f' % (performance[i] * 100) + '%' + markers.get(method, '')
    # Accuracy value, right-aligned just left of the upper-bound line.
    ax.text(upperbound - 0.01, bar_centers[i], text, fontsize=fontsize,
            horizontalalignment='right')
    # Method name drawn inside the bar, replacing the y tick labels.
    ax.text(0.01, bar_centers[i], method, horizontalalignment='left',
            fontsize=fontsize, fontweight='bold', color='white')
ax.set_yticks([])
ax.invert_yaxis()  # first method at the top
ax.set_xlabel('Accuracy', fontsize=fontsize)
ax.axvline(x=0.5, color='k', linestyle='--')
ax.axvline(x=upperbound, color='k', linestyle='-.')

# Show x ticks as percentages.
formatter = FuncFormatter(to_percent)
ax.xaxis.set_major_formatter(formatter)
ax.tick_params(axis='x', labelsize=fontsize)
plt.show()



In [ ]:

    
def _binom_pvalue(successes, trials):
    """Return the two-sided binomial-test p-value for `successes` out of `trials`."""
    if trials == 0:
        # With no observations the test is undefined; report NaN instead of
        # handing n == 0 to SciPy.
        return float('nan')
    # Newer SciPy exposes `binomtest` (result object with `.pvalue`); the
    # legacy `binom_test` function is only used when that is unavailable.
    if hasattr(scipy.stats, 'binomtest'):
        return scipy.stats.binomtest(successes, trials).pvalue
    return scipy.stats.binom_test(successes, trials)


def plot_profiles1(profiles, ASPECTS, experience=-1):
    """Plot per-aspect category shares for both classes and print significance tests.

    For each class (index 0, labelled "Offender"/"Bad"; index 1, labelled
    "Non-offender"/"Good") and each aspect, profiles are counted per category
    label. The counts of the first three categories (Min, Max, In the Middle)
    are normalised by their per-aspect sum and drawn as stacked bars, with the
    two classes side by side. The per-aspect sums are printed per class.

    After showing the figure the function prints:
      * Test 1: the mean of the "is Min" indicator (1 for Min, 0 for Max or
        In the Middle) per class and a Mann-Whitney U test between the classes;
        the test is skipped when the class-1 mean is exactly 0 or 1.
      * Test 2 / Test 3: a two-sided binomial test of the Min count against
        the Min + Max count, for class 0 and class 1 respectively.

    Args:
        profiles: Indexable with 0 and 1; each entry is an iterable of mappings
            from aspect name to one of the labels 'Min', 'Max',
            'In the Middle', 'Anonymous', 'New Comer', 'No Gap', 'Bot'.
            Any other label raises KeyError.
        ASPECTS: Sequence of aspect keys to look up in every profile. The x
            tick labels are hard-coded (six entries), so this presumably holds
            six aspects in the matching order -- TODO confirm.
        experience: Currently unused (the filter that read it is disabled);
            kept so existing callers keep working.

    Returns:
        None. Output is the displayed figure and the printed statistics.
    """
    category_index = {'Min': 0, 'Max': 1, 'In the Middle': 2, 'Anonymous': 3,
                      'New Comer': 4, 'No Gap': 5, 'Bot': 6}
    # Legend suffixes; only the first three (the stacked categories) are drawn.
    cats = ['min', 'max', 'in the middle', 'Anonymous', 'New Comer']
    colors = ['pink', 'mediumslateblue', 'steelblue', 'mediumaquamarine', 'darksalmon']
    alpha = [0.9, 0.3]
    conv_label = ['Offender is ', 'Non-offender is ']
    n_categories = len(category_index)

    f, ax = plt.subplots(1, figsize=(13, 6))
    bar_width = 0.4
    bar_l = list(range(len(ASPECTS)))
    tick_pos = [i + bar_width for i in bar_l]

    # Both tables are indexed [class][category][aspect]. `bads` starts as raw
    # counts and its first three categories are later turned into shares;
    # `cnts` keeps the raw counts for the binomial tests.
    bads = [[[] for _ in range(n_categories)] for _ in range(2)]
    cnts = [[[] for _ in range(n_categories)] for _ in range(2)]
    # mins[class][aspect] -> list of 1 (Min) / 0 (Max or In the Middle).
    mins = [[], []]

    for clss in [0, 1]:
        for aspect in ASPECTS:
            cur = []
            for cat in range(n_categories):
                bads[clss][cat].append(0)
                cnts[clss][cat].append(0)
            for p in profiles[clss]:
                cat = category_index[p[aspect]]
                bads[clss][cat][-1] += 1
                cnts[clss][cat][-1] += 1
                if cat == 0:
                    cur.append(1)
                elif cat < 3:
                    cur.append(0)
            mins[clss].append(cur)

        # Per-aspect number of profiles falling in the first three categories.
        first_three = [0 for a in ASPECTS]
        for bad in bads[clss][:3]:
            for ii, b in enumerate(bad):
                first_three[ii] += b

        # Running bar tops, used as the bottom of the next stacked segment.
        previous = [0 for a in ASPECTS]
        for ind, bad in enumerate(bads[clss][:3]):
            # Convert counts to shares in place (left as 0 when the aspect has
            # no profile in the first three categories).
            for ii in range(len(bad)):
                if first_three[ii]:
                    bad[ii] = bad[ii] / first_three[ii]
            rects = ax.bar(bar_l, bad, label=conv_label[clss] + cats[ind],
                           bottom=previous, alpha=alpha[clss],
                           color=colors[ind], width=bar_width, edgecolor='white')
            # A separate index name keeps the category index `ind` intact.
            for jj, rect in enumerate(rects):
                ax.text(rect.get_x() + rect.get_width() / 2.,
                        (bad[jj] / 3 + previous[jj]),
                        '%.1f' % (bad[jj] * 100) + '%',
                        ha='center', va='bottom')
            for ii, b in enumerate(bad):
                previous[ii] += b

        # Shift the next class's bars one bar width to the right.
        bar_l = [b + bar_width for b in bar_l]
        if clss:
            print('Good Total:')
        else:
            print('Bad Total:')
        for ii, aspect in enumerate(ASPECTS):
            print(aspect, first_three[ii])

    # Built once, after all six bar groups carry their labels.
    ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
              ncol=3, mode="expand", borderaxespad=0., fontsize='large')
    ax.set_ylabel("Percentage among All the Cases", fontsize='large')
    Xticks = ['Proportion replied',
              'Being replied latency', 'Reply latency',
              'Age', 'Status',
              '# edits on Wikipedia']
    plt.xticks([t - bar_width / 2 for t in tick_pos], Xticks, fontsize='large')

    # rotate axis labels
    plt.setp(plt.gca().get_xticklabels(), rotation=20, horizontalalignment='right')

    # show plot
    plt.show()

    print('Test 1')
    for ind, aspect in enumerate(ASPECTS):
        print(aspect)
        good_mean = np.mean(mins[1][ind])
        print('Average in Ggap: ', good_mean)
        print('Average of Bgap: ', np.mean(mins[0][ind]))
        if good_mean == 1 or good_mean == 0:
            continue
        print(scipy.stats.mannwhitneyu(mins[0][ind], mins[1][ind]))
        print('\n')

    print('Test 2')
    for ind, aspect in enumerate(ASPECTS):
        n_min, n_max = cnts[0][0][ind], cnts[0][1][ind]
        print(aspect, ':', _binom_pvalue(n_min, n_min + n_max))
    print('\n')

    print('Test 3')
    for ind, aspect in enumerate(ASPECTS):
        n_min, n_max = cnts[1][0][ind], cnts[1][1][ind]
        print(aspect, ':', _binom_pvalue(n_min, n_min + n_max))