In [1]:
"""
Copyright 2017 Google Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
Out[1]:
In [1]:
# Standard library
import copy
import json
import math
import random
import re
from collections import defaultdict
import pickle as cPickle  # legacy cPickle alias kept for compatibility with old pickles

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
from scipy.sparse import csr_matrix
# scipy.stats.stats was a private module and has been removed; pearsonr lives in scipy.stats
from scipy.stats import pearsonr
import scikits.bootstrap as boot
import sklearn.utils
from sklearn import linear_model, svm
from sklearn.metrics import classification_report
# sklearn.cross_validation was removed in scikit-learn 0.20; LeaveOneOut moved to model_selection
from sklearn.model_selection import GridSearchCV, LeaveOneOut, cross_val_score

# Project-local helpers
from prediction_utils.configure import configure
from prediction_utils.features2vec import documents2feature_vectors, get_features, attacker_profile
from prediction_utils.plotting import attacker_plot, plot_profiles
from prediction_utils.show_examples import update, generate_snapshots, clean
from prediction_utils.test_svm import train_svm, top_coefficients
In [2]:
# Dataset variants available on disk; the first one drives this analysis run.
constraints = ['constraintA+B', 'constraintB']
constraint = constraints[0]
# Optional filename suffix for outputs (empty for the default run).
suffix = ''
In [3]:
# Load the labelled conversations for the chosen constraint and build
# leave-one-page-out folds: all conversations from the same wiki page share a
# fold so train/test splits never leak page context.
user_features, ARGS = configure(constraint)
documents = []
with open('data/%s.json'%(constraint)) as f:
    for line in f:
        conv_id, clss, conversation = json.loads(line)
        documents.append((conversation, clss, conv_id))
random.shuffle(documents)
matched_pairs = []
title_dic = defaultdict(list)
for ind, doc in enumerate(documents):
    # FIX: the original line was corrupted by an accidental paste
    # ("conversat<train_svm call>ion, clss, conv_id = doc"), a syntax error;
    # restored to the intended tuple unpacking.
    conversation, clss, conv_id = doc
    title_dic[conversation['action_feature'][0]['page_title']].append(ind)
matched_pairs = list(title_dic.values())
print('Number of folds: ', len(matched_pairs))
In [ ]:
# Train one SVM per feature configuration (C tuned per configuration) and
# compare fold-level scores with paired Wilcoxon signed-rank tests.
experiment_grid = [
    ('Conversational Features with Question Features: ', 1.2,
     dict(Conversational=True, Questions=True)),
    ('Conversational Features only: ', 0.09,
     dict(Conversational=True, Questions=False)),
    ('BOW Features Only: ', 0.0002, dict(BOW=True)),
    ('User Features Only: ', 0.3, dict(Questions=True, User=True)),
]
fold_scores = []
for label, C, feature_flags in experiment_grid:
    user_info, starter_attack_profiles, non_starter_attacker_profiles, all_profiles, feature_sets = \
        get_features(user_features, documents, ARGS, **feature_flags)
    X, y, feature_names = documents2feature_vectors(feature_sets)
    print(label)
    fold_scores.append(train_svm(X, y, C, matched_pairs))
conv_with_questions_scores, conv_only_scores, bow_scores, user_only_scores = fold_scores
print('Significance Test: ')
print('Significance between BOW and conversational feature', scipy.stats.wilcoxon(bow_scores, conv_only_scores))
print('Significance between BOW and conversational + question feature:', scipy.stats.wilcoxon(bow_scores, conv_with_questions_scores))
print('Significance between conversational only and conversational + question feature:', scipy.stats.wilcoxon(conv_only_scores, conv_with_questions_scores))
In [7]:
# Recompute features with conversational + question aspects enabled; user_info
# (per-conversation, per-user feature dicts) feeds the question-profile analysis below.
user_info, starter_attack_profiles, non_starter_attacker_profiles, all_profiles, feature_sets = get_features(user_features, documents, ARGS, Conversational=True, Questions=True)
In [8]:
def attacker_question_profile(document, user_infos, QUESTION_ASPECTS):
    """Summarize, per question aspect, who activated it in a conversation.

    The "attacker" is the author of the conversation's final action (ties on
    the latest timestamp resolve to the last action in list order; None if
    that action has no 'user_text').

    Returns a dict mapping each aspect in QUESTION_ASPECTS to one of
    'None' / 'Attacker' / 'Non-Attacker' / 'Both'.
    """
    actions = document['action_feature']
    last_time = max(a['timestamp_in_sec'] for a in actions)
    attacker = None
    for a in actions:
        if a['timestamp_in_sec'] == last_time:
            attacker = a.get('user_text')
    labels = ['None', 'Attacker', 'Non-Attacker', 'Both']
    profile = {}
    for aspect in QUESTION_ASPECTS:
        # bit 0: attacker activated the aspect; bit 1: someone else did
        code = 0
        for user, info in user_infos.items():
            if info[aspect]:
                if user == attacker:
                    code |= 1
                else:
                    code |= 2
        profile[aspect] = labels[code]
    return profile
In [9]:
# Build one question-activation profile per conversation, bucketed by class label.
question_profile = {0: [], 1: []}
# Aspect order: asked then being-asked, for question types 0, 4 and 5.
QUESTION_ASPECTS = [prefix + 'question_type%d' % typ
                    for typ in (0, 4, 5)
                    for prefix in ('', 'being_asked_')]
for doc_ind, (conversation, clss, conv_id) in enumerate(documents):
    profile = attacker_question_profile(conversation, user_info[doc_ind], QUESTION_ASPECTS)
    question_profile[clss].append(profile)
In [10]:
def plot_profiles1(profiles, ASPECTS, num_catergories = 3, \
        catergories = {'Min': 0, 'Max': 1, 'In the Middle': 2, 'Anonymous':3, 'New Comer':4, 'No Gap': 5, 'Bot': 6}, \
        cats = ['min', 'max', 'in the middle', 'Anonymous', 'New Comer'], \
        catergory_names = ['Proportion replied', 'Being replied latency', 'Reply latency', \
        'Age', 'Status', '# questions asked', '# edits on Wikipedia'], \
        conv_label = ['Offender is ', 'Non-offender is '], \
        experience=-1):
    """Plot paired stacked bars comparing how class-0 and class-1 profiles
    distribute over categories, per aspect, then print significance tests.

    profiles: {0: [profile dict, ...], 1: [...]} — one dict per conversation,
        mapping each aspect name to a category label (a key of `catergories`).
    ASPECTS: aspect names to plot, one bar pair per aspect.
    num_catergories: only categories with index < num_catergories are stacked
        into the bars and normalized; the rest are ignored in the plot.
    catergories: category label -> integer bucket index.
    cats: legend text per bucket index.
    catergory_names: x-axis tick labels (one per aspect).
    conv_label: legend prefix per class (index 0 / 1).
    experience: unused here — presumably kept for signature compatibility
        with the sibling plot_profiles; TODO confirm.

    Side effects: draws and shows a matplotlib figure; prints per-class totals
    plus Mann-Whitney and binomial test results.
    """
    f, ax = plt.subplots(1, figsize=(16,6))
    bar_width = 0.4
    # One bar slot per aspect; class-1 bars are shifted right by bar_width later.
    bar_l = [i for i in range(len(ASPECTS))]
    tick_pos = [i+bar_width for i in bar_l]
    colors = ['pink', 'mediumslateblue', 'steelblue', 'mediumaquamarine', 'darksalmon']
    # bads[clss][bucket] -> per-aspect counts; normalized to fractions below.
    bads = [[[], [], [], [], [], [], []], [[], [], [], [], [], [], []]]
    total = len(profiles[0])
    alpha=[0.9, 0.3]
    # mins[clss][aspect] -> 0/1 indicators of bucket-0 membership (Mann-Whitney input).
    mins = [[], []]
    # cnts keeps raw (un-normalized) counts for the binomial tests.
    cnts = [[[], [], [], [], [], [], []], [[], [], [], [], [], [], []]]
    rects = []
    for clss in [0, 1]:
        for aspect in ASPECTS:
            cur = []
            for ind in range(len(catergories)):
                bads[clss][ind].append(0)
                cnts[clss][ind].append(0)
            for p in profiles[clss]:
                bads[clss][catergories[p[aspect]]][-1] += 1
                cnts[clss][catergories[p[aspect]]][-1] += 1
                # 1 if in bucket 0, 0 if in any other plotted bucket; others skipped.
                if catergories[p[aspect]] == 0:
                    cur.append(1)
                elif catergories[p[aspect]] < num_catergories:
                    cur.append(0)
            mins[clss].append(cur)
        previous = [0 for a in ASPECTS]
        # first_three[ii]: total over the plotted buckets for aspect ii (normalizer).
        first_three = [0 for a in ASPECTS]
        for bad in bads[clss][:num_catergories]:
            for ii, b in enumerate(bad):
                first_three[ii] += b
        for ind,bad in enumerate(bads[clss][:num_catergories]):
            # Normalize counts to fractions of the plotted-bucket total.
            for ii, b in enumerate(bad):
                if first_three[ii]: bad[ii] = bad[ii] / first_three[ii]
            bads[clss][ind] = bad
            rects = ax.bar(bar_l, bad, label=conv_label[clss] + cats[ind], bottom = previous, alpha=alpha[clss], \
                color=colors[ind], width=bar_width, edgecolor='none', linewidth=0)
            # NOTE(review): `ind` is shadowed here; harmless only because the
            # enclosing loop reassigns it on its next iteration.
            for ind, rect in enumerate(rects):
                ax.text(rect.get_x() + rect.get_width()/2., (bad[ind] / 3 + previous[ind]),
                    '%.1f' % (bad[ind]*100) + '%',
                    ha='center', va='bottom')
            for ii, b in enumerate(bad):
                previous[ii] += b
        ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
               ncol=3, mode="expand", borderaxespad=0., fontsize='large')
        # Shift bar positions so class-1 bars sit beside class-0 bars.
        bar_l = [b+bar_width for b in bar_l]
        if clss:
            print('Good Total:')
        else:
            print('Bad Total:')
        for ii,aspect in enumerate(ASPECTS):
            print(aspect, first_three[ii])
    ax.set_ylabel("Percentage among All the Cases", fontsize='large')
    Xticks = catergory_names
    plt.xticks([t - bar_width / 2 for t in tick_pos], Xticks, fontsize='large')
    plt.setp(plt.gca().get_xticklabels(), rotation=20, horizontalalignment='right')
    plt.show()
    # Test 1: Mann-Whitney U on bucket-0 membership indicators, per aspect.
    print('Test 1')
    for ind, aspect in enumerate(ASPECTS):
        print(aspect)
        print('Average in Ggap: ', np.mean(mins[1][ind]))
        print('Average of Bgap: ', np.mean(mins[0][ind]))
        # Skip degenerate aspects where the class-1 indicator is constant.
        if np.mean(mins[1][ind]) == 1 or np.mean(mins[1][ind]) == 0:
            continue
        print(scipy.stats.mannwhitneyu(mins[0][ind], mins[1][ind]))
        print('\n')
    # Tests 2/3: binomial test of bucket-0 vs bucket-1 raw counts, per class.
    print('Test 2')
    clss = 0
    for ind, aspect in enumerate(ASPECTS):
        print(aspect, ':', scipy.stats.binom_test(cnts[clss][0][ind], cnts[clss][0][ind] + cnts[clss][1][ind]))
    print('\n')
    print('Test 3')
    clss = 1
    for ind, aspect in enumerate(ASPECTS):
        print(aspect, ':', scipy.stats.binom_test(cnts[clss][0][ind], cnts[clss][0][ind] + cnts[clss][1][ind]))
In [11]:
# Compare question-activation profiles of awry (class 0) vs. normal (class 1)
# conversations. Bucket codes mirror attacker_question_profile's labels:
# Both=0, Attacker=1, Non-Attacker=2, None=3; only the first three are plotted.
plot_profiles1(
    question_profile,
    QUESTION_ASPECTS,
    num_catergories=3,
    catergories={'None': 3, 'Attacker': 1, 'Non-Attacker': 2, 'Both': 0},
    cats=['both', 'last participant', 'others', 'None'],
    catergory_names=['asked type 0 question', 'being asked type 0 question',
                     'asked type 4 question', 'being asked type 4 quesion',
                     'asked type 5 question', 'being asked type 5 quesion'],
    conv_label=['Awry: ', 'Normal: '],
    experience=-1,
)
In [14]:
# Fit a linear SVM on the conversational + question feature set, then extract
# the 20 most positive and most negative coefficients for inspection.
user_info, starter_attack_profiles, non_starter_attacker_profiles, all_profiles, feature_sets = get_features(
    user_features, documents, ARGS, Conversational=True, Questions=True)
X, y, feature_names = documents2feature_vectors(feature_sets)
classifier = svm.LinearSVC(C=1.3)
classifier.fit(X, y)
top_pos, top_neg, num_pos, num_neg = top_coefficients(classifier, feature_names, 20)
In [15]:
# Strongest positive-class coefficients, displayed largest first.
top_pos[::-1]
Out[15]:
In [9]:
# Strongest negative-class coefficients, displayed largest-magnitude first.
top_neg[::-1]
Out[9]:
In [12]:
# ARGS[2] holds the attacker-profile aspect names produced by configure();
# plot their category distributions for both classes with the defaults.
attacker_profile_ASPECTS = ARGS[2]
plot_profiles1(all_profiles, attacker_profile_ASPECTS)
In [ ]: