In [4]:
import numpy as np
import matplotlib.pyplot as plt
import csv
import pickle
import random as random
import pickle as cPickle
from sklearn import svm
from scipy.sparse import csr_matrix
from features.vectorizer import PolitenessFeatureVectorizer
from sklearn.metrics import classification_report
%matplotlib inline
In [3]:
def plotHistogram(inputCsv, title, b=20):
    """Plot a histogram of the scores stored in the last column of a CSV file.

    The first row of the file is treated as a header and skipped. The last
    field of every remaining non-empty row is parsed as a float and the
    resulting values are drawn with ``plt.hist``.

    Parameters
    ----------
    inputCsv : str
        Path to a comma-delimited CSV file (fields may be double-quoted).
    title : str
        Title shown above the histogram.
    b : int, optional
        Passed to ``plt.hist`` as ``bins`` (default 20).

    Raises
    ------
    ValueError
        If the last field of a data row cannot be parsed as a float.
    """
    import sys  # local import: only needed to choose the file mode below

    # The csv module expects a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        csvfile = open(inputCsv, 'r', newline='')
    else:
        csvfile = open(inputCsv, 'rb')

    with csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        # csv.reader yields [] for blank lines, which carry no score field.
        scores = [float(row[-1]) for row in reader if row]

    plt.clf()
    plt.hist(scores, bins=b)
    plt.title(title)
    plt.xlabel("Politeness Score")
    plt.ylabel("# Requests")
    plt.show()
# Draw one score histogram per annotated corpus: (csv path, title, bin count).
for corpusCsv, corpusTitle, corpusBins in [
    ('data/wikipedia.annotated.csv', "Wiki Score Distribution", 40),
    ('data/stack-exchange.annotated.csv', "SE Score Distribution", 50),
]:
    plotHistogram(corpusCsv, corpusTitle, corpusBins)
In [7]:
PATH = 'data/reddit/pickleFiles/'
DATA_FILES = ['reddit_data.pickle']
redditData = pickle.load(open(PATH + DATA_FILES[0], "rb" ))
print redditData.keys()
redditDics = {}
for key, val in redditData.iteritems():
print key
print len(val[1])
new = []
count = 0
for v in val[1]:
dic = {}
if v['before']=='[deleted]' or v['before']=='':
count += 1
continue
dic['Request'] = v['before'].encode('ascii','ignore') + ' ' + v['current'].encode('ascii','ignore')
new.append(dic)
redditDics[key] = new
In [8]:
# Show the request dicts collected under the 'unitedstates' key.
print(redditDics['unitedstates'])
Note: for some classifiers, `X` must be converted to a dense array with `X.toarray()`.
In [19]:
def documents2feature_vectors(documents):
    """Turn an iterable of documents into a sparse feature matrix.

    Each document is passed to the module-level ``vectorizer.features``; the
    values of the returned mapping are laid out in sorted-key order to form
    one row. All rows are stacked and returned as a ``csr_matrix``.

    NOTE(review): rows only line up column-wise if every document yields the
    same set of feature keys -- confirm against the vectorizer.
    """
    rows = []
    for doc in documents:
        feats = vectorizer.features(doc)
        ordered_names = list(feats.keys())
        ordered_names.sort()
        rows.append([feats[name] for name in ordered_names])
    dense = np.asarray(rows)
    return csr_matrix(dense)
def getScores(sub):
    """Return the module-level ``clf``'s predictions for the documents in ``sub``.

    The documents are vectorized with ``documents2feature_vectors`` and the
    result is passed to ``clf.predict``. Returns ``None`` when ``sub`` is
    empty.
    """
    # Nothing to score: hand back None, as callers of the original received.
    if len(sub) == 0:
        return None
    return clf.predict(documents2feature_vectors(sub))
MODEL_FILENAME = 'politeness-svm.p'
clf = pickle.load(open(MODEL_FILENAME,'rb'))
vectorizer = PolitenessFeatureVectorizer()
adminRequests = pickle.load(open('data/parsed/adminRequests_parsed.p'))
before, after =[], []
i = 0
for admin, reqs in adminRequests.iteritems():
if i%50==0:
print i
if len(reqs['before'])!=0 and len(reqs['after'])!=0:
req['beforescores'] = getScores(reqs['before'])
before.append(req['beforescores'])
req['afterscores'] = getScores(reqs['after'])
after.append(req['afterscores'])
i+=1
In [25]:
a_mean, b_mean= [], []
for a in after:
a_mean.append(np.mean(a))
for b in before:
b_mean.append(np.mean(b))
print "Average admin politeness:\n"
print "Before Elections -\t", np.mean(b_mean)
print "After Elections -\t", np.mean(a_mean)
In [ ]: