In [4]:
import numpy as np
import matplotlib.pyplot as plt
import csv
import pickle
import random as random
import pickle as cPickle
from sklearn import svm
from scipy.sparse import csr_matrix
from features.vectorizer import PolitenessFeatureVectorizer
from sklearn.metrics import classification_report
%matplotlib inline

In [3]:
def plotHistogram(inputCsv, title, b=20):
    """Plot a histogram of the values found in the last column of a CSV file.

    The first row is treated as a header and skipped. The last field of every
    remaining non-empty row is converted with ``float`` and the resulting
    values are drawn with ``plt.hist``.

    Args:
        inputCsv: Path of the CSV file to read.
        title: Title shown above the histogram.
        b: Forwarded to ``plt.hist`` as ``bins`` (default 20).

    Raises:
        ValueError: If the last field of a data row cannot be parsed as a float.
    """
    # Binary mode is what the Python 2 csv module expects for its input file.
    with open(inputCsv, 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(reader, None)
        # Blank lines come back as empty rows; skip them rather than fail on row[-1].
        scores = [float(row[-1]) for row in reader if row]

    plt.clf()
    plt.hist(scores, bins=b)
    plt.title(title)
    plt.xlabel("Politeness Score")
    plt.ylabel("# Requests")
    plt.show()
    
# One histogram per annotated corpus: (csv path, plot title, number of bins).
HISTOGRAM_SPECS = [
    ('data/wikipedia.annotated.csv', "Wiki Score Distribution", 40),
    ('data/stack-exchange.annotated.csv', "SE Score Distribution", 50),
]
for csvPath, plotTitle, binCount in HISTOGRAM_SPECS:
    plotHistogram(csvPath, plotTitle, binCount)



In [7]:
PATH = 'data/reddit/pickleFiles/'
DATA_FILES = ['reddit_data.pickle']
redditData = pickle.load(open(PATH + DATA_FILES[0], "rb" ))
print redditData.keys()

redditDics = {}
for key, val in redditData.iteritems():
    print key
    print len(val[1])
    new = []
    count = 0
    for v in val[1]:
        dic = {}
        if v['before']=='[deleted]' or v['before']=='':
            count += 1
            continue
        dic['Request'] = v['before'].encode('ascii','ignore') + ' ' + v['current'].encode('ascii','ignore')
        new.append(dic)
    redditDics[key] = new


['unitedkingdom', 'usnews', 'unitedstates', 'ukpolitics', 'uspolitics', 'uknews']
unitedkingdom
1201
usnews
820
unitedstates
9
ukpolitics
1201
uspolitics
875
uknews
79

In [8]:
print redditDics['unitedstates']


[{'Request': 'no, you wont get killed.. Where are you planning on traveling?'}, {'Request': "But so far as i know you won't be killed because you are Asian in most of the US. Maybe somewhere in the deep south?"}, {'Request': 'http://youtu.be/5hfYJsQAhl0 What part of my post in incorrect?'}, {'Request': 'Further, your point about babies is bizarre. Do children not have a right to be counted in the population?'}, {'Request': "Thank you. I've found this: https://answers.yahoo.com/question/index?qid=20131204014742AAirvOr\n\nApparently this claim was some kind of hoax"}, {'Request': 'Canada. Pueto rico? '}, {'Request': 'Idaho. When was the last time you heard of anything happening in Idaho?'}, {'Request': 'Thanks Chillhardy! What about Maryland?'}]

Note: some classifiers do not accept the sparse matrix `X`; for those, pass the dense array `X.toarray()` instead.


In [19]:
def documents2feature_vectors(documents):
    """Convert documents into a sparse feature matrix, one row per document.

    Every document is handed to the module-level ``vectorizer.features``; the
    values of the returned mapping are laid out in sorted-key order to form
    that document's row.

    Returns:
        A ``csr_matrix`` built from the stacked rows.
    """
    def as_row(document):
        # Sorting the keys fixes the column order of the row.
        feature_map = vectorizer.features(document)
        return [feature_map[name] for name in sorted(feature_map.keys())]

    dense_rows = np.asarray([as_row(document) for document in documents])
    return csr_matrix(dense_rows)

def getScores(sub):
    """Run the module-level classifier on a collection of documents.

    Args:
        sub: Documents accepted by ``documents2feature_vectors``.

    Returns:
        Whatever ``clf.predict`` returns for the feature matrix of ``sub``,
        or ``None`` when ``sub`` is empty.
    """
    if len(sub) == 0:
        # Nothing to score; empty input yields None.
        return None
    return clf.predict(documents2feature_vectors(sub))

MODEL_FILENAME = 'politeness-svm.p'
clf = pickle.load(open(MODEL_FILENAME,'rb'))

vectorizer = PolitenessFeatureVectorizer()
adminRequests = pickle.load(open('data/parsed/adminRequests_parsed.p'))

before, after =[], []
i = 0
for admin, reqs in adminRequests.iteritems():
    if i%50==0:
        print i
    if len(reqs['before'])!=0 and len(reqs['after'])!=0:
        req['beforescores'] = getScores(reqs['before'])
        before.append(req['beforescores'])
        req['afterscores'] = getScores(reqs['after'])
        after.append(req['afterscores'])
    i+=1


0
50
50
100
100
100
150
150
200
200
200
250
250
300
300
300
300

In [25]:
a_mean, b_mean= [], []
for a in after:
    a_mean.append(np.mean(a))
for b in before:
    b_mean.append(np.mean(b))

print "Average admin politeness:\n"
print "Before Elections -\t", np.mean(b_mean)
print "After Elections -\t", np.mean(a_mean)


Average admin politeness:

Before Elections -	0.609205391601
After Elections -	0.522061694135

In [ ]: