In [28]:
%matplotlib
In [2]:
import sys
import pandas as pa
import numpy as np
from primetext import primetext
import matplotlib.pyplot as plt
from nltk.stem.lancaster import LancasterStemmer
from autocorrect import spell
# Shared Lancaster stemmer instance; cleanData() calls st.stem() on each kept word.
st = LancasterStemmer()
In [3]:
# primetext object used by later cells for indexing and word lookups.
pt = primetext()
# Labelled comment data; the CSV is decoded as ISO-8859-1.
ytData = pa.read_csv("utubelabled.csv",encoding ='ISO-8859-1')
# Raw comment text (the 'troll' column is read later as the label).
comments = ytData['comment']
In [4]:
# NOTE(review): as written, both arguments are empty strings, so the text is
# left unchanged. The pattern was probably a character that did not survive
# export — confirm what was meant to be stripped.
comments = comments.str.replace('','')
In [5]:
def cleanData(records, labels, normalise=None):
    """Clean raw comments and return them together with their labels.

    A record is kept only when it is a string shorter than 200 characters and
    at least one of its space-separated words passes the word filter. A word
    passes when it is shorter than 12 characters and purely alphabetic; it is
    then lower-cased and handed to ``normalise``.

    Args:
        records: sequence of raw comments (list or pandas Series).
        labels: labels aligned position-by-position with ``records``.
        normalise: optional callable applied to each lower-cased word. When
            omitted, the word is spell-corrected with ``spell`` and stemmed
            with the notebook-level stemmer ``st`` (the original behaviour).

    Returns:
        (output, outputLabels): the cleaned sentences and the labels of the
        records that produced them, in input order.
    """
    if normalise is None:
        def normalise(word):
            return st.stem(spell(word))

    output = []
    outputLabels = []
    recordsToCheck = len(records)
    # zip pairs records and labels by position, so the result does not depend
    # on what kind of index a pandas Series of labels happens to carry.
    for recordsChecked, (sentence, label) in enumerate(zip(records, labels), start=1):
        sys.stdout.write("\rRecords cleaned : %i / %i" % (recordsChecked,recordsToCheck))
        # Missing comments arrive from pandas as float NaN; skip anything that
        # is not text instead of failing on len().
        if not isinstance(sentence, str) or len(sentence) >= 200:
            continue
        cleanWords = [
            normalise(word.lower())
            for word in sentence.split(' ')
            if len(word) < 12 and word.isalpha()
        ]
        if cleanWords:
            output.append(' '.join(cleanWords).strip())
            outputLabels.append(label)
    sys.stdout.write("\n")
    sys.stdout.flush()
    return output,outputLabels
In [6]:
# Clean every comment, keeping the matching 'troll' label for each survivor.
cleanedRecords, cleanedLabels = cleanData(comments, ytData['troll'])
In [7]:
# Hand the cleaned comments to primetext; later cells query it through
# pt.find() and pt.countInRecords().
# NOTE(review): what index() builds internally is defined in primetext, not here.
pt.index(cleanedRecords)
In [8]:
# One entry per key of primetext's indexed dictionary, with the value that
# pt.countInRecords() reports for that key alone
# (presumably the number of records containing it — confirm against primetext).
keyText = [key for key, _ in pt.indexedDictionary.items()]
keyCount = [pt.countInRecords([key]) for key in keyText]

# Keep the 50 keys with the largest counts and chart them.
s1 = pa.Series(keyCount, index=keyText)
sortedS1 = s1.sort_values(ascending=False)[:50]
sortedS1.plot.bar()
Out[8]:
In [9]:
# Square table over the top keys (rows and columns share the same labels),
# created already filled with integer zeros. Building it numeric from the
# start avoids depending on fillna() to downcast an all-NaN object-dtype
# frame, which newer pandas versions no longer do silently.
df = pa.DataFrame(0, index=sortedS1.index, columns=sortedS1.index, dtype="int64")
In [10]:
# Fill the table: cell (row, col) holds pt.countInRecords([col, row]).
names = sortedS1.index
colsdone = 0
for col in names:
    colsdone += 1
    sys.stdout.write("\rCols done : %i" % colsdone)
    for row in names:
        # A single .loc assignment writes into df itself; the chained form
        # df[col][row] = ... may update a temporary copy and leave df untouched.
        df.loc[row, col] = pt.countInRecords([col,row])
sys.stdout.write("\n")
sys.stdout.flush()
In [11]:
# Draw the table as a heat map, one square per (row, col) pair, no smoothing.
imgplot = plt.imshow(df,interpolation="nearest")
# Label both axes with the key names; x labels are rotated so they do not overlap.
plt.xticks( range(len(names)), names, rotation=90 )
plt.yticks( range(len(names)), names, rotation=0 )
# Colour scale for the cell values.
plt.colorbar()
plt.show()
In [12]:
# Labels of the comments that survived cleaning, as a Series.
myLabels = pa.Series(cleanedLabels)
# Sum of the labels (the troll count, assuming labels are 0/1 — confirm).
myLabels.sum()
Out[12]:
In [13]:
# Number of non-missing labels.
totalComments = myLabels.count()
# Sum of the labels (number of troll comments if labels are 0/1 — confirm).
totalTrollComments = myLabels.sum()
# Each class is weighted by the share of the *other* class:
# trollWeight is the non-troll fraction, nonTrollWeight the troll fraction.
trollWeight = (totalComments-totalTrollComments)/totalComments
nonTrollWeight = totalTrollComments/totalComments
trollWeight, nonTrollWeight
Out[13]:
In [14]:
# For each troll comment, add trollWeight to the score of every word it contains.
# For each non-troll comment, subtract nonTrollWeight from the score of every word it contains.
In [15]:
# Number of entries in primetext's cleaned dictionary (the words scored below).
len(pt.cleanedDictionary)
Out[15]:
In [16]:
# Worked example for the word 'the': sum the labels of the records selected
# by pt.find(['the']).
# presumably pt.find returns a boolean mask aligned with myLabels — confirm against primetext
totalFoundTrolling = myLabels[pt.find(['the'])].sum()
totalFoundTrolling
Out[16]:
In [17]:
# Everything pt.find(['the']) sums to, minus the troll-labelled part found above
# (i.e. matches not labelled troll, if find() yields one flag per record — confirm).
totalFoundNotTrolling = pt.find(['the']).sum() - totalFoundTrolling
totalFoundNotTrolling
Out[17]:
In [18]:
# Word score: troll matches weighted by trollWeight, minus non-troll matches
# weighted by nonTrollWeight.
trollScore = (totalFoundTrolling * trollWeight) - (totalFoundNotTrolling * nonTrollWeight)
trollScore
Out[18]:
In [19]:
# Score every word of the cleaned dictionary with the same formula as the
# single-word example: troll matches * trollWeight - other matches * nonTrollWeight.
trollScores = []
for word in pt.cleanedDictionary:
    # Look the word up once and reuse the result for both counts.
    matches = pt.find([word])
    totalFoundTrolling = myLabels[matches].sum()
    totalFoundNotTrolling = matches.sum() - totalFoundTrolling
    trollScore = (totalFoundTrolling * trollWeight) - (totalFoundNotTrolling * nonTrollWeight)
    trollScores.append(trollScore)
In [20]:
# Word -> score lookup table used by the scoring functions below.
s2 = pa.Series(trollScores,index= pt.cleanedDictionary)
# Ascending sort, so these are the 100 words with the LOWEST scores.
sortedPos = s2.sort_values(ascending= True)[:100]
sortedPos.plot.bar()
print(sortedPos)
In [29]:
# Same word -> score table as built for sortedPos (rebuilt here unchanged).
s2 = pa.Series(trollScores,index= pt.cleanedDictionary)
# Descending sort, so these are the 100 words with the HIGHEST scores.
sortedNeg = s2.sort_values(ascending= False)[:100]
sortedNeg.plot.bar()
Out[29]:
In [ ]:
In [22]:
def calTrollScore(comment, scores=None):
    """Sum the per-word troll scores of a comment.

    Args:
        comment: text to score; it is converted with str() and split on
            single spaces.
        scores: optional word -> score table (dict or pandas Series indexed
            by word). When omitted, the notebook-level Series ``s2`` is used,
            which is the original behaviour.

    Returns:
        The sum of ``scores[word]`` over the words found in ``scores``;
        0 when no word is found.
    """
    if scores is None:
        # Resolved at call time so the function always sees the current s2.
        scores = s2
    return sum(scores[word] for word in str(comment).split(' ') if word in scores)
In [23]:
# Show the first ten cleaned comments next to their troll score.
for idx in range(10):
    sample = cleanedRecords[idx]
    print(sample, calTrollScore(sample))
In [24]:
def predictTroll(comment,theta):
    """Return True when the comment's troll score is strictly above theta[0]."""
    score = calTrollScore(comment)
    threshold = theta[0]
    return score > threshold
def costTrollPredict(theta):
    """Predict troll / not-troll for every cleaned comment.

    Args:
        theta: sequence whose first element is the score threshold.

    Returns:
        A list with one predictTroll() result per entry of ``cleanedRecords``,
        in the same order.
    """
    # The predictions were previously built but never returned, so every
    # caller received None.
    return [predictTroll(c, theta) for c in cleanedRecords]
In [ ]:
In [25]:
# Predictions at a score threshold of 30.
pred = costTrollPredict([30])
# Ground truth as booleans: a label equal to 1.0 counts as troll.
trueVal = [label == 1.0 for label in cleanedLabels]
In [26]:
from sklearn.metrics import f1_score
In [27]:
# F1 of the current predictions (pred) against the ground-truth flags.
f1_score(trueVal, pred)
In [ ]:
# F1 score at every integer threshold from -30 up to 69.
thresholds = range(-30, 70)
vals = []
for i in thresholds:
    predt = costTrollPredict([i])
    cost = f1_score(trueVal, predt)
    vals.append(cost)
# Plot against the threshold values themselves so the x-axis reads as the
# threshold rather than the position in the list.
plt.plot(list(thresholds), vals)
In [ ]:
# Number of comments predicted as troll at threshold 50.
np.asarray(costTrollPredict([50])).sum()
In [ ]:
# Ground-truth flags as a NumPy array so they can be indexed with prediction masks.
output = np.asarray(trueVal)
In [ ]:
# Predictions at threshold 0 as a NumPy array, used as a mask below.
pred = np.asarray(costTrollPredict([0]))
In [ ]:
# Total number of comments whose ground-truth flag is True.
output.sum()
In [ ]:
# Sanity check: number of predictions produced.
len(pred)
In [ ]:
# Ground-truth trolls among the comments selected by pred (true positives).
output[pred].sum()
In [ ]:
# Display the raw prediction array.
pred
In [ ]:
def calculateEffect(predFunc, thresholds=None, truth=None):
    """Measure catch rate and false-positive rate over a range of thresholds.

    Args:
        predFunc: callable taking ``[threshold]`` and returning one boolean
            prediction per comment.
        thresholds: optional iterable of thresholds to evaluate. Defaults to
            ``range(-100, 100, 5)``, the sweep that was previously hard-coded.
        truth: optional ground-truth flags, one per comment. Defaults to the
            notebook-level array ``output``.

    Returns:
        (plotLog, plotFalse, plotx): for each threshold, the percentage of
        true trolls that were predicted troll, the percentage of non-trolls
        that were predicted troll, and the threshold itself.
    """
    if thresholds is None:
        thresholds = range(-100, 100, 5)
    if truth is None:
        truth = output
    isTroll = np.asarray(truth, dtype=bool)
    notTroll = ~isTroll
    # Percentage scale factors do not change between thresholds.
    trollPc = 100 / isTroll.sum()
    notTrollPc = 100 / notTroll.sum()

    plotLog = []
    plotFalse = []
    plotx = []
    for theta in thresholds:
        pred = np.asarray(predFunc([theta]))
        plotLog.append(trollPc * isTroll[pred].sum())
        plotFalse.append(notTrollPc * notTroll[pred].sum())
        plotx.append(theta)
    return plotLog,plotFalse,plotx
# Threshold sweep for the full-vocabulary predictor.
plotLog, plotFalse, plotx = calculateEffect(costTrollPredict)
In [ ]:
# Gap between the catch rate and the false-positive rate at each threshold.
plotDiff = [caught - false for caught, false in zip(plotLog, plotFalse)]
plt.title('Plot of % true positives against false positives (Trolls caught)')
# green: % of trolls caught, red: % false positives, blue: their difference
for series, colour in ((plotLog, 'g'), (plotFalse, 'r'), (plotDiff, 'b')):
    plt.plot(plotx, series, c=colour)
# Mark threshold 0 on the x-axis.
plt.axvline(0, linestyle='dashed')
In [ ]:
# Reduced word lists for the simplified scorer: the first 20 entries of each
# ranking (sortedNeg is sorted by descending score, sortedPos by ascending).
usedNeg = sortedNeg[:20]
usedPos = sortedPos[:20]
def calTrollScoreSim(comment, negScores=None, posScores=None):
    """Score a comment using only two reduced word -> score tables.

    Args:
        comment: text to score; it is converted with str() and split on
            single spaces.
        negScores: optional table checked first for each word. Defaults to
            the notebook-level ``usedNeg``.
        posScores: optional table checked only when the word is not in
            ``negScores``. Defaults to the notebook-level ``usedPos``.

    Returns:
        The sum of the scores of the words found in either table; 0 when
        none are found.
    """
    # Defaults are resolved at call time so that rebinding usedNeg / usedPos
    # in the notebook is picked up by the next call.
    if negScores is None:
        negScores = usedNeg
    if posScores is None:
        posScores = usedPos
    score = 0
    for word in str(comment).split(' '):
        if word in negScores:
            score += negScores[word]
        elif word in posScores:
            score += posScores[word]
    return score
In [ ]:
def predictTroll2(comment,theta):
    """Return True when the reduced-vocabulary score is strictly above theta[0]."""
    score = calTrollScoreSim(comment)
    threshold = theta[0]
    return score > threshold
def costTrollPredict2(theta):
    """Return one predictTroll2() result per entry of ``cleanedRecords``, in order."""
    return [predictTroll2(record, theta) for record in cleanedRecords]
In [ ]:
# Reduced-vocabulary predictions at threshold 0 (displays the whole list).
costTrollPredict2([0])
In [ ]:
# F1 score of the reduced-vocabulary predictor at thresholds 0, 5, ..., 135.
thresholds = range(0, 140, 5)
vals = []
for i in thresholds:
    predt = costTrollPredict2([i])
    cost = f1_score(trueVal, predt)
    vals.append(cost)
# Plot against the threshold values themselves so the x-axis reads as the
# threshold rather than the position in the list.
plt.plot(list(thresholds), vals)
In [ ]:
# Threshold sweep for the reduced-vocabulary predictor.
plotLog2, plotFalse2, plotx2 = calculateEffect(costTrollPredict2)
In [ ]:
# Gap between the catch rate and the false-positive rate at each threshold.
plotDiff2 = [caught - false for caught, false in zip(plotLog2, plotFalse2)]
# NOTE(review): the title says "top 100", but usedNeg / usedPos are assigned
# 20 entries each where they are defined — confirm which is intended.
plt.title('Plot of % true positives against false positives (Trolls caught) using top 100 polarizing')
# green: % of trolls caught, red: % false positives, blue: their difference
for series, colour in ((plotLog2, 'g'), (plotFalse2, 'r'), (plotDiff2, 'b')):
    plt.plot(plotx2, series, c=colour)
# Mark threshold 0 on the x-axis.
plt.axvline(0, linestyle='dashed')
In [ ]:
def calculateEffectAt0(predFunc):
    """Measure catch rate and false-positive rate at threshold 0 for growing word lists.

    For each size i in ``range(1, 200, 5)`` the notebook-level ``usedNeg`` and
    ``usedPos`` are temporarily set to the first i entries of ``sortedNeg`` and
    ``sortedPos``, and ``predFunc([0])`` is evaluated against ``output``.

    Args:
        predFunc: callable taking ``[threshold]`` and returning one boolean
            prediction per comment; it is expected to read ``usedNeg`` /
            ``usedPos`` when called.

    Returns:
        (plotLog, plotFalse, plotx): for each size, the percentage of true
        trolls predicted troll, the percentage of non-trolls predicted troll,
        and the size itself.
    """
    global usedNeg
    global usedPos
    # Remember the word lists that were active before the sweep so they can be
    # put back afterwards; otherwise every later scoring call would silently
    # run with the last (largest) lists.
    saved = {name: globals()[name] for name in ('usedNeg', 'usedPos') if name in globals()}

    notTroll = (output == False)
    # Percentage scale factors do not change between sizes.
    trollPc = 100 / output.sum()
    notTrollPc = 100 / notTroll.sum()

    plotLog = []
    plotFalse = []
    plotx = []
    try:
        # sortedNeg / sortedPos are cut to 100 entries where they are built,
        # so sizes above that select the same words again.
        for i in range(1,200,5):
            usedNeg = sortedNeg[:i]
            usedPos = sortedPos[:i]
            pred = np.asarray(predFunc([0]))
            plotLog.append(trollPc * output[pred].sum())
            plotFalse.append(notTrollPc * notTroll[pred].sum())
            plotx.append(i)
    finally:
        globals().update(saved)
    return plotLog,plotFalse,plotx
In [ ]:
# Word-list size sweep at threshold 0 for the reduced-vocabulary predictor.
plotLog3, plotFalse3, plotx3 = calculateEffectAt0(costTrollPredict2)
In [ ]:
# Gap between the catch rate and the false-positive rate at each word-list size.
plotDiff3 = [caught - false for caught, false in zip(plotLog3, plotFalse3)]
plt.title('Plot of % true positives against false positives (Trolls caught) using top x polarizing')
# green: % of trolls caught, red: % false positives, blue: their difference
for series, colour in ((plotLog3, 'g'), (plotFalse3, 'r'), (plotDiff3, 'b')):
    plt.plot(plotx3, series, c=colour)
# Dashed vertical line at x = 0.
plt.axvline(0, linestyle='dashed')
In [ ]: