notebook.community

Edit and run



In [68]:

    
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



In [69]:

    
%matplotlib inline



In [70]:

    
ssc = SQLContext(sc)
tweets = ssc.read.parquet("/tmp/tweet-corpus")
tweets.cache()









    Out[70]:





DataFrame[_1: array<string>, _2: double]



In [71]:

    
tweets.count()









    Out[71]:





5003



In [72]:

    
df = tweets.toPandas()
df.columns = ["tokens", "label"]
df.head()









    Out[72]:






  
    
      
      tokens
      label
    
  
  
    
      0
      [, the case, be the, would be, feel, we had, w...
      1
    
    
      1
      [, satu, tapii, kan, cuma, jam sama, handshak,...
      0
    
    
      2
      [, want, tattoo, want tattoo, i, tattoo , i want]
      0
    
    
      3
      [, maybe, yeah maybe, rawr yeah, rawr, know, k...
      0
    
    
      4
      [, ay grabe, grabe,  , usernam ay, usernam, ay]
      0

Preprocessing

Using `HashingTF`, a simple bag of words model



In [73]:

    
from pyspark.mllib.feature import IDF
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.classification import LabeledPoint



In [74]:

    
coeff = 3000
hashingTf = HashingTF(coeff)



In [75]:

    
#vectors = sc.parallelize([hashingTf.transform(tokens) for tokens in df._1])
#idf = IDF().fit(vectors)



In [76]:

    
def featurize(tokens): return hashingTf.transform(tokens)
#def tfidf(tokens): return idf.transform(tf(tokens))



In [77]:

    
df['lpoint'] = df.apply(lambda row: LabeledPoint(row['label'], featurize(row['tokens'])), axis=1)
df









    Out[77]:






  
    
      
      tokens
      label
      lpoint
    
  
  
    
      0
      [, the case, be the, would be, feel, we had, w...
      1
      (1.0,(3000,[0,79,91,189,241,344,446,656,762,96...
    
    
      1
      [, satu, tapii, kan, cuma, jam sama, handshak,...
      0
      (0.0,(3000,[0,13,74,203,239,250,256,292,318,51...
    
    
      2
      [, want, tattoo, want tattoo, i, tattoo , i want]
      0
      (0.0,(3000,[0,186,438,1424,1903,2185,2622],[1....
    
    
      3
      [, maybe, yeah maybe, rawr yeah, rawr, know, k...
      0
      (0.0,(3000,[0,116,437,675,1142,1216,1249,1465,...
    
    
      4
      [, ay grabe, grabe,  , usernam ay, usernam, ay]
      0
      (0.0,(3000,[0,960,1216,1321,1709,1833,2954],[1...
    
    
      5
      [, aqui, calor aqui, sempr, tem, calor, aqui s...
      0
      (0.0,(3000,[0,269,276,889,1216,1635,1774,2460,...
    
    
      6
      [, usernam usernam, want, him, i, same, i want...
      0
      (0.0,(3000,[0,254,438,615,695,725,782,1125,121...
    
    
      7
      [, 1990 an, an waktu, masih, ngetwitnya, ngetw...
      0
      (0.0,(3000,[0,11,69,138,252,291,480,584,865,95...
    
    
      8
      [, usernam cant, cant , usernam, cant]
      0
      (0.0,(3000,[0,1216,1767,2514,2691],[1.0,1.0,1....
    
    
      9
      []
      0
      (0.0,(3000,[0],[1.0]))
    
    
      10
      [, gigih cari, cari kat, googl, pink , kat, su...
      0
      (0.0,(3000,[0,23,96,376,587,849,1319,1350,1373...
    
    
      11
      [999 httpst, fifa, , usernam 999, httpstco4hpv...
      0
      (0.0,(3000,[0,317,452,1106,1216,1321,1375,1448...
    
    
      12
      [, realiz im, httpstcoevokfo3e1, oh, god, my g...
      0
      (0.0,(3000,[0,12,138,271,452,858,905,916,951,1...
    
    
      13
      [want to, , so i, adult thing, dai off, have, ...
      0
      (0.0,(3000,[0,75,133,229,438,542,738,916,967,1...
    
    
      14
      [usernam, , usernam ,  ]
      0
      (0.0,(3000,[0,317,1216,1321],[1.0,1.0,1.0,1.0]))
    
    
      15
      [down, , in, sad , slfl,  slfl, yall dont, die...
      0
      (0.0,(3000,[0,359,509,652,785,851,906,1037,105...
    
    
      16
      [, slow, theyr, but, realli slow, but theyr,  ...
      0
      (0.0,(3000,[0,113,487,712,755,1321,1513,1846,1...
    
    
      17
      [, sleep, i, sleep , i cant, cant sleep, cant]
      0
      (0.0,(3000,[0,588,901,1424,1683,2265,2514],[1....
    
    
      18
      [,  ]
      0
      (0.0,(3000,[0,1321],[1.0,1.0]))
    
    
      19
      [lo lo, , siap mati, hati lo, lo, jual hati, m...
      0
      (0.0,(3000,[0,9,94,135,149,207,258,375,1047,12...
    
    
      20
      [, why, it so, so, it, so nice, out with, hang...
      0
      (0.0,(3000,[0,229,237,596,693,857,916,1165,119...
    
    
      21
      [, laban, vid, ng, laban ng, shakeys , mai vid...
      0
      (0.0,(3000,[0,14,289,552,593,669,957,969,1023,...
    
    
      22
      [giggl, , hurt, lol, hurt , dimpl start, giggl...
      0
      (0.0,(3000,[0,12,25,97,651,727,868,1007,1493,1...
    
    
      23
      [est, liquid, negro, necesita, uno negro, ya, ...
      0
      (0.0,(3000,[69,249,284,321,331,344,476,494,734...
    
    
      24
      [, ok la, cantik, la aku, usernam sebab, sebab...
      0
      (0.0,(3000,[0,140,396,403,430,549,702,781,1038...
    
    
      25
      [, coi5pr88hgvv,  httpst,  , httpstcoi5pr88hgvv]
      0
      (0.0,(3000,[0,1321,1558,2133,2306],[1.0,1.0,1....
    
    
      26
      [, lg nyambung, usernam twenti, usernam naik, ...
      0
      (0.0,(3000,[0,26,131,188,705,805,873,1216,1321...
    
    
      27
      [,  httpst, emphasi on, open, on, emphasi, coz...
      0
      (0.0,(3000,[0,5,625,700,1092,1866,2289,2306,23...
    
    
      28
      [, in, si,  httpst, we in, boat, boat si, the ...
      0
      (0.0,(3000,[0,24,203,254,357,394,501,914,1034,...
    
    
      29
      [, httpstcowgd2rzvtbj,  httpst, cowgd2rzvtbj]
      0
      (0.0,(3000,[0,428,2299,2306],[1.0,1.0,1.0,1.0]))
    
    
      ...
      ...
      ...
      ...
    
    
      4973
      [corner , , good, a dutch, usernam we, it, a, ...
      1
      (1.0,(3000,[0,344,595,762,881,937,1165,1216,14...
    
    
      4974
      [newsjunkieswmo, , children, rowl sai, jk rowl...
      1
      (1.0,(3000,[0,77,99,221,307,862,886,991,1041,1...
    
    
      4975
      [close masih, , masih, sini, join, good, yadon...
      1
      (1.0,(3000,[0,291,536,579,695,762,777,836,1052...
    
    
      4976
      [, for, is, rest, no kathmandu, us, nepal, of ...
      1
      (1.0,(3000,[0,24,214,479,662,692,939,967,1015,...
    
    
      4977
      [colleenstaver wititud, , colleenstaver, coaip...
      1
      (1.0,(3000,[0,109,239,245,626,862,886,1514,158...
    
    
      4978
      [biglaa 3, please, , kita biglaa, 3 , soon, ht...
      1
      (1.0,(3000,[0,106,200,209,221,714,755,764,834,...
    
    
      4979
      [, for, hour displai, displai pictur, on a, my...
      1
      (1.0,(3000,[0,12,148,344,593,686,720,730,760,8...
    
    
      4980
      [, watch, otwolmanilainlov, rt,  pushawardsjad...
      1
      (1.0,(3000,[0,905,925,1030,1126,1186,1248,1321...
    
    
      4981
      [, mac, ani nativ, on an, hard, submiss, be tr...
      1
      (1.0,(3000,[0,12,33,67,105,189,231,341,420,439...
    
    
      4982
      [httpstcowygktqb0ap, tara, , kayo magsit, cavi...
      1
      (1.0,(3000,[0,329,341,440,452,534,747,807,922,...
    
    
      4983
      [wow, , thank you,  awesom, usernam wow, good,...
      1
      (1.0,(3000,[0,94,551,578,613,631,665,762,780,1...
    
    
      4984
      [for the, , for, follow usernam, insight, http...
      1
      (1.0,(3000,[0,44,125,145,180,344,542,613,731,8...
    
    
      4985
      [, support, children, palestinian, our un2opt,...
      1
      (1.0,(3000,[0,214,219,383,533,577,651,705,862,...
    
    
      4986
      [, he so, usernam he, lucki , xx, good, so, lu...
      1
      (1.0,(3000,[0,159,514,678,762,800,891,916,1216...
    
    
      4987
      [, usernam usernam, hello , job, youth, youth ...
      1
      (1.0,(3000,[0,190,496,829,872,945,1089,1175,12...
    
    
      4988
      [, so beauti, interview, tv interview, so,  ja...
      1
      (1.0,(3000,[0,28,194,287,759,799,916,965,1207,...
    
    
      4989
      [, awkward, glitch, hilari, do you, good, do, ...
      1
      (1.0,(3000,[0,42,455,525,681,732,762,952,959,1...
    
    
      4990
      [, jourdanjai, usernam jourdanjai, usernam, jo...
      1
      (1.0,(3000,[0,285,894,920,1216],[1.0,1.0,1.0,1...
    
    
      4991
      [e, , e publish, will tweet, is, thanks, tweet...
      1
      (1.0,(3000,[0,116,150,289,388,528,532,782,826,...
    
    
      4992
      [, wala po, usernam wala, ata, at ica, po ata,...
      1
      (1.0,(3000,[0,156,319,341,736,957,1216,1237,14...
    
    
      4993
      [, kakak, birthdai today,  , kakak birthdai, t...
      1
      (1.0,(3000,[0,102,950,1208,1321,2011,2711,2890...
    
    
      4994
      [please, , your, make up, mind please, up, ple...
      1
      (1.0,(3000,[0,213,374,581,613,684,906,1566,181...
    
    
      4995
      [, name, forgot it, thank you, your, it thank,...
      1
      (1.0,(3000,[0,10,176,200,285,306,452,581,596,6...
    
    
      4996
      [, next , what come, what, you know, httpstcow...
      1
      (1.0,(3000,[0,576,652,1167,1321,1749,1770,1818...
    
    
      4997
      [ where, , great time, had, usernam sound, a, ...
      1
      (1.0,(3000,[0,91,344,663,785,903,976,1090,1153...
    
    
      4998
      [, ar conceiv, conceiv, realli hot, children, ...
      1
      (1.0,(3000,[0,101,133,271,487,518,562,682,838,...
    
    
      4999
      [, usernam usernam, usernam me, it, me, me lik...
      1
      (1.0,(3000,[0,876,1165,1216,1406,1779,1829,203...
    
    
      5000
      [, for, in, follow us, pleas keep, touch, us u...
      1
      (1.0,(3000,[0,176,180,214,273,441,519,546,613,...
    
    
      5001
      [, read, great wednesday, insight, have, your,...
      1
      (1.0,(3000,[0,78,100,145,344,542,581,735,784,8...
    
    
      5002
      [, usernam parceli, brilliant mate, parceli us...
      1
      (1.0,(3000,[0,168,600,888,1216,1255,1554,2764,...
    
  

5003 rows × 3 columns

Create train/test split



In [78]:

    
# create boolean mask
msk = np.random.rand(len(df)) < 0.80
train = df[msk]
test = df[~msk]

Distribution of labels of training set



In [79]:

    
_ = sns.countplot(x="label", data=train)

Distribution of labels of test set



In [80]:

    
_ = sns.countplot(x="label", data=test)

Run PCA



In [81]:

    
from pyspark.mllib.feature import PCA



In [82]:

    
df









    Out[82]:






  
    
      
      tokens
      label
      lpoint
    
  
  
    
      0
      [, the case, be the, would be, feel, we had, w...
      1
      (1.0,(3000,[0,79,91,189,241,344,446,656,762,96...
    
    
      1
      [, satu, tapii, kan, cuma, jam sama, handshak,...
      0
      (0.0,(3000,[0,13,74,203,239,250,256,292,318,51...
    
    
      2
      [, want, tattoo, want tattoo, i, tattoo , i want]
      0
      (0.0,(3000,[0,186,438,1424,1903,2185,2622],[1....
    
    
      3
      [, maybe, yeah maybe, rawr yeah, rawr, know, k...
      0
      (0.0,(3000,[0,116,437,675,1142,1216,1249,1465,...
    
    
      4
      [, ay grabe, grabe,  , usernam ay, usernam, ay]
      0
      (0.0,(3000,[0,960,1216,1321,1709,1833,2954],[1...
    
    
      5
      [, aqui, calor aqui, sempr, tem, calor, aqui s...
      0
      (0.0,(3000,[0,269,276,889,1216,1635,1774,2460,...
    
    
      6
      [, usernam usernam, want, him, i, same, i want...
      0
      (0.0,(3000,[0,254,438,615,695,725,782,1125,121...
    
    
      7
      [, 1990 an, an waktu, masih, ngetwitnya, ngetw...
      0
      (0.0,(3000,[0,11,69,138,252,291,480,584,865,95...
    
    
      8
      [, usernam cant, cant , usernam, cant]
      0
      (0.0,(3000,[0,1216,1767,2514,2691],[1.0,1.0,1....
    
    
      9
      []
      0
      (0.0,(3000,[0],[1.0]))
    
    
      10
      [, gigih cari, cari kat, googl, pink , kat, su...
      0
      (0.0,(3000,[0,23,96,376,587,849,1319,1350,1373...
    
    
      11
      [999 httpst, fifa, , usernam 999, httpstco4hpv...
      0
      (0.0,(3000,[0,317,452,1106,1216,1321,1375,1448...
    
    
      12
      [, realiz im, httpstcoevokfo3e1, oh, god, my g...
      0
      (0.0,(3000,[0,12,138,271,452,858,905,916,951,1...
    
    
      13
      [want to, , so i, adult thing, dai off, have, ...
      0
      (0.0,(3000,[0,75,133,229,438,542,738,916,967,1...
    
    
      14
      [usernam, , usernam ,  ]
      0
      (0.0,(3000,[0,317,1216,1321],[1.0,1.0,1.0,1.0]))
    
    
      15
      [down, , in, sad , slfl,  slfl, yall dont, die...
      0
      (0.0,(3000,[0,359,509,652,785,851,906,1037,105...
    
    
      16
      [, slow, theyr, but, realli slow, but theyr,  ...
      0
      (0.0,(3000,[0,113,487,712,755,1321,1513,1846,1...
    
    
      17
      [, sleep, i, sleep , i cant, cant sleep, cant]
      0
      (0.0,(3000,[0,588,901,1424,1683,2265,2514],[1....
    
    
      18
      [,  ]
      0
      (0.0,(3000,[0,1321],[1.0,1.0]))
    
    
      19
      [lo lo, , siap mati, hati lo, lo, jual hati, m...
      0
      (0.0,(3000,[0,9,94,135,149,207,258,375,1047,12...
    
    
      20
      [, why, it so, so, it, so nice, out with, hang...
      0
      (0.0,(3000,[0,229,237,596,693,857,916,1165,119...
    
    
      21
      [, laban, vid, ng, laban ng, shakeys , mai vid...
      0
      (0.0,(3000,[0,14,289,552,593,669,957,969,1023,...
    
    
      22
      [giggl, , hurt, lol, hurt , dimpl start, giggl...
      0
      (0.0,(3000,[0,12,25,97,651,727,868,1007,1493,1...
    
    
      23
      [est, liquid, negro, necesita, uno negro, ya, ...
      0
      (0.0,(3000,[69,249,284,321,331,344,476,494,734...
    
    
      24
      [, ok la, cantik, la aku, usernam sebab, sebab...
      0
      (0.0,(3000,[0,140,396,403,430,549,702,781,1038...
    
    
      25
      [, coi5pr88hgvv,  httpst,  , httpstcoi5pr88hgvv]
      0
      (0.0,(3000,[0,1321,1558,2133,2306],[1.0,1.0,1....
    
    
      26
      [, lg nyambung, usernam twenti, usernam naik, ...
      0
      (0.0,(3000,[0,26,131,188,705,805,873,1216,1321...
    
    
      27
      [,  httpst, emphasi on, open, on, emphasi, coz...
      0
      (0.0,(3000,[0,5,625,700,1092,1866,2289,2306,23...
    
    
      28
      [, in, si,  httpst, we in, boat, boat si, the ...
      0
      (0.0,(3000,[0,24,203,254,357,394,501,914,1034,...
    
    
      29
      [, httpstcowgd2rzvtbj,  httpst, cowgd2rzvtbj]
      0
      (0.0,(3000,[0,428,2299,2306],[1.0,1.0,1.0,1.0]))
    
    
      ...
      ...
      ...
      ...
    
    
      4973
      [corner , , good, a dutch, usernam we, it, a, ...
      1
      (1.0,(3000,[0,344,595,762,881,937,1165,1216,14...
    
    
      4974
      [newsjunkieswmo, , children, rowl sai, jk rowl...
      1
      (1.0,(3000,[0,77,99,221,307,862,886,991,1041,1...
    
    
      4975
      [close masih, , masih, sini, join, good, yadon...
      1
      (1.0,(3000,[0,291,536,579,695,762,777,836,1052...
    
    
      4976
      [, for, is, rest, no kathmandu, us, nepal, of ...
      1
      (1.0,(3000,[0,24,214,479,662,692,939,967,1015,...
    
    
      4977
      [colleenstaver wititud, , colleenstaver, coaip...
      1
      (1.0,(3000,[0,109,239,245,626,862,886,1514,158...
    
    
      4978
      [biglaa 3, please, , kita biglaa, 3 , soon, ht...
      1
      (1.0,(3000,[0,106,200,209,221,714,755,764,834,...
    
    
      4979
      [, for, hour displai, displai pictur, on a, my...
      1
      (1.0,(3000,[0,12,148,344,593,686,720,730,760,8...
    
    
      4980
      [, watch, otwolmanilainlov, rt,  pushawardsjad...
      1
      (1.0,(3000,[0,905,925,1030,1126,1186,1248,1321...
    
    
      4981
      [, mac, ani nativ, on an, hard, submiss, be tr...
      1
      (1.0,(3000,[0,12,33,67,105,189,231,341,420,439...
    
    
      4982
      [httpstcowygktqb0ap, tara, , kayo magsit, cavi...
      1
      (1.0,(3000,[0,329,341,440,452,534,747,807,922,...
    
    
      4983
      [wow, , thank you,  awesom, usernam wow, good,...
      1
      (1.0,(3000,[0,94,551,578,613,631,665,762,780,1...
    
    
      4984
      [for the, , for, follow usernam, insight, http...
      1
      (1.0,(3000,[0,44,125,145,180,344,542,613,731,8...
    
    
      4985
      [, support, children, palestinian, our un2opt,...
      1
      (1.0,(3000,[0,214,219,383,533,577,651,705,862,...
    
    
      4986
      [, he so, usernam he, lucki , xx, good, so, lu...
      1
      (1.0,(3000,[0,159,514,678,762,800,891,916,1216...
    
    
      4987
      [, usernam usernam, hello , job, youth, youth ...
      1
      (1.0,(3000,[0,190,496,829,872,945,1089,1175,12...
    
    
      4988
      [, so beauti, interview, tv interview, so,  ja...
      1
      (1.0,(3000,[0,28,194,287,759,799,916,965,1207,...
    
    
      4989
      [, awkward, glitch, hilari, do you, good, do, ...
      1
      (1.0,(3000,[0,42,455,525,681,732,762,952,959,1...
    
    
      4990
      [, jourdanjai, usernam jourdanjai, usernam, jo...
      1
      (1.0,(3000,[0,285,894,920,1216],[1.0,1.0,1.0,1...
    
    
      4991
      [e, , e publish, will tweet, is, thanks, tweet...
      1
      (1.0,(3000,[0,116,150,289,388,528,532,782,826,...
    
    
      4992
      [, wala po, usernam wala, ata, at ica, po ata,...
      1
      (1.0,(3000,[0,156,319,341,736,957,1216,1237,14...
    
    
      4993
      [, kakak, birthdai today,  , kakak birthdai, t...
      1
      (1.0,(3000,[0,102,950,1208,1321,2011,2711,2890...
    
    
      4994
      [please, , your, make up, mind please, up, ple...
      1
      (1.0,(3000,[0,213,374,581,613,684,906,1566,181...
    
    
      4995
      [, name, forgot it, thank you, your, it thank,...
      1
      (1.0,(3000,[0,10,176,200,285,306,452,581,596,6...
    
    
      4996
      [, next , what come, what, you know, httpstcow...
      1
      (1.0,(3000,[0,576,652,1167,1321,1749,1770,1818...
    
    
      4997
      [ where, , great time, had, usernam sound, a, ...
      1
      (1.0,(3000,[0,91,344,663,785,903,976,1090,1153...
    
    
      4998
      [, ar conceiv, conceiv, realli hot, children, ...
      1
      (1.0,(3000,[0,101,133,271,487,518,562,682,838,...
    
    
      4999
      [, usernam usernam, usernam me, it, me, me lik...
      1
      (1.0,(3000,[0,876,1165,1216,1406,1779,1829,203...
    
    
      5000
      [, for, in, follow us, pleas keep, touch, us u...
      1
      (1.0,(3000,[0,176,180,214,273,441,519,546,613,...
    
    
      5001
      [, read, great wednesday, insight, have, your,...
      1
      (1.0,(3000,[0,78,100,145,344,542,581,735,784,8...
    
    
      5002
      [, usernam parceli, brilliant mate, parceli us...
      1
      (1.0,(3000,[0,168,600,888,1216,1255,1554,2764,...
    
  

5003 rows × 3 columns



In [83]:

    
#lpoints = df['lpoint']
#rdd = sc.parallelize(lpoints.map(lambda point: point.features).tolist())
#pca = PCA(3).fit(rdd)
#df['pca'] = df.apply(lambda row: pca.transform(row['lpoint'].features), axis=1)
#df['pca_0'] = df.apply(lambda row: row['pca'][0], axis=1)
#df['pca_1'] = df.apply(lambda row: row['pca'][1], axis=1)
#viz = df[['label', 'pca_0', 'pca_1']]



In [84]:

    
#_ = sns.pairplot(viz, vars=['pca_0', 'pca_1'], hue="label", size=6.0)

Train a Logistic Regression classifier



In [85]:

    
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.classification import LabeledPoint

Let's add a new column with LabeledPoints consisting of TF-IDF vectors.



In [86]:

    
train_rdd = sc.parallelize(train.lpoint)

Now train the logistic regression estimator.



In [87]:

    
lr = LogisticRegressionWithSGD.train(train_rdd, initialWeights=Vectors.zeros(coeff), iterations=200)

Test



In [88]:

    
test['pred'] = test.apply(lambda row: lr.predict(row['lpoint'].features), axis=1)









    



/Users/rene/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [89]:

    
test









    Out[89]:






  
    
      
      tokens
      label
      lpoint
      pred
    
  
  
    
      8
      [, usernam cant, cant , usernam, cant]
      0
      (0.0,(3000,[0,1216,1767,2514,2691],[1.0,1.0,1....
      0
    
    
      10
      [, gigih cari, cari kat, googl, pink , kat, su...
      0
      (0.0,(3000,[0,23,96,376,587,849,1319,1350,1373...
      0
    
    
      27
      [,  httpst, emphasi on, open, on, emphasi, coz...
      0
      (0.0,(3000,[0,5,625,700,1092,1866,2289,2306,23...
      0
    
    
      41
      [want to, , for, art, cant stop, have heart, m...
      0
      (0.0,(3000,[0,12,25,56,92,191,290,356,438,542,...
      0
    
    
      44
      [, feel, person i, feel comfort, talk to, coul...
      0
      (0.0,(3000,[0,214,307,317,389,432,558,585,608,...
      0
    
    
      46
      [saya haru, , saya, lagi, bro, gimana lagi, la...
      0
      (0.0,(3000,[0,230,583,672,693,781,1050,1244,17...
      0
    
    
      47
      [, usernam disgusting, sad , have, she must, j...
      0
      (0.0,(3000,[0,138,542,681,688,792,916,1216,126...
      0
    
    
      59
      [incident, allah maaf, le, , ya, in, sad incid...
      0
      (0.0,(3000,[0,17,22,210,259,278,441,447,502,90...
      0
    
    
      62
      [, dean,  httpst, care, coxuqjh2lhvm, take, ca...
      0
      (0.0,(3000,[0,289,1103,1131,1196,1549,1572,170...
      0
    
    
      64
      [, tengo que, me tengo, igual, ltima creo, use...
      0
      (0.0,(3000,[0,332,655,732,812,839,857,884,949,...
      1
    
    
      82
      [, for, sorri, im sorri, sorri for, stand berg...
      0
      (0.0,(3000,[0,228,621,693,845,875,985,1015,117...
      0
    
    
      90
      [, httpstcofa31skmwuf,  httpst,  , cofa31skmwuf]
      0
      (0.0,(3000,[0,5,1321,1826,2306],[1.0,1.0,1.0,1...
      0
    
    
      93
      [,  puta, po ako, bigyan,  at, guys, papakatib...
      0
      (0.0,(3000,[0,64,172,283,329,341,372,434,442,5...
      0
    
    
      97
      [, jealou, okai pahal, usernam prihatin, okai,...
      0
      (0.0,(3000,[0,636,859,868,1063,1161,1170,1216,...
      1
    
    
      99
      [, fuck, todai, earli todai, todai , it, earli...
      0
      (0.0,(3000,[0,86,601,1165,1194,1443,1555,1584,...
      0
    
    
      106
      [cokunguc6m2x, , httpstcokunguc6m2x, rin,  htt...
      0
      (0.0,(3000,[0,3,422,1216,1356,1576,1839,1973,2...
      0
    
    
      115
      [want to, , chang, want, chang , idk, usernam ...
      0
      (0.0,(3000,[0,438,1064,1186,1216,1259,1424,190...
      0
    
    
      117
      [, nakabog , eh nakabog, nakabog, usernam naka...
      0
      (0.0,(3000,[0,1055,1198,1216,1590,2509,2539,25...
      0
    
    
      118
      [usernam wong, , terbentuk, terbentuk dari, te...
      0
      (0.0,(3000,[0,445,610,900,931,1216,1485,1586,1...
      1
    
    
      120
      [usernam, , usernam ,  ]
      0
      (0.0,(3000,[0,317,1216,1321],[1.0,1.0,1.0,1.0]))
      1
    
    
      123
      [follow mebtw, niall, your, actual mean, your ...
      0
      (0.0,(3000,[16,44,175,180,312,404,405,581,785,...
      0
    
    
      126
      [naman nyan, aldub16thweeksari httpst, colgtcs...
      0
      (0.0,(3000,[37,227,284,692,764,851,1017,1129,1...
      0
    
    
      138
      [usernam, , usernam ]
      0
      (0.0,(3000,[0,317,1216],[1.0,1.0,1.0]))
      0
    
    
      141
      [, ir , quiero ir, ir, quiero]
      0
      (0.0,(3000,[0,159,1163,1419,1448],[1.0,1.0,1.0...
      0
    
    
      145
      [ur okai, , and feel, feel, soon, hope ur, bet...
      0
      (0.0,(3000,[0,160,176,200,215,221,317,565,609,...
      0
    
    
      155
      [usernam wong, wong httpst, unikernels, have, ...
      0
      (0.0,(3000,[248,419,542,610,685,817,824,979,11...
      1
    
    
      163
      [scare, , win, is, scare me, win a, uk, what, ...
      0
      (0.0,(3000,[0,56,90,200,222,344,364,474,596,67...
      1
    
    
      171
      [, pinapakita, is, parang, ako kc, c meng, pin...
      0
      (0.0,(3000,[0,260,332,333,341,423,440,460,487,...
      0
    
    
      178
      [masih, fav, httpstcou7g2gdbqo7, fav httpst, m...
      0
      (0.0,(3000,[291,1390,1420,1482,2621,2908],[1.0...
      1
    
    
      183
      [, jakarta, kan, usernam azahra, azahra, azahr...
      0
      (0.0,(3000,[0,8,13,137,209,231,513,536,573,663...
      0
    
    
      ...
      ...
      ...
      ...
      ...
    
    
      4871
      [, for, have, wednesdai , usernam thank, a,  ,...
      1
      (1.0,(3000,[0,180,344,542,613,1090,1153,1166,1...
      1
    
    
      4883
      [, usernam you, usernam usernam, you know, goo...
      1
      (1.0,(3000,[0,762,1165,1216,1406,1749,1882,196...
      1
    
    
      4884
      [, for, oh, usernam usernam, that must, must, ...
      1
      (1.0,(3000,[0,180,242,283,344,418,613,858,979,...
      1
    
    
      4885
      [, children, cofpwdvgpyoq,  pointluck1, guides...
      1
      (1.0,(3000,[0,83,262,516,605,862,886,1006,1034...
      0
    
    
      4886
      [, have, your, usernam hi, seen, and you, emai...
      1
      (1.0,(3000,[0,75,147,176,344,542,581,664,742,8...
      1
    
    
      4887
      [n g, i n, c h, , s, s , n, h a, i s, a, a r, ...
      1
      (1.0,(3000,[0,121,266,344,482,552,553,859,1274...
      0
    
    
      4889
      [, want, anyth, anyth thei, ship anyth, peopl ...
      1
      (1.0,(3000,[0,238,278,438,593,746,884,1019,113...
      0
    
    
      4894
      [, good, usernam buda, trjan , buda p, buda, p...
      1
      (1.0,(3000,[0,98,121,488,523,616,762,1216,1858...
      1
    
    
      4897
      [, a, i need, rt, i,  , need, need a, usernam ...
      1
      (1.0,(3000,[0,80,129,344,443,1111,1216,1321,13...
      0
    
    
      4903
      [, usernam handl,  , handl usernam, usernam , ...
      1
      (1.0,(3000,[0,317,936,1216,1255,1321,1479],[1....
      1
    
    
      4905
      [, nettl sting, usernam usernam, xx, nettl,  ,...
      1
      (1.0,(3000,[0,35,76,262,514,891,1216,1321,1655...
      1
    
    
      4907
      [, usernam usernam, hello , job, youth, youth ...
      1
      (1.0,(3000,[0,190,496,829,872,945,1089,1175,12...
      1
    
    
      4912
      [httpstcocs30mrjoik, for the, , for, follow us...
      1
      (1.0,(3000,[0,125,145,180,344,542,613,662,731,...
      1
    
    
      4913
      [, is, cute, so, aww, that, usernam aww, cute ...
      1
      (1.0,(3000,[0,322,916,1162,1216,1239,1312,1316...
      0
    
    
      4914
      [, to connect, usernam usernam, connect, conne...
      1
      (1.0,(3000,[0,125,613,1112,1134,1216,1656,1937...
      1
    
    
      4924
      [, appal, comment on, as a, a, usernam ill, as...
      1
      (1.0,(3000,[0,12,72,176,198,218,253,327,344,45...
      1
    
    
      4929
      [1to, , becom 1to, have, on a, will becom, goo...
      1
      (1.0,(3000,[0,3,61,249,273,344,528,542,619,762...
      1
    
    
      4932
      [, ar conceiv, conceiv, realli hot, children, ...
      1
      (1.0,(3000,[0,101,133,271,442,487,518,562,565,...
      1
    
    
      4940
      [, for, smile , for make, smile, usernam https...
      1
      (1.0,(3000,[0,61,180,613,906,1130,1216,1315,13...
      1
    
    
      4942
      [, usernam crypto, good, at least, so, you at,...
      1
      (1.0,(3000,[0,597,638,652,692,916,917,957,1006...
      1
    
    
      4947
      [,  sapn, saro, usernam usernam, but, ne , ms ...
      1
      (1.0,(3000,[0,6,121,127,241,283,309,327,530,64...
      1
    
    
      4949
      [, love it, it,  , still love, love, still, us...
      1
      (1.0,(3000,[0,432,868,1165,1216,1321,1406,1668...
      1
    
    
      4957
      [a job, for the, , for,  great, see a, austin,...
      1
      (1.0,(3000,[0,3,31,106,293,344,381,524,528,613...
      1
    
    
      4962
      [, men , good, just, men, i just, i, love, jus...
      1
      (1.0,(3000,[0,271,299,762,924,1424,1495,1668,2...
      1
    
    
      4965
      [, for, insight, have, cov01enzzvd1, ne money,...
      1
      (1.0,(3000,[0,145,180,344,542,613,688,1021,109...
      1
    
    
      4975
      [close masih, , masih, sini, join, good, yadon...
      1
      (1.0,(3000,[0,291,536,579,695,762,777,836,1052...
      1
    
    
      4977
      [colleenstaver wititud, , colleenstaver, coaip...
      1
      (1.0,(3000,[0,109,239,245,626,862,886,1514,158...
      1
    
    
      4985
      [, support, children, palestinian, our un2opt,...
      1
      (1.0,(3000,[0,214,219,383,533,577,651,705,862,...
      1
    
    
      4990
      [, jourdanjai, usernam jourdanjai, usernam, jo...
      1
      (1.0,(3000,[0,285,894,920,1216],[1.0,1.0,1.0,1...
      0
    
    
      4998
      [, ar conceiv, conceiv, realli hot, children, ...
      1
      (1.0,(3000,[0,101,133,271,487,518,562,682,838,...
      1
    
  

989 rows × 4 columns

Metrics



In [90]:

    
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics



In [91]:

    
scoreAndLabels = test.apply(lambda row: (float(row['pred']), row['lpoint'].label), axis=1)
scoreAndLabels = sc.parallelize(scoreAndLabels)



In [92]:

    
binary_metrics = BinaryClassificationMetrics(scoreAndLabels)



In [93]:

    
binary_metrics.areaUnderPR









    Out[93]:





0.7816544946099755



In [94]:

    
binary_metrics.areaUnderROC









    Out[94]:





0.7012045925089404



In [95]:

    
mult_metrics = MulticlassMetrics(scoreAndLabels)



In [96]:

    
mult_metrics.precision()









    Out[96]:





0.7007077856420627



In [97]:

    
mult_metrics.recall()









    Out[97]:





0.7007077856420627



In [98]:

    
max(test.label.mean(), 1 - test.label.mean())









    Out[98]:





0.5116279069767442

Cross validation



In [99]:

    
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF

lr = LogisticRegression()
tf = HashingTF(inputCol="tokens", outputCol="features")
pipeline = Pipeline(stages=[tf, lr])



In [100]:

    
pdf = ssc.createDataFrame(df)



In [101]:

    
dataset = sqlContext.createDataFrame(
[(point.features, point.label) for point in df['lpoint']],
["features", "label"])
ptrain = ssc.createDataFrame(train)
ptest = ssc.createDataFrame(test[['tokens','label','lpoint']])



In [102]:

    
model = pipeline.fit(ptrain)



In [103]:

    
prediction = model.transform(ptest)



In [104]:

    
result = prediction.select("tokens", "label", "prediction").toPandas()



In [106]:

    
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator



In [1]:

    
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)

cvModel = cv.fit(dataset)
evaluator.evaluate(cvModel.transform(dataset))
type(cvModel)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-1310be5ef033> in <module>()
----> 1 grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
      2 evaluator = BinaryClassificationEvaluator()
      3 cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
      4 
      5 cvModel = cv.fit(dataset)

NameError: name 'ParamGridBuilder' is not defined



In [117]:

    
weights = cvModel.bestModel.weights

Use initial weights of best model



In [128]:

    
lr_new = LogisticRegressionWithSGD.train(train_rdd, initialWeights=weights, iterations=200)
test['pred_new'] = test.apply(lambda row: lr_new.predict(row['lpoint'].features), axis=1)









    



/Users/rene/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app



In [129]:

    
scoreAndLabels = test.apply(lambda row: (float(row['pred_new']), row['lpoint'].label), axis=1)
scoreAndLabels = sc.parallelize(scoreAndLabels)



In [130]:

    
binary_metrics = BinaryClassificationMetrics(scoreAndLabels)



In [131]:

    
binary_metrics.areaUnderROC









    Out[131]:





0.8056182947487296



In [132]:

    
binary_metrics.areaUnderPR









    Out[132]:





0.8613475209275279



In [133]:

    
mult_metrics = MulticlassMetrics(scoreAndLabels)



In [134]:

    
mult_metrics.precision()









    Out[134]:





0.8048533872598584



In [ ]:

	tokens	label
0	[, the case, be the, would be, feel, we had, w...	1
1	[, satu, tapii, kan, cuma, jam sama, handshak,...	0
2	[, want, tattoo, want tattoo, i, tattoo , i want]	0
3	[, maybe, yeah maybe, rawr yeah, rawr, know, k...	0
4	[, ay grabe, grabe, , usernam ay, usernam, ay]	0

	tokens	label	lpoint
0	[, the case, be the, would be, feel, we had, w...	1	(1.0,(3000,[0,79,91,189,241,344,446,656,762,96...
1	[, satu, tapii, kan, cuma, jam sama, handshak,...	0	(0.0,(3000,[0,13,74,203,239,250,256,292,318,51...
2	[, want, tattoo, want tattoo, i, tattoo , i want]	0	(0.0,(3000,[0,186,438,1424,1903,2185,2622],[1....
3	[, maybe, yeah maybe, rawr yeah, rawr, know, k...	0	(0.0,(3000,[0,116,437,675,1142,1216,1249,1465,...
4	[, ay grabe, grabe, , usernam ay, usernam, ay]	0	(0.0,(3000,[0,960,1216,1321,1709,1833,2954],[1...
5	[, aqui, calor aqui, sempr, tem, calor, aqui s...	0	(0.0,(3000,[0,269,276,889,1216,1635,1774,2460,...
6	[, usernam usernam, want, him, i, same, i want...	0	(0.0,(3000,[0,254,438,615,695,725,782,1125,121...
7	[, 1990 an, an waktu, masih, ngetwitnya, ngetw...	0	(0.0,(3000,[0,11,69,138,252,291,480,584,865,95...
8	[, usernam cant, cant , usernam, cant]	0	(0.0,(3000,[0,1216,1767,2514,2691],[1.0,1.0,1....
9	[]	0	(0.0,(3000,[0],[1.0]))
10	[, gigih cari, cari kat, googl, pink , kat, su...	0	(0.0,(3000,[0,23,96,376,587,849,1319,1350,1373...
11	[999 httpst, fifa, , usernam 999, httpstco4hpv...	0	(0.0,(3000,[0,317,452,1106,1216,1321,1375,1448...
12	[, realiz im, httpstcoevokfo3e1, oh, god, my g...	0	(0.0,(3000,[0,12,138,271,452,858,905,916,951,1...
13	[want to, , so i, adult thing, dai off, have, ...	0	(0.0,(3000,[0,75,133,229,438,542,738,916,967,1...
14	[usernam, , usernam , ]	0	(0.0,(3000,[0,317,1216,1321],[1.0,1.0,1.0,1.0]))
15	[down, , in, sad , slfl, slfl, yall dont, die...	0	(0.0,(3000,[0,359,509,652,785,851,906,1037,105...
16	[, slow, theyr, but, realli slow, but theyr, ...	0	(0.0,(3000,[0,113,487,712,755,1321,1513,1846,1...
17	[, sleep, i, sleep , i cant, cant sleep, cant]	0	(0.0,(3000,[0,588,901,1424,1683,2265,2514],[1....
18	[, ]	0	(0.0,(3000,[0,1321],[1.0,1.0]))
19	[lo lo, , siap mati, hati lo, lo, jual hati, m...	0	(0.0,(3000,[0,9,94,135,149,207,258,375,1047,12...
20	[, why, it so, so, it, so nice, out with, hang...	0	(0.0,(3000,[0,229,237,596,693,857,916,1165,119...
21	[, laban, vid, ng, laban ng, shakeys , mai vid...	0	(0.0,(3000,[0,14,289,552,593,669,957,969,1023,...
22	[giggl, , hurt, lol, hurt , dimpl start, giggl...	0	(0.0,(3000,[0,12,25,97,651,727,868,1007,1493,1...
23	[est, liquid, negro, necesita, uno negro, ya, ...	0	(0.0,(3000,[69,249,284,321,331,344,476,494,734...
24	[, ok la, cantik, la aku, usernam sebab, sebab...	0	(0.0,(3000,[0,140,396,403,430,549,702,781,1038...
25	[, coi5pr88hgvv, httpst, , httpstcoi5pr88hgvv]	0	(0.0,(3000,[0,1321,1558,2133,2306],[1.0,1.0,1....
26	[, lg nyambung, usernam twenti, usernam naik, ...	0	(0.0,(3000,[0,26,131,188,705,805,873,1216,1321...
27	[, httpst, emphasi on, open, on, emphasi, coz...	0	(0.0,(3000,[0,5,625,700,1092,1866,2289,2306,23...
28	[, in, si, httpst, we in, boat, boat si, the ...	0	(0.0,(3000,[0,24,203,254,357,394,501,914,1034,...
29	[, httpstcowgd2rzvtbj, httpst, cowgd2rzvtbj]	0	(0.0,(3000,[0,428,2299,2306],[1.0,1.0,1.0,1.0]))
...	...	...	...
4973	[corner , , good, a dutch, usernam we, it, a, ...	1	(1.0,(3000,[0,344,595,762,881,937,1165,1216,14...
4974	[newsjunkieswmo, , children, rowl sai, jk rowl...	1	(1.0,(3000,[0,77,99,221,307,862,886,991,1041,1...
4975	[close masih, , masih, sini, join, good, yadon...	1	(1.0,(3000,[0,291,536,579,695,762,777,836,1052...
4976	[, for, is, rest, no kathmandu, us, nepal, of ...	1	(1.0,(3000,[0,24,214,479,662,692,939,967,1015,...
4977	[colleenstaver wititud, , colleenstaver, coaip...	1	(1.0,(3000,[0,109,239,245,626,862,886,1514,158...
4978	[biglaa 3, please, , kita biglaa, 3 , soon, ht...	1	(1.0,(3000,[0,106,200,209,221,714,755,764,834,...
4979	[, for, hour displai, displai pictur, on a, my...	1	(1.0,(3000,[0,12,148,344,593,686,720,730,760,8...
4980	[, watch, otwolmanilainlov, rt, pushawardsjad...	1	(1.0,(3000,[0,905,925,1030,1126,1186,1248,1321...
4981	[, mac, ani nativ, on an, hard, submiss, be tr...	1	(1.0,(3000,[0,12,33,67,105,189,231,341,420,439...
4982	[httpstcowygktqb0ap, tara, , kayo magsit, cavi...	1	(1.0,(3000,[0,329,341,440,452,534,747,807,922,...
4983	[wow, , thank you, awesom, usernam wow, good,...	1	(1.0,(3000,[0,94,551,578,613,631,665,762,780,1...
4984	[for the, , for, follow usernam, insight, http...	1	(1.0,(3000,[0,44,125,145,180,344,542,613,731,8...
4985	[, support, children, palestinian, our un2opt,...	1	(1.0,(3000,[0,214,219,383,533,577,651,705,862,...
4986	[, he so, usernam he, lucki , xx, good, so, lu...	1	(1.0,(3000,[0,159,514,678,762,800,891,916,1216...
4987	[, usernam usernam, hello , job, youth, youth ...	1	(1.0,(3000,[0,190,496,829,872,945,1089,1175,12...
4988	[, so beauti, interview, tv interview, so, ja...	1	(1.0,(3000,[0,28,194,287,759,799,916,965,1207,...
4989	[, awkward, glitch, hilari, do you, good, do, ...	1	(1.0,(3000,[0,42,455,525,681,732,762,952,959,1...
4990	[, jourdanjai, usernam jourdanjai, usernam, jo...	1	(1.0,(3000,[0,285,894,920,1216],[1.0,1.0,1.0,1...
4991	[e, , e publish, will tweet, is, thanks, tweet...	1	(1.0,(3000,[0,116,150,289,388,528,532,782,826,...
4992	[, wala po, usernam wala, ata, at ica, po ata,...	1	(1.0,(3000,[0,156,319,341,736,957,1216,1237,14...
4993	[, kakak, birthdai today, , kakak birthdai, t...	1	(1.0,(3000,[0,102,950,1208,1321,2011,2711,2890...
4994	[please, , your, make up, mind please, up, ple...	1	(1.0,(3000,[0,213,374,581,613,684,906,1566,181...
4995	[, name, forgot it, thank you, your, it thank,...	1	(1.0,(3000,[0,10,176,200,285,306,452,581,596,6...
4996	[, next , what come, what, you know, httpstcow...	1	(1.0,(3000,[0,576,652,1167,1321,1749,1770,1818...
4997	[ where, , great time, had, usernam sound, a, ...	1	(1.0,(3000,[0,91,344,663,785,903,976,1090,1153...
4998	[, ar conceiv, conceiv, realli hot, children, ...	1	(1.0,(3000,[0,101,133,271,487,518,562,682,838,...
4999	[, usernam usernam, usernam me, it, me, me lik...	1	(1.0,(3000,[0,876,1165,1216,1406,1779,1829,203...
5000	[, for, in, follow us, pleas keep, touch, us u...	1	(1.0,(3000,[0,176,180,214,273,441,519,546,613,...
5001	[, read, great wednesday, insight, have, your,...	1	(1.0,(3000,[0,78,100,145,344,542,581,735,784,8...
5002	[, usernam parceli, brilliant mate, parceli us...	1	(1.0,(3000,[0,168,600,888,1216,1255,1554,2764,...

Preprocessing

Using HashingTF, a simple bag of words model

Distribution of labels of training set

Distribution of labels of test set

Run PCA

Train a Logistic Regression classifier

Test

Metrics

Cross validation

Use initial weights of best model

Using `HashingTF`, a simple bag of words model