In [68]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [69]:
%matplotlib inline

In [70]:
ssc = SQLContext(sc)
tweets = ssc.read.parquet("/tmp/tweet-corpus")
tweets.cache()


Out[70]:
DataFrame[_1: array<string>, _2: double]

In [71]:
tweets.count()


Out[71]:
5003

In [72]:
df = tweets.toPandas()
df.columns = ["tokens", "label"]
df.head()


Out[72]:
tokens label
0 [, the case, be the, would be, feel, we had, w... 1
1 [, satu, tapii, kan, cuma, jam sama, handshak,... 0
2 [, want, tattoo, want tattoo, i, tattoo , i want] 0
3 [, maybe, yeah maybe, rawr yeah, rawr, know, k... 0
4 [, ay grabe, grabe, , usernam ay, usernam, ay] 0

Preprocessing

Using HashingTF, a simple bag of words model


In [73]:
from pyspark.mllib.feature import IDF
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.classification import LabeledPoint

In [74]:
coeff = 3000
hashingTf = HashingTF(coeff)

In [75]:
#vectors = sc.parallelize([hashingTf.transform(tokens) for tokens in df._1])
#idf = IDF().fit(vectors)

In [76]:
def featurize(tokens): return hashingTf.transform(tokens)
#def tfidf(tokens): return idf.transform(tf(tokens))

In [77]:
df['lpoint'] = df.apply(lambda row: LabeledPoint(row['label'], featurize(row['tokens'])), axis=1)
df


Out[77]:
tokens label lpoint
0 [, the case, be the, would be, feel, we had, w... 1 (1.0,(3000,[0,79,91,189,241,344,446,656,762,96...
1 [, satu, tapii, kan, cuma, jam sama, handshak,... 0 (0.0,(3000,[0,13,74,203,239,250,256,292,318,51...
2 [, want, tattoo, want tattoo, i, tattoo , i want] 0 (0.0,(3000,[0,186,438,1424,1903,2185,2622],[1....
3 [, maybe, yeah maybe, rawr yeah, rawr, know, k... 0 (0.0,(3000,[0,116,437,675,1142,1216,1249,1465,...
4 [, ay grabe, grabe, , usernam ay, usernam, ay] 0 (0.0,(3000,[0,960,1216,1321,1709,1833,2954],[1...
5 [, aqui, calor aqui, sempr, tem, calor, aqui s... 0 (0.0,(3000,[0,269,276,889,1216,1635,1774,2460,...
6 [, usernam usernam, want, him, i, same, i want... 0 (0.0,(3000,[0,254,438,615,695,725,782,1125,121...
7 [, 1990 an, an waktu, masih, ngetwitnya, ngetw... 0 (0.0,(3000,[0,11,69,138,252,291,480,584,865,95...
8 [, usernam cant, cant , usernam, cant] 0 (0.0,(3000,[0,1216,1767,2514,2691],[1.0,1.0,1....
9 [] 0 (0.0,(3000,[0],[1.0]))
10 [, gigih cari, cari kat, googl, pink , kat, su... 0 (0.0,(3000,[0,23,96,376,587,849,1319,1350,1373...
11 [999 httpst, fifa, , usernam 999, httpstco4hpv... 0 (0.0,(3000,[0,317,452,1106,1216,1321,1375,1448...
12 [, realiz im, httpstcoevokfo3e1, oh, god, my g... 0 (0.0,(3000,[0,12,138,271,452,858,905,916,951,1...
13 [want to, , so i, adult thing, dai off, have, ... 0 (0.0,(3000,[0,75,133,229,438,542,738,916,967,1...
14 [usernam, , usernam , ] 0 (0.0,(3000,[0,317,1216,1321],[1.0,1.0,1.0,1.0]))
15 [down, , in, sad , slfl, slfl, yall dont, die... 0 (0.0,(3000,[0,359,509,652,785,851,906,1037,105...
16 [, slow, theyr, but, realli slow, but theyr, ... 0 (0.0,(3000,[0,113,487,712,755,1321,1513,1846,1...
17 [, sleep, i, sleep , i cant, cant sleep, cant] 0 (0.0,(3000,[0,588,901,1424,1683,2265,2514],[1....
18 [, ] 0 (0.0,(3000,[0,1321],[1.0,1.0]))
19 [lo lo, , siap mati, hati lo, lo, jual hati, m... 0 (0.0,(3000,[0,9,94,135,149,207,258,375,1047,12...
20 [, why, it so, so, it, so nice, out with, hang... 0 (0.0,(3000,[0,229,237,596,693,857,916,1165,119...
21 [, laban, vid, ng, laban ng, shakeys , mai vid... 0 (0.0,(3000,[0,14,289,552,593,669,957,969,1023,...
22 [giggl, , hurt, lol, hurt , dimpl start, giggl... 0 (0.0,(3000,[0,12,25,97,651,727,868,1007,1493,1...
23 [est, liquid, negro, necesita, uno negro, ya, ... 0 (0.0,(3000,[69,249,284,321,331,344,476,494,734...
24 [, ok la, cantik, la aku, usernam sebab, sebab... 0 (0.0,(3000,[0,140,396,403,430,549,702,781,1038...
25 [, coi5pr88hgvv, httpst, , httpstcoi5pr88hgvv] 0 (0.0,(3000,[0,1321,1558,2133,2306],[1.0,1.0,1....
26 [, lg nyambung, usernam twenti, usernam naik, ... 0 (0.0,(3000,[0,26,131,188,705,805,873,1216,1321...
27 [, httpst, emphasi on, open, on, emphasi, coz... 0 (0.0,(3000,[0,5,625,700,1092,1866,2289,2306,23...
28 [, in, si, httpst, we in, boat, boat si, the ... 0 (0.0,(3000,[0,24,203,254,357,394,501,914,1034,...
29 [, httpstcowgd2rzvtbj, httpst, cowgd2rzvtbj] 0 (0.0,(3000,[0,428,2299,2306],[1.0,1.0,1.0,1.0]))
... ... ... ...
4973 [corner , , good, a dutch, usernam we, it, a, ... 1 (1.0,(3000,[0,344,595,762,881,937,1165,1216,14...
4974 [newsjunkieswmo, , children, rowl sai, jk rowl... 1 (1.0,(3000,[0,77,99,221,307,862,886,991,1041,1...
4975 [close masih, , masih, sini, join, good, yadon... 1 (1.0,(3000,[0,291,536,579,695,762,777,836,1052...
4976 [, for, is, rest, no kathmandu, us, nepal, of ... 1 (1.0,(3000,[0,24,214,479,662,692,939,967,1015,...
4977 [colleenstaver wititud, , colleenstaver, coaip... 1 (1.0,(3000,[0,109,239,245,626,862,886,1514,158...
4978 [biglaa 3, please, , kita biglaa, 3 , soon, ht... 1 (1.0,(3000,[0,106,200,209,221,714,755,764,834,...
4979 [, for, hour displai, displai pictur, on a, my... 1 (1.0,(3000,[0,12,148,344,593,686,720,730,760,8...
4980 [, watch, otwolmanilainlov, rt, pushawardsjad... 1 (1.0,(3000,[0,905,925,1030,1126,1186,1248,1321...
4981 [, mac, ani nativ, on an, hard, submiss, be tr... 1 (1.0,(3000,[0,12,33,67,105,189,231,341,420,439...
4982 [httpstcowygktqb0ap, tara, , kayo magsit, cavi... 1 (1.0,(3000,[0,329,341,440,452,534,747,807,922,...
4983 [wow, , thank you, awesom, usernam wow, good,... 1 (1.0,(3000,[0,94,551,578,613,631,665,762,780,1...
4984 [for the, , for, follow usernam, insight, http... 1 (1.0,(3000,[0,44,125,145,180,344,542,613,731,8...
4985 [, support, children, palestinian, our un2opt,... 1 (1.0,(3000,[0,214,219,383,533,577,651,705,862,...
4986 [, he so, usernam he, lucki , xx, good, so, lu... 1 (1.0,(3000,[0,159,514,678,762,800,891,916,1216...
4987 [, usernam usernam, hello , job, youth, youth ... 1 (1.0,(3000,[0,190,496,829,872,945,1089,1175,12...
4988 [, so beauti, interview, tv interview, so, ja... 1 (1.0,(3000,[0,28,194,287,759,799,916,965,1207,...
4989 [, awkward, glitch, hilari, do you, good, do, ... 1 (1.0,(3000,[0,42,455,525,681,732,762,952,959,1...
4990 [, jourdanjai, usernam jourdanjai, usernam, jo... 1 (1.0,(3000,[0,285,894,920,1216],[1.0,1.0,1.0,1...
4991 [e, , e publish, will tweet, is, thanks, tweet... 1 (1.0,(3000,[0,116,150,289,388,528,532,782,826,...
4992 [, wala po, usernam wala, ata, at ica, po ata,... 1 (1.0,(3000,[0,156,319,341,736,957,1216,1237,14...
4993 [, kakak, birthdai today, , kakak birthdai, t... 1 (1.0,(3000,[0,102,950,1208,1321,2011,2711,2890...
4994 [please, , your, make up, mind please, up, ple... 1 (1.0,(3000,[0,213,374,581,613,684,906,1566,181...
4995 [, name, forgot it, thank you, your, it thank,... 1 (1.0,(3000,[0,10,176,200,285,306,452,581,596,6...
4996 [, next , what come, what, you know, httpstcow... 1 (1.0,(3000,[0,576,652,1167,1321,1749,1770,1818...
4997 [ where, , great time, had, usernam sound, a, ... 1 (1.0,(3000,[0,91,344,663,785,903,976,1090,1153...
4998 [, ar conceiv, conceiv, realli hot, children, ... 1 (1.0,(3000,[0,101,133,271,487,518,562,682,838,...
4999 [, usernam usernam, usernam me, it, me, me lik... 1 (1.0,(3000,[0,876,1165,1216,1406,1779,1829,203...
5000 [, for, in, follow us, pleas keep, touch, us u... 1 (1.0,(3000,[0,176,180,214,273,441,519,546,613,...
5001 [, read, great wednesday, insight, have, your,... 1 (1.0,(3000,[0,78,100,145,344,542,581,735,784,8...
5002 [, usernam parceli, brilliant mate, parceli us... 1 (1.0,(3000,[0,168,600,888,1216,1255,1554,2764,...

5003 rows × 3 columns

Create train/test split


In [78]:
# create boolean mask
msk = np.random.rand(len(df)) < 0.80
train = df[msk]
test = df[~msk]

Distribution of labels of training set


In [79]:
_ = sns.countplot(x="label", data=train)


Distribution of labels of test set


In [80]:
_ = sns.countplot(x="label", data=test)


Run PCA


In [81]:
from pyspark.mllib.feature import PCA

In [82]:
df


Out[82]:
tokens label lpoint
0 [, the case, be the, would be, feel, we had, w... 1 (1.0,(3000,[0,79,91,189,241,344,446,656,762,96...
1 [, satu, tapii, kan, cuma, jam sama, handshak,... 0 (0.0,(3000,[0,13,74,203,239,250,256,292,318,51...
2 [, want, tattoo, want tattoo, i, tattoo , i want] 0 (0.0,(3000,[0,186,438,1424,1903,2185,2622],[1....
3 [, maybe, yeah maybe, rawr yeah, rawr, know, k... 0 (0.0,(3000,[0,116,437,675,1142,1216,1249,1465,...
4 [, ay grabe, grabe, , usernam ay, usernam, ay] 0 (0.0,(3000,[0,960,1216,1321,1709,1833,2954],[1...
5 [, aqui, calor aqui, sempr, tem, calor, aqui s... 0 (0.0,(3000,[0,269,276,889,1216,1635,1774,2460,...
6 [, usernam usernam, want, him, i, same, i want... 0 (0.0,(3000,[0,254,438,615,695,725,782,1125,121...
7 [, 1990 an, an waktu, masih, ngetwitnya, ngetw... 0 (0.0,(3000,[0,11,69,138,252,291,480,584,865,95...
8 [, usernam cant, cant , usernam, cant] 0 (0.0,(3000,[0,1216,1767,2514,2691],[1.0,1.0,1....
9 [] 0 (0.0,(3000,[0],[1.0]))
10 [, gigih cari, cari kat, googl, pink , kat, su... 0 (0.0,(3000,[0,23,96,376,587,849,1319,1350,1373...
11 [999 httpst, fifa, , usernam 999, httpstco4hpv... 0 (0.0,(3000,[0,317,452,1106,1216,1321,1375,1448...
12 [, realiz im, httpstcoevokfo3e1, oh, god, my g... 0 (0.0,(3000,[0,12,138,271,452,858,905,916,951,1...
13 [want to, , so i, adult thing, dai off, have, ... 0 (0.0,(3000,[0,75,133,229,438,542,738,916,967,1...
14 [usernam, , usernam , ] 0 (0.0,(3000,[0,317,1216,1321],[1.0,1.0,1.0,1.0]))
15 [down, , in, sad , slfl, slfl, yall dont, die... 0 (0.0,(3000,[0,359,509,652,785,851,906,1037,105...
16 [, slow, theyr, but, realli slow, but theyr, ... 0 (0.0,(3000,[0,113,487,712,755,1321,1513,1846,1...
17 [, sleep, i, sleep , i cant, cant sleep, cant] 0 (0.0,(3000,[0,588,901,1424,1683,2265,2514],[1....
18 [, ] 0 (0.0,(3000,[0,1321],[1.0,1.0]))
19 [lo lo, , siap mati, hati lo, lo, jual hati, m... 0 (0.0,(3000,[0,9,94,135,149,207,258,375,1047,12...
20 [, why, it so, so, it, so nice, out with, hang... 0 (0.0,(3000,[0,229,237,596,693,857,916,1165,119...
21 [, laban, vid, ng, laban ng, shakeys , mai vid... 0 (0.0,(3000,[0,14,289,552,593,669,957,969,1023,...
22 [giggl, , hurt, lol, hurt , dimpl start, giggl... 0 (0.0,(3000,[0,12,25,97,651,727,868,1007,1493,1...
23 [est, liquid, negro, necesita, uno negro, ya, ... 0 (0.0,(3000,[69,249,284,321,331,344,476,494,734...
24 [, ok la, cantik, la aku, usernam sebab, sebab... 0 (0.0,(3000,[0,140,396,403,430,549,702,781,1038...
25 [, coi5pr88hgvv, httpst, , httpstcoi5pr88hgvv] 0 (0.0,(3000,[0,1321,1558,2133,2306],[1.0,1.0,1....
26 [, lg nyambung, usernam twenti, usernam naik, ... 0 (0.0,(3000,[0,26,131,188,705,805,873,1216,1321...
27 [, httpst, emphasi on, open, on, emphasi, coz... 0 (0.0,(3000,[0,5,625,700,1092,1866,2289,2306,23...
28 [, in, si, httpst, we in, boat, boat si, the ... 0 (0.0,(3000,[0,24,203,254,357,394,501,914,1034,...
29 [, httpstcowgd2rzvtbj, httpst, cowgd2rzvtbj] 0 (0.0,(3000,[0,428,2299,2306],[1.0,1.0,1.0,1.0]))
... ... ... ...
4973 [corner , , good, a dutch, usernam we, it, a, ... 1 (1.0,(3000,[0,344,595,762,881,937,1165,1216,14...
4974 [newsjunkieswmo, , children, rowl sai, jk rowl... 1 (1.0,(3000,[0,77,99,221,307,862,886,991,1041,1...
4975 [close masih, , masih, sini, join, good, yadon... 1 (1.0,(3000,[0,291,536,579,695,762,777,836,1052...
4976 [, for, is, rest, no kathmandu, us, nepal, of ... 1 (1.0,(3000,[0,24,214,479,662,692,939,967,1015,...
4977 [colleenstaver wititud, , colleenstaver, coaip... 1 (1.0,(3000,[0,109,239,245,626,862,886,1514,158...
4978 [biglaa 3, please, , kita biglaa, 3 , soon, ht... 1 (1.0,(3000,[0,106,200,209,221,714,755,764,834,...
4979 [, for, hour displai, displai pictur, on a, my... 1 (1.0,(3000,[0,12,148,344,593,686,720,730,760,8...
4980 [, watch, otwolmanilainlov, rt, pushawardsjad... 1 (1.0,(3000,[0,905,925,1030,1126,1186,1248,1321...
4981 [, mac, ani nativ, on an, hard, submiss, be tr... 1 (1.0,(3000,[0,12,33,67,105,189,231,341,420,439...
4982 [httpstcowygktqb0ap, tara, , kayo magsit, cavi... 1 (1.0,(3000,[0,329,341,440,452,534,747,807,922,...
4983 [wow, , thank you, awesom, usernam wow, good,... 1 (1.0,(3000,[0,94,551,578,613,631,665,762,780,1...
4984 [for the, , for, follow usernam, insight, http... 1 (1.0,(3000,[0,44,125,145,180,344,542,613,731,8...
4985 [, support, children, palestinian, our un2opt,... 1 (1.0,(3000,[0,214,219,383,533,577,651,705,862,...
4986 [, he so, usernam he, lucki , xx, good, so, lu... 1 (1.0,(3000,[0,159,514,678,762,800,891,916,1216...
4987 [, usernam usernam, hello , job, youth, youth ... 1 (1.0,(3000,[0,190,496,829,872,945,1089,1175,12...
4988 [, so beauti, interview, tv interview, so, ja... 1 (1.0,(3000,[0,28,194,287,759,799,916,965,1207,...
4989 [, awkward, glitch, hilari, do you, good, do, ... 1 (1.0,(3000,[0,42,455,525,681,732,762,952,959,1...
4990 [, jourdanjai, usernam jourdanjai, usernam, jo... 1 (1.0,(3000,[0,285,894,920,1216],[1.0,1.0,1.0,1...
4991 [e, , e publish, will tweet, is, thanks, tweet... 1 (1.0,(3000,[0,116,150,289,388,528,532,782,826,...
4992 [, wala po, usernam wala, ata, at ica, po ata,... 1 (1.0,(3000,[0,156,319,341,736,957,1216,1237,14...
4993 [, kakak, birthdai today, , kakak birthdai, t... 1 (1.0,(3000,[0,102,950,1208,1321,2011,2711,2890...
4994 [please, , your, make up, mind please, up, ple... 1 (1.0,(3000,[0,213,374,581,613,684,906,1566,181...
4995 [, name, forgot it, thank you, your, it thank,... 1 (1.0,(3000,[0,10,176,200,285,306,452,581,596,6...
4996 [, next , what come, what, you know, httpstcow... 1 (1.0,(3000,[0,576,652,1167,1321,1749,1770,1818...
4997 [ where, , great time, had, usernam sound, a, ... 1 (1.0,(3000,[0,91,344,663,785,903,976,1090,1153...
4998 [, ar conceiv, conceiv, realli hot, children, ... 1 (1.0,(3000,[0,101,133,271,487,518,562,682,838,...
4999 [, usernam usernam, usernam me, it, me, me lik... 1 (1.0,(3000,[0,876,1165,1216,1406,1779,1829,203...
5000 [, for, in, follow us, pleas keep, touch, us u... 1 (1.0,(3000,[0,176,180,214,273,441,519,546,613,...
5001 [, read, great wednesday, insight, have, your,... 1 (1.0,(3000,[0,78,100,145,344,542,581,735,784,8...
5002 [, usernam parceli, brilliant mate, parceli us... 1 (1.0,(3000,[0,168,600,888,1216,1255,1554,2764,...

5003 rows × 3 columns


In [83]:
#lpoints = df['lpoint']
#rdd = sc.parallelize(lpoints.map(lambda point: point.features).tolist())
#pca = PCA(3).fit(rdd)
#df['pca'] = df.apply(lambda row: pca.transform(row['lpoint'].features), axis=1)
#df['pca_0'] = df.apply(lambda row: row['pca'][0], axis=1)
#df['pca_1'] = df.apply(lambda row: row['pca'][1], axis=1)
#viz = df[['label', 'pca_0', 'pca_1']]

In [84]:
#_ = sns.pairplot(viz, vars=['pca_0', 'pca_1'], hue="label", size=6.0)

Train a Logistic Regression classifier


In [85]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.classification import LabeledPoint

Let's add a new column with LabeledPoints consisting of TF-IDF vectors.


In [86]:
train_rdd = sc.parallelize(train.lpoint)

Now train the logistic regression estimator.


In [87]:
lr = LogisticRegressionWithSGD.train(train_rdd, initialWeights=Vectors.zeros(coeff), iterations=200)

Test


In [88]:
test['pred'] = test.apply(lambda row: lr.predict(row['lpoint'].features), axis=1)


/Users/rene/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [89]:
test


Out[89]:
tokens label lpoint pred
8 [, usernam cant, cant , usernam, cant] 0 (0.0,(3000,[0,1216,1767,2514,2691],[1.0,1.0,1.... 0
10 [, gigih cari, cari kat, googl, pink , kat, su... 0 (0.0,(3000,[0,23,96,376,587,849,1319,1350,1373... 0
27 [, httpst, emphasi on, open, on, emphasi, coz... 0 (0.0,(3000,[0,5,625,700,1092,1866,2289,2306,23... 0
41 [want to, , for, art, cant stop, have heart, m... 0 (0.0,(3000,[0,12,25,56,92,191,290,356,438,542,... 0
44 [, feel, person i, feel comfort, talk to, coul... 0 (0.0,(3000,[0,214,307,317,389,432,558,585,608,... 0
46 [saya haru, , saya, lagi, bro, gimana lagi, la... 0 (0.0,(3000,[0,230,583,672,693,781,1050,1244,17... 0
47 [, usernam disgusting, sad , have, she must, j... 0 (0.0,(3000,[0,138,542,681,688,792,916,1216,126... 0
59 [incident, allah maaf, le, , ya, in, sad incid... 0 (0.0,(3000,[0,17,22,210,259,278,441,447,502,90... 0
62 [, dean, httpst, care, coxuqjh2lhvm, take, ca... 0 (0.0,(3000,[0,289,1103,1131,1196,1549,1572,170... 0
64 [, tengo que, me tengo, igual, ltima creo, use... 0 (0.0,(3000,[0,332,655,732,812,839,857,884,949,... 1
82 [, for, sorri, im sorri, sorri for, stand berg... 0 (0.0,(3000,[0,228,621,693,845,875,985,1015,117... 0
90 [, httpstcofa31skmwuf, httpst, , cofa31skmwuf] 0 (0.0,(3000,[0,5,1321,1826,2306],[1.0,1.0,1.0,1... 0
93 [, puta, po ako, bigyan, at, guys, papakatib... 0 (0.0,(3000,[0,64,172,283,329,341,372,434,442,5... 0
97 [, jealou, okai pahal, usernam prihatin, okai,... 0 (0.0,(3000,[0,636,859,868,1063,1161,1170,1216,... 1
99 [, fuck, todai, earli todai, todai , it, earli... 0 (0.0,(3000,[0,86,601,1165,1194,1443,1555,1584,... 0
106 [cokunguc6m2x, , httpstcokunguc6m2x, rin, htt... 0 (0.0,(3000,[0,3,422,1216,1356,1576,1839,1973,2... 0
115 [want to, , chang, want, chang , idk, usernam ... 0 (0.0,(3000,[0,438,1064,1186,1216,1259,1424,190... 0
117 [, nakabog , eh nakabog, nakabog, usernam naka... 0 (0.0,(3000,[0,1055,1198,1216,1590,2509,2539,25... 0
118 [usernam wong, , terbentuk, terbentuk dari, te... 0 (0.0,(3000,[0,445,610,900,931,1216,1485,1586,1... 1
120 [usernam, , usernam , ] 0 (0.0,(3000,[0,317,1216,1321],[1.0,1.0,1.0,1.0])) 1
123 [follow mebtw, niall, your, actual mean, your ... 0 (0.0,(3000,[16,44,175,180,312,404,405,581,785,... 0
126 [naman nyan, aldub16thweeksari httpst, colgtcs... 0 (0.0,(3000,[37,227,284,692,764,851,1017,1129,1... 0
138 [usernam, , usernam ] 0 (0.0,(3000,[0,317,1216],[1.0,1.0,1.0])) 0
141 [, ir , quiero ir, ir, quiero] 0 (0.0,(3000,[0,159,1163,1419,1448],[1.0,1.0,1.0... 0
145 [ur okai, , and feel, feel, soon, hope ur, bet... 0 (0.0,(3000,[0,160,176,200,215,221,317,565,609,... 0
155 [usernam wong, wong httpst, unikernels, have, ... 0 (0.0,(3000,[248,419,542,610,685,817,824,979,11... 1
163 [scare, , win, is, scare me, win a, uk, what, ... 0 (0.0,(3000,[0,56,90,200,222,344,364,474,596,67... 1
171 [, pinapakita, is, parang, ako kc, c meng, pin... 0 (0.0,(3000,[0,260,332,333,341,423,440,460,487,... 0
178 [masih, fav, httpstcou7g2gdbqo7, fav httpst, m... 0 (0.0,(3000,[291,1390,1420,1482,2621,2908],[1.0... 1
183 [, jakarta, kan, usernam azahra, azahra, azahr... 0 (0.0,(3000,[0,8,13,137,209,231,513,536,573,663... 0
... ... ... ... ...
4871 [, for, have, wednesdai , usernam thank, a, ,... 1 (1.0,(3000,[0,180,344,542,613,1090,1153,1166,1... 1
4883 [, usernam you, usernam usernam, you know, goo... 1 (1.0,(3000,[0,762,1165,1216,1406,1749,1882,196... 1
4884 [, for, oh, usernam usernam, that must, must, ... 1 (1.0,(3000,[0,180,242,283,344,418,613,858,979,... 1
4885 [, children, cofpwdvgpyoq, pointluck1, guides... 1 (1.0,(3000,[0,83,262,516,605,862,886,1006,1034... 0
4886 [, have, your, usernam hi, seen, and you, emai... 1 (1.0,(3000,[0,75,147,176,344,542,581,664,742,8... 1
4887 [n g, i n, c h, , s, s , n, h a, i s, a, a r, ... 1 (1.0,(3000,[0,121,266,344,482,552,553,859,1274... 0
4889 [, want, anyth, anyth thei, ship anyth, peopl ... 1 (1.0,(3000,[0,238,278,438,593,746,884,1019,113... 0
4894 [, good, usernam buda, trjan , buda p, buda, p... 1 (1.0,(3000,[0,98,121,488,523,616,762,1216,1858... 1
4897 [, a, i need, rt, i, , need, need a, usernam ... 1 (1.0,(3000,[0,80,129,344,443,1111,1216,1321,13... 0
4903 [, usernam handl, , handl usernam, usernam , ... 1 (1.0,(3000,[0,317,936,1216,1255,1321,1479],[1.... 1
4905 [, nettl sting, usernam usernam, xx, nettl, ,... 1 (1.0,(3000,[0,35,76,262,514,891,1216,1321,1655... 1
4907 [, usernam usernam, hello , job, youth, youth ... 1 (1.0,(3000,[0,190,496,829,872,945,1089,1175,12... 1
4912 [httpstcocs30mrjoik, for the, , for, follow us... 1 (1.0,(3000,[0,125,145,180,344,542,613,662,731,... 1
4913 [, is, cute, so, aww, that, usernam aww, cute ... 1 (1.0,(3000,[0,322,916,1162,1216,1239,1312,1316... 0
4914 [, to connect, usernam usernam, connect, conne... 1 (1.0,(3000,[0,125,613,1112,1134,1216,1656,1937... 1
4924 [, appal, comment on, as a, a, usernam ill, as... 1 (1.0,(3000,[0,12,72,176,198,218,253,327,344,45... 1
4929 [1to, , becom 1to, have, on a, will becom, goo... 1 (1.0,(3000,[0,3,61,249,273,344,528,542,619,762... 1
4932 [, ar conceiv, conceiv, realli hot, children, ... 1 (1.0,(3000,[0,101,133,271,442,487,518,562,565,... 1
4940 [, for, smile , for make, smile, usernam https... 1 (1.0,(3000,[0,61,180,613,906,1130,1216,1315,13... 1
4942 [, usernam crypto, good, at least, so, you at,... 1 (1.0,(3000,[0,597,638,652,692,916,917,957,1006... 1
4947 [, sapn, saro, usernam usernam, but, ne , ms ... 1 (1.0,(3000,[0,6,121,127,241,283,309,327,530,64... 1
4949 [, love it, it, , still love, love, still, us... 1 (1.0,(3000,[0,432,868,1165,1216,1321,1406,1668... 1
4957 [a job, for the, , for, great, see a, austin,... 1 (1.0,(3000,[0,3,31,106,293,344,381,524,528,613... 1
4962 [, men , good, just, men, i just, i, love, jus... 1 (1.0,(3000,[0,271,299,762,924,1424,1495,1668,2... 1
4965 [, for, insight, have, cov01enzzvd1, ne money,... 1 (1.0,(3000,[0,145,180,344,542,613,688,1021,109... 1
4975 [close masih, , masih, sini, join, good, yadon... 1 (1.0,(3000,[0,291,536,579,695,762,777,836,1052... 1
4977 [colleenstaver wititud, , colleenstaver, coaip... 1 (1.0,(3000,[0,109,239,245,626,862,886,1514,158... 1
4985 [, support, children, palestinian, our un2opt,... 1 (1.0,(3000,[0,214,219,383,533,577,651,705,862,... 1
4990 [, jourdanjai, usernam jourdanjai, usernam, jo... 1 (1.0,(3000,[0,285,894,920,1216],[1.0,1.0,1.0,1... 0
4998 [, ar conceiv, conceiv, realli hot, children, ... 1 (1.0,(3000,[0,101,133,271,487,518,562,682,838,... 1

989 rows × 4 columns

Metrics


In [90]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics

In [91]:
scoreAndLabels = test.apply(lambda row: (float(row['pred']), row['lpoint'].label), axis=1)
scoreAndLabels = sc.parallelize(scoreAndLabels)

In [92]:
binary_metrics = BinaryClassificationMetrics(scoreAndLabels)

In [93]:
binary_metrics.areaUnderPR


Out[93]:
0.7816544946099755

In [94]:
binary_metrics.areaUnderROC


Out[94]:
0.7012045925089404

In [95]:
mult_metrics = MulticlassMetrics(scoreAndLabels)

In [96]:
mult_metrics.precision()


Out[96]:
0.7007077856420627

In [97]:
mult_metrics.recall()


Out[97]:
0.7007077856420627

In [98]:
max(test.label.mean(), 1 - test.label.mean())


Out[98]:
0.5116279069767442

Cross validation


In [99]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF

lr = LogisticRegression()
tf = HashingTF(inputCol="tokens", outputCol="features")
pipeline = Pipeline(stages=[tf, lr])

In [100]:
pdf = ssc.createDataFrame(df)

In [101]:
dataset = sqlContext.createDataFrame(
[(point.features, point.label) for point in df['lpoint']],
["features", "label"])
ptrain = ssc.createDataFrame(train)
ptest = ssc.createDataFrame(test[['tokens','label','lpoint']])

In [102]:
model = pipeline.fit(ptrain)

In [103]:
prediction = model.transform(ptest)

In [104]:
result = prediction.select("tokens", "label", "prediction").toPandas()

In [106]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [1]:
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)

cvModel = cv.fit(dataset)
evaluator.evaluate(cvModel.transform(dataset))
type(cvModel)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-1310be5ef033> in <module>()
----> 1 grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
      2 evaluator = BinaryClassificationEvaluator()
      3 cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
      4 
      5 cvModel = cv.fit(dataset)

NameError: name 'ParamGridBuilder' is not defined

In [117]:
weights = cvModel.bestModel.weights

Use initial weights of best model


In [128]:
lr_new = LogisticRegressionWithSGD.train(train_rdd, initialWeights=weights, iterations=200)
test['pred_new'] = test.apply(lambda row: lr_new.predict(row['lpoint'].features), axis=1)


/Users/rene/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app

In [129]:
scoreAndLabels = test.apply(lambda row: (float(row['pred_new']), row['lpoint'].label), axis=1)
scoreAndLabels = sc.parallelize(scoreAndLabels)

In [130]:
binary_metrics = BinaryClassificationMetrics(scoreAndLabels)

In [131]:
binary_metrics.areaUnderROC


Out[131]:
0.8056182947487296

In [132]:
binary_metrics.areaUnderPR


Out[132]:
0.8613475209275279

In [133]:
mult_metrics = MulticlassMetrics(scoreAndLabels)

In [134]:
mult_metrics.precision()


Out[134]:
0.8048533872598584

In [ ]: