In [65]:
sc


Out[65]:
<pyspark.context.SparkContext at 0x10c7d10b8>

In [66]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [67]:
df = sqlContext.read.parquet("tweets.consolidated.parquet")

In [68]:
df.show()


+----------+------------------+--------------------+--------------------+--------+
|      user|                id|                text|            location|hasMedia|
+----------+------------------+--------------------+--------------------+--------+
| 429803867|668129332066459648|e0b40f2381c430f6d...|[27.166142,73.852...|   false|
|2575662781|668129436932415488|:) https://t.co/r...|[19.5371016,-96.9...|    true|
|2558754024|668128681945092096|برد 😊 (@ miral -...|[29.10425394,48.1...|   false|
| 175196235|668128627406610432|christmas market:...|[43.6506691,-79.3...|   false|
| 737480838|668128627394019328|يا عزيزي يالمدريد...|[26.21390031,50.4...|   false|
|  22921151|668129030068166657|#noelgeek #ghostb...|[45.50757496,-73....|   false|
|  93448793|668129332041265152|Soooooo these #ne...|[38.72750195,-90....|   false|
| 959736212|668128937801682945|Green Turtle in W...|[39.5640488,-76.9...|   false|
|  59972446|668129025890455552|#Retail #Job in #...|[41.4517093,-82.0...|   false|
|3234610719|668129269160222720|#StaracArabia
الن...|[30.0960606,31.33...|   false|
|2329037172|668128677763379201|Açlık oyunları al...|[38.33868221,27.1...|   false|
|  86583009|668128749032902656|#beaurivagegolf #...|[34.11432705,-77....|   false|
| 569410380|668129231386333185|@bm0406 @ionacrv ...|[19.09500403,72.8...|   false|
|2781520319|668129369819357184|#Bilinmezlik @ İz...|[38.29360749,27.1...|   false|
| 110213197|668128673560592384|Razón tenía aquel...|[9.91077394,-84.0...|   false|
|1179981192|668129055238062081|349.336 personas ...|   [40.4203,-3.7058]|   false|
| 156122032|668128820365512704|🎉🎉🎉 @ Quilmes,...| [-34.7203,-58.2694]|   false|
|  27737029|668129436898885633|Risottinho de moq...|[-22.97027778,-43...|   false|
| 245111438|668128317044826113|fish bowl fridays...|[42.09727335,-75....|   false|
| 543604821|668128518354698242|Viendo el partido...|[19.39476248,-99....|   false|
+----------+------------------+--------------------+--------------------+--------+
only showing top 20 rows


In [69]:
df.printSchema()


root
 |-- user: long (nullable = true)
 |-- id: long (nullable = true)
 |-- text: string (nullable = true)
 |-- location: struct (nullable = true)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)
 |-- hasMedia: boolean (nullable = true)


In [70]:
textDf = df.select("text")
textDf.show()
textDf.count()


+--------------------+
|                text|
+--------------------+
|e0b40f2381c430f6d...|
|:) https://t.co/r...|
|برد 😊 (@ miral -...|
|christmas market:...|
|يا عزيزي يالمدريد...|
|#noelgeek #ghostb...|
|Soooooo these #ne...|
|Green Turtle in W...|
|#Retail #Job in #...|
|#StaracArabia
الن...|
|Açlık oyunları al...|
|#beaurivagegolf #...|
|@bm0406 @ionacrv ...|
|#Bilinmezlik @ İz...|
|Razón tenía aquel...|
|349.336 personas ...|
|🎉🎉🎉 @ Quilmes,...|
|Risottinho de moq...|
|fish bowl fridays...|
|Viendo el partido...|
+--------------------+
only showing top 20 rows

Out[70]:
2085

In [71]:
df.describe().show()


+-------+--------------------+--------------------+
|summary|                user|                  id|
+-------+--------------------+--------------------+
|  count|                2085|                2085|
|   mean| 8.986858830489209E8|6.679669002385806...|
| stddev|2.3303647371227983E8|                 NaN|
|    min|             1629241|  667127803561844741|
|    max|          4322320759|  668569637496340480|
+-------+--------------------+--------------------+


In [72]:
df.groupBy("hasMedia").count().show()


+--------+-----+
|hasMedia|count|
+--------+-----+
|    true|  118|
|   false| 1967|
+--------+-----+


In [73]:
words = textDf.flatMap(lambda row: row.text.split(" "))

In [74]:
from pyspark.sql import Row
words_df = words.map(lambda w: Row(word=w, cnt=1)).toDF()

In [75]:
word_counts = words_df.groupBy("word").sum()

In [76]:
word_counts.show()


+--------------------+--------+
|                word|sum(cnt)|
+--------------------+--------+
|          #livemusic|       1|
|                  MS|       2|
|               check|       7|
|           PORTLAND,|       1|
|          Beylikdüzü|       2|
|                Club|       7|
|              🍸🍹🍷|       1|
|              casar,|       1|
|           #ig_cuneo|       1|
|                 7°C|       1|
|                foot|       1|
|        Platforms...|       1|
|                Alto|       1|
|               Sweet|       2|
|              Porto,|       1|
|                  мы|       2|
|https://t.co/a8Da...|       1|
|          disfrutar…|       1|
|            Uvarovo,|       1|
|https://t.co/UfuV...|       1|
+--------------------+--------+
only showing top 20 rows


In [77]:
words_df.registerTempTable("words")

In [78]:
sqlContext.sql("SELECT * FROM words").show()


+---+--------------------+
|cnt|                word|
+---+--------------------+
|  1|e0b40f2381c430f6d...|
|  1|                  :)|
|  1|https://t.co/rIRY...|
|  1|                 برد|
|  1|                  😊|
|  1|                  (@|
|  1|               miral|
|  1|                   -|
|  1|               ميرال|
|  1|                  in|
|  1|             Kuwait)|
|  1|https://t.co/YfNO...|
|  1|           christmas|
|  1|             market:|
|  1|                that|
|  1|                time|
|  1|                  of|
|  1|                 the|
|  1|                year|
|  1|               again|
+---+--------------------+
only showing top 20 rows


In [79]:
sqlContext.sql("""
SELECT * FROM words
""").show()


+---+--------------------+
|cnt|                word|
+---+--------------------+
|  1|e0b40f2381c430f6d...|
|  1|                  :)|
|  1|https://t.co/rIRY...|
|  1|                 برد|
|  1|                  😊|
|  1|                  (@|
|  1|               miral|
|  1|                   -|
|  1|               ميرال|
|  1|                  in|
|  1|             Kuwait)|
|  1|https://t.co/YfNO...|
|  1|           christmas|
|  1|             market:|
|  1|                that|
|  1|                time|
|  1|                  of|
|  1|                 the|
|  1|                year|
|  1|               again|
+---+--------------------+
only showing top 20 rows


In [80]:
sqlContext.sql("""
SELECT word,sum(cnt)
FROM words
GROUP BY word
""").show()


+--------------------+---+
|                word|_c1|
+--------------------+---+
|          #livemusic|  1|
|                  MS|  2|
|               check|  7|
|           PORTLAND,|  1|
|          Beylikdüzü|  2|
|                Club|  7|
|              🍸🍹🍷|  1|
|              casar,|  1|
|           #ig_cuneo|  1|
|                 7°C|  1|
|                foot|  1|
|        Platforms...|  1|
|                Alto|  1|
|               Sweet|  2|
|              Porto,|  1|
|                  мы|  2|
|https://t.co/a8Da...|  1|
|          disfrutar…|  1|
|            Uvarovo,|  1|
|https://t.co/UfuV...|  1|
+--------------------+---+
only showing top 20 rows


In [81]:
sqlContext.sql("""
SELECT word,sum(cnt) as c
FROM words
GROUP BY word
ORDER BY c DESC
""").show()


+----------+---+
|      word|  c|
+----------+---+
|        at|514|
|        in|499|
|         @|401|
|       I'm|344|
|         -|232|
|         —|184|
|         a|180|
|          |153|
|        (@|143|
|       the|124|
|       and|118|
|       for|117|
|   #Hiring|112|
|        de|108|
|        to|106|
|        w/| 97|
|       (at| 79|
|       you| 74|
|#CareerArc| 72|
|     #Jobs| 72|
+----------+---+
only showing top 20 rows


In [85]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import Row

In [86]:
def boolToInt(val):
    if val:
        return 1.0
    else:
        return 0.0

ml_df = sqlContext.createDataFrame(df.map(lambda r : Row(id=r.id, text=r.text, label=boolToInt(r.hasMedia))).collect())

In [87]:
ml_df.show()


+------------------+-----+--------------------+
|                id|label|                text|
+------------------+-----+--------------------+
|668129332066459648|  0.0|e0b40f2381c430f6d...|
|668129436932415488|  1.0|:) https://t.co/r...|
|668128681945092096|  0.0|برد 😊 (@ miral -...|
|668128627406610432|  0.0|christmas market:...|
|668128627394019328|  0.0|يا عزيزي يالمدريد...|
|668129030068166657|  0.0|#noelgeek #ghostb...|
|668129332041265152|  0.0|Soooooo these #ne...|
|668128937801682945|  0.0|Green Turtle in W...|
|668129025890455552|  0.0|#Retail #Job in #...|
|668129269160222720|  0.0|#StaracArabia
الن...|
|668128677763379201|  0.0|Açlık oyunları al...|
|668128749032902656|  0.0|#beaurivagegolf #...|
|668129231386333185|  0.0|@bm0406 @ionacrv ...|
|668129369819357184|  0.0|#Bilinmezlik @ İz...|
|668128673560592384|  0.0|Razón tenía aquel...|
|668129055238062081|  0.0|349.336 personas ...|
|668128820365512704|  0.0|🎉🎉🎉 @ Quilmes,...|
|668129436898885633|  0.0|Risottinho de moq...|
|668128317044826113|  0.0|fish bowl fridays...|
|668128518354698242|  0.0|Viendo el partido...|
+------------------+-----+--------------------+
only showing top 20 rows


In [88]:
ml_df.printSchema()


root
 |-- id: long (nullable = true)
 |-- label: double (nullable = true)
 |-- text: string (nullable = true)


In [89]:
training, test = ml_df.randomSplit((0.9, 0.1), seed = 1)

In [90]:
training.show()


+------------------+-----+--------------------+
|                id|label|                text|
+------------------+-----+--------------------+
|668129332066459648|  0.0|e0b40f2381c430f6d...|
|668128681945092096|  0.0|برد 😊 (@ miral -...|
|668128627406610432|  0.0|christmas market:...|
|668129030068166657|  0.0|#noelgeek #ghostb...|
|668128937801682945|  0.0|Green Turtle in W...|
|668129025890455552|  0.0|#Retail #Job in #...|
|668129269160222720|  0.0|#StaracArabia
الن...|
|668128749032902656|  0.0|#beaurivagegolf #...|
|668129231386333185|  0.0|@bm0406 @ionacrv ...|
|668129369819357184|  0.0|#Bilinmezlik @ İz...|
|668128673560592384|  0.0|Razón tenía aquel...|
|668129055238062081|  0.0|349.336 personas ...|
|668128820365512704|  0.0|🎉🎉🎉 @ Quilmes,...|
|668129436898885633|  0.0|Risottinho de moq...|
|668128317044826113|  0.0|fish bowl fridays...|
|668128518354698242|  0.0|Viendo el partido...|
|667706185517240320|  0.0|Catch The Sooo Se...|
|667705011103621120|  0.0|We're #hiring! Cl...|
|667705933833834496|  0.0|Old friends thru ...|
|667705933825515520|  0.0|I'm at Aydın Fen ...|
+------------------+-----+--------------------+
only showing top 20 rows


In [91]:
test.show()


+------------------+-----+--------------------+
|                id|label|                text|
+------------------+-----+--------------------+
|668129436932415488|  1.0|:) https://t.co/r...|
|668128627394019328|  0.0|يا عزيزي يالمدريد...|
|668129332041265152|  0.0|Soooooo these #ne...|
|668128677763379201|  0.0|Açlık oyunları al...|
|667705069832306688|  0.0|@PenyukaAnisa wau...|
|668130883946188800|  1.0|See a virtual tou...|
|668131290781081600|  1.0|HAPPY BIRTHDAY BB...|
|668131672475492352|  0.0|I'm at CineRitz f...|
|668131018172317697|  0.0|TRAFFIC STOP at S...|
|668131919943458817|  0.0|SLS AMG///  😍 @ ...|
|668126962271911936|  0.0|Seng ulang tahunn...|
|668127545284497408|  0.0|@lndsm101 kkkkk n...|
|668380373722660864|  0.0|Mari makan 😋 (at...|
|668380608616382464|  0.0|I'm at happy trai...|
|668380835104489472|  0.0|Hehe..thx for tod...|
|668130045106475010|  0.0|Ver la @premierle...|
|668130430961479680|  0.0|• TIME ♡ FLIES •
...|
|668052060399603713|  0.0|Хорошие зонтики, ...|
|668052203018522625|  0.0|See ya omaha. #je...|
|668052882478927872|  0.0|Nyam nyam (at @yu...|
+------------------+-----+--------------------+
only showing top 20 rows


In [92]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.01)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

In [93]:
model = pipeline.fit(training)

In [94]:
prediction = model.transform(test)
selected = prediction.select("id", "text", "label", "prediction")

In [95]:
selected.show()


+------------------+--------------------+-----+----------+
|                id|                text|label|prediction|
+------------------+--------------------+-----+----------+
|668129436932415488|:) https://t.co/r...|  1.0|       0.0|
|668128627394019328|يا عزيزي يالمدريد...|  0.0|       0.0|
|668129332041265152|Soooooo these #ne...|  0.0|       0.0|
|668128677763379201|Açlık oyunları al...|  0.0|       0.0|
|667705069832306688|@PenyukaAnisa wau...|  0.0|       0.0|
|668130883946188800|See a virtual tou...|  1.0|       0.0|
|668131290781081600|HAPPY BIRTHDAY BB...|  1.0|       0.0|
|668131672475492352|I'm at CineRitz f...|  0.0|       0.0|
|668131018172317697|TRAFFIC STOP at S...|  0.0|       0.0|
|668131919943458817|SLS AMG///  😍 @ ...|  0.0|       0.0|
|668126962271911936|Seng ulang tahunn...|  0.0|       0.0|
|668127545284497408|@lndsm101 kkkkk n...|  0.0|       0.0|
|668380373722660864|Mari makan 😋 (at...|  0.0|       0.0|
|668380608616382464|I'm at happy trai...|  0.0|       0.0|
|668380835104489472|Hehe..thx for tod...|  0.0|       0.0|
|668130045106475010|Ver la @premierle...|  0.0|       0.0|
|668130430961479680|• TIME ♡ FLIES •
...|  0.0|       0.0|
|668052060399603713|Хорошие зонтики, ...|  0.0|       0.0|
|668052203018522625|See ya omaha. #je...|  0.0|       0.0|
|668052882478927872|Nyam nyam (at @yu...|  0.0|       0.0|
+------------------+--------------------+-----+----------+
only showing top 20 rows


In [ ]: