In [1]:
sc


Out[1]:
<pyspark.context.SparkContext at 0x10648f0b8>

In [2]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [3]:
df = sqlContext.read.parquet("tweets.consolidated.parquet")

In [4]:
df.show()


+----------+------------------+--------------------+--------------------+--------+
|      user|                id|                text|            location|hasMedia|
+----------+------------------+--------------------+--------------------+--------+
| 429803867|668129332066459648|e0b40f2381c430f6d...|[27.166142,73.852...|   false|
|2575662781|668129436932415488|:) https://t.co/r...|[19.5371016,-96.9...|    true|
|2558754024|668128681945092096|برد 😊 (@ miral -...|[29.10425394,48.1...|   false|
| 175196235|668128627406610432|christmas market:...|[43.6506691,-79.3...|   false|
| 737480838|668128627394019328|يا عزيزي يالمدريد...|[26.21390031,50.4...|   false|
|  22921151|668129030068166657|#noelgeek #ghostb...|[45.50757496,-73....|   false|
|  93448793|668129332041265152|Soooooo these #ne...|[38.72750195,-90....|   false|
| 959736212|668128937801682945|Green Turtle in W...|[39.5640488,-76.9...|   false|
|  59972446|668129025890455552|#Retail #Job in #...|[41.4517093,-82.0...|   false|
|3234610719|668129269160222720|#StaracArabia
الن...|[30.0960606,31.33...|   false|
|2329037172|668128677763379201|Açlık oyunları al...|[38.33868221,27.1...|   false|
|  86583009|668128749032902656|#beaurivagegolf #...|[34.11432705,-77....|   false|
| 569410380|668129231386333185|@bm0406 @ionacrv ...|[19.09500403,72.8...|   false|
|2781520319|668129369819357184|#Bilinmezlik @ İz...|[38.29360749,27.1...|   false|
| 110213197|668128673560592384|Razón tenía aquel...|[9.91077394,-84.0...|   false|
|1179981192|668129055238062081|349.336 personas ...|   [40.4203,-3.7058]|   false|
| 156122032|668128820365512704|🎉🎉🎉 @ Quilmes,...| [-34.7203,-58.2694]|   false|
|  27737029|668129436898885633|Risottinho de moq...|[-22.97027778,-43...|   false|
| 245111438|668128317044826113|fish bowl fridays...|[42.09727335,-75....|   false|
| 543604821|668128518354698242|Viendo el partido...|[19.39476248,-99....|   false|
+----------+------------------+--------------------+--------------------+--------+
only showing top 20 rows


In [5]:
df.printSchema()


root
 |-- user: long (nullable = true)
 |-- id: long (nullable = true)
 |-- text: string (nullable = true)
 |-- location: struct (nullable = true)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)
 |-- hasMedia: boolean (nullable = true)


In [6]:
textDf = df.select("text")
textDf.show()
textDf.count()


+--------------------+
|                text|
+--------------------+
|e0b40f2381c430f6d...|
|:) https://t.co/r...|
|برد 😊 (@ miral -...|
|christmas market:...|
|يا عزيزي يالمدريد...|
|#noelgeek #ghostb...|
|Soooooo these #ne...|
|Green Turtle in W...|
|#Retail #Job in #...|
|#StaracArabia
الن...|
|Açlık oyunları al...|
|#beaurivagegolf #...|
|@bm0406 @ionacrv ...|
|#Bilinmezlik @ İz...|
|Razón tenía aquel...|
|349.336 personas ...|
|🎉🎉🎉 @ Quilmes,...|
|Risottinho de moq...|
|fish bowl fridays...|
|Viendo el partido...|
+--------------------+
only showing top 20 rows

Out[6]:
2085

In [7]:
df.describe().show()


+-------+--------------------+--------------------+
|summary|                user|                  id|
+-------+--------------------+--------------------+
|  count|                2085|                2085|
|   mean| 8.986858830489209E8|6.679669002385806...|
| stddev|2.3303647371227983E8|                 NaN|
|    min|             1629241|  667127803561844741|
|    max|          4322320759|  668569637496340480|
+-------+--------------------+--------------------+


In [8]:
df.groupBy("hasMedia").count().show()


+--------+-----+
|hasMedia|count|
+--------+-----+
|    true|  118|
|   false| 1967|
+--------+-----+


In [9]:
words = textDf.flatMap(lambda row: row.text.split(" "))

In [10]:
from pyspark.sql import Row
words_df = words.map(lambda w: Row(word=w, cnt=1)).toDF()

In [11]:
word_counts = words_df.groupBy("word").sum()

In [12]:
word_counts.show()


+--------------------+--------+
|                word|sum(cnt)|
+--------------------+--------+
|          #livemusic|       1|
|                  MS|       2|
|               check|       7|
|           PORTLAND,|       1|
|          Beylikdüzü|       2|
|                Club|       7|
|              🍸🍹🍷|       1|
|              casar,|       1|
|           #ig_cuneo|       1|
|                 7°C|       1|
|                foot|       1|
|        Platforms...|       1|
|                Alto|       1|
|               Sweet|       2|
|              Porto,|       1|
|                  мы|       2|
|https://t.co/a8Da...|       1|
|          disfrutar…|       1|
|            Uvarovo,|       1|
|https://t.co/UfuV...|       1|
+--------------------+--------+
only showing top 20 rows


In [13]:
words_df.registerTempTable("words")

In [14]:
sqlContext.sql("SELECT * FROM words").show()


+---+--------------------+
|cnt|                word|
+---+--------------------+
|  1|e0b40f2381c430f6d...|
|  1|                  :)|
|  1|https://t.co/rIRY...|
|  1|                 برد|
|  1|                  😊|
|  1|                  (@|
|  1|               miral|
|  1|                   -|
|  1|               ميرال|
|  1|                  in|
|  1|             Kuwait)|
|  1|https://t.co/YfNO...|
|  1|           christmas|
|  1|             market:|
|  1|                that|
|  1|                time|
|  1|                  of|
|  1|                 the|
|  1|                year|
|  1|               again|
+---+--------------------+
only showing top 20 rows


In [15]:
sqlContext.sql("""
SELECT * FROM words
""").show()


+---+--------------------+
|cnt|                word|
+---+--------------------+
|  1|e0b40f2381c430f6d...|
|  1|                  :)|
|  1|https://t.co/rIRY...|
|  1|                 برد|
|  1|                  😊|
|  1|                  (@|
|  1|               miral|
|  1|                   -|
|  1|               ميرال|
|  1|                  in|
|  1|             Kuwait)|
|  1|https://t.co/YfNO...|
|  1|           christmas|
|  1|             market:|
|  1|                that|
|  1|                time|
|  1|                  of|
|  1|                 the|
|  1|                year|
|  1|               again|
+---+--------------------+
only showing top 20 rows


In [16]:
sqlContext.sql("""
SELECT word,sum(cnt)
FROM words
GROUP BY word
""").show()


+--------------------+---+
|                word|_c1|
+--------------------+---+
|          #livemusic|  1|
|                  MS|  2|
|               check|  7|
|           PORTLAND,|  1|
|          Beylikdüzü|  2|
|                Club|  7|
|              🍸🍹🍷|  1|
|              casar,|  1|
|           #ig_cuneo|  1|
|                 7°C|  1|
|                foot|  1|
|        Platforms...|  1|
|                Alto|  1|
|               Sweet|  2|
|              Porto,|  1|
|                  мы|  2|
|https://t.co/a8Da...|  1|
|          disfrutar…|  1|
|            Uvarovo,|  1|
|https://t.co/UfuV...|  1|
+--------------------+---+
only showing top 20 rows


In [17]:
sqlContext.sql("""
SELECT word,sum(cnt) as c
FROM words
GROUP BY word
ORDER BY c DESC
""").show()


+----------+---+
|      word|  c|
+----------+---+
|        at|514|
|        in|499|
|         @|401|
|       I'm|344|
|         -|232|
|         —|184|
|         a|180|
|          |153|
|        (@|143|
|       the|124|
|       and|118|
|       for|117|
|   #Hiring|112|
|        de|108|
|        to|106|
|        w/| 97|
|       (at| 79|
|       you| 74|
|     #Jobs| 72|
|#CareerArc| 72|
+----------+---+
only showing top 20 rows