In [1]:
%matplotlib inline
from __future__ import print_function
import os
from pyspark import SQLContext
from pyspark.sql import Row
import pyspark.sql.functions as sql
#from pyspark.sql.functions import udf, length
import matplotlib.pyplot as plt
import numpy
import math
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import pyspark.ml.feature as feature


:0: FutureWarning: IPython widgets are experimental and may change in the future.

In [7]:
# `sc` is not defined anywhere in this notebook; presumably the PySpark
# kernel/shell injects the SparkContext under that name -- confirm before
# running on a plain Python kernel.
sqlContext = SQLContext(sc)

# Load the raw HackerNews search dump and let Spark infer the schema.
comments = sqlContext.read.format("json").load("../data/HackerNews/small.json")

# NOTE: this counts top-level JSON records, not individual comments; the
# comments themselves are nested inside each record's `hits` array.
total_comments = comments.count()
print(total_comments)


1

In [8]:
# Show the schema Spark inferred from the JSON file, so the nesting of the
# `hits` array is visible before it gets flattened further down.
comments.printSchema()


root
 |-- hits: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _highlightResult: struct (nullable = true)
 |    |    |    |-- author: struct (nullable = true)
 |    |    |    |    |-- matchLevel: string (nullable = true)
 |    |    |    |    |-- matchedWords: array (nullable = true)
 |    |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |    |-- value: string (nullable = true)
 |    |    |    |-- comment_text: struct (nullable = true)
 |    |    |    |    |-- matchLevel: string (nullable = true)
 |    |    |    |    |-- matchedWords: array (nullable = true)
 |    |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |    |-- value: string (nullable = true)
 |    |    |    |-- story_title: struct (nullable = true)
 |    |    |    |    |-- matchLevel: string (nullable = true)
 |    |    |    |    |-- matchedWords: array (nullable = true)
 |    |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |    |-- value: string (nullable = true)
 |    |    |    |-- story_url: struct (nullable = true)
 |    |    |    |    |-- matchLevel: string (nullable = true)
 |    |    |    |    |-- matchedWords: array (nullable = true)
 |    |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |    |-- value: string (nullable = true)
 |    |    |-- _tags: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- author: string (nullable = true)
 |    |    |-- comment_text: string (nullable = true)
 |    |    |-- created_at: string (nullable = true)
 |    |    |-- created_at_i: long (nullable = true)
 |    |    |-- num_comments: string (nullable = true)
 |    |    |-- objectID: string (nullable = true)
 |    |    |-- parent_id: long (nullable = true)
 |    |    |-- points: long (nullable = true)
 |    |    |-- story_id: long (nullable = true)
 |    |    |-- story_text: string (nullable = true)
 |    |    |-- story_title: string (nullable = true)
 |    |    |-- story_url: string (nullable = true)
 |    |    |-- title: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |-- nbHits: long (nullable = true)
 |-- nbPages: long (nullable = true)
 |-- page: long (nullable = true)


In [11]:
# Projecting the array column does not unnest it, so this still counts
# top-level records rather than individual hits.
hits_column = comments.select("hits")
print(hits_column.count())


1

In [33]:
# Unnest the `hits` array: one output row per array element. The generated
# column keeps Spark's default name `col`, which `mappings` relies on when it
# reads `r['col']`.
exploded_hit = sql.explode(comments.hits)
hits = comments.select(exploded_hit)
hits.printSchema()


root
 |-- col: struct (nullable = true)
 |    |-- _highlightResult: struct (nullable = true)
 |    |    |-- author: struct (nullable = true)
 |    |    |    |-- matchLevel: string (nullable = true)
 |    |    |    |-- matchedWords: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- value: string (nullable = true)
 |    |    |-- comment_text: struct (nullable = true)
 |    |    |    |-- matchLevel: string (nullable = true)
 |    |    |    |-- matchedWords: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- value: string (nullable = true)
 |    |    |-- story_title: struct (nullable = true)
 |    |    |    |-- matchLevel: string (nullable = true)
 |    |    |    |-- matchedWords: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- value: string (nullable = true)
 |    |    |-- story_url: struct (nullable = true)
 |    |    |    |-- matchLevel: string (nullable = true)
 |    |    |    |-- matchedWords: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- value: string (nullable = true)
 |    |-- _tags: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- author: string (nullable = true)
 |    |-- comment_text: string (nullable = true)
 |    |-- created_at: string (nullable = true)
 |    |-- created_at_i: long (nullable = true)
 |    |-- num_comments: string (nullable = true)
 |    |-- objectID: string (nullable = true)
 |    |-- parent_id: long (nullable = true)
 |    |-- points: long (nullable = true)
 |    |-- story_id: long (nullable = true)
 |    |-- story_text: string (nullable = true)
 |    |-- story_title: string (nullable = true)
 |    |-- story_url: string (nullable = true)
 |    |-- title: string (nullable = true)
 |    |-- url: string (nullable = true)


In [67]:
def _as_text(value):
    """Return ``value`` as text, mapping ``None`` to an empty string.

    Plain ``str(value)`` is avoided for two reasons: ``str(None)`` yields the
    literal string ``"None"``, which would be written into the output data,
    and on Python 2 ``str()`` raises ``UnicodeEncodeError`` for non-ASCII
    unicode values.
    """
    text_type = type(u"")  # `unicode` on Python 2, `str` on Python 3
    if value is None:
        return text_type()
    if isinstance(value, text_type):
        # Already text: hand it back untouched so no re-encoding happens.
        return value
    return text_type(value)


def mappings(r):
    """Flatten one exploded hit into a top-level ``Row``.

    Args:
        r: a row whose ``'col'`` field holds the hit struct produced by
            exploding the ``hits`` array.

    Returns:
        A ``Row`` carrying the hit's fields at the top level; the nested
        ``_highlightResult`` struct is not copied. ``num_comments``,
        ``story_text``, ``story_url``, ``title`` and ``url`` are always
        returned as text, with missing/null values represented as ``""``.
    """
    hit = r['col']
    return Row(
        _tags=hit['_tags'],
        author=hit['author'],
        comment_text=hit['comment_text'],
        created_at=hit['created_at'],
        created_at_i=hit['created_at_i'],
        num_comments=_as_text(hit['num_comments']),
        objectID=hit['objectID'],
        parent_id=hit['parent_id'],
        points=hit['points'],
        story_id=hit['story_id'],
        # Looked up via asDict() so a hit without the field at all is
        # tolerated, not only a hit where the field is null.
        story_text=_as_text(hit.asDict().get('story_text')),
        story_title=hit['story_title'],
        story_url=_as_text(hit['story_url']),
        title=_as_text(hit['title']),
        url=_as_text(hit['url'])
    )

# Flatten every exploded hit into a top-level Row and rebuild a DataFrame
# from the result. The map goes through `.rdd` explicitly: `DataFrame.map`
# only exists in Spark 1.x PySpark, while `.rdd.map` works on 1.x and later.
hits2 = hits.rdd.map(mappings).toDF()

# Sanity-check the flattened frame: number of hits and the resulting schema.
print(hits2.count())
hits2.printSchema()


66
root
 |-- _tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- author: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- created_at_i: long (nullable = true)
 |-- num_comments: string (nullable = true)
 |-- objectID: string (nullable = true)
 |-- parent_id: long (nullable = true)
 |-- points: long (nullable = true)
 |-- story_id: long (nullable = true)
 |-- story_text: string (nullable = true)
 |-- story_title: string (nullable = true)
 |-- story_url: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)


In [70]:
# Persist the flattened hits as JSON. The default save mode raises an error
# when the target path already exists, which makes this cell fail on every
# re-run of the notebook; overwrite instead, since the output is derived
# entirely from the input file loaded earlier in the notebook.
hits2.write.save(
    "../data/HackerNews/from_spark_small.json",
    format="json",
    mode="overwrite",
)

In [ ]: