In [2]:
from pyspark.sql import SparkSession
from nltk.tokenize import TextTilingTokenizer
from sentimentAnalysis import dataProcessing as dp
In [3]:
# create spark session
spark = SparkSession.builder.getOrCreate()
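Reading with the s3a:// scheme below needs the hadoop-aws connector on the classpath plus AWS credentials. A minimal sketch, assuming the keys sit in the standard environment variables (run in a fresh session, since getOrCreate returns an already-running one unchanged); the spark.hadoop.* prefix forwards options to the Hadoop configuration:
In [ ]:
import os
# sketch: build the session with s3a credentials wired in up front
spark = (SparkSession.builder
         .config("spark.hadoop.fs.s3a.access.key", os.environ.get("AWS_ACCESS_KEY_ID", ""))
         .config("spark.hadoop.fs.s3a.secret.key", os.environ.get("AWS_SECRET_ACCESS_KEY", ""))
         .getOrCreate())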
In [19]:
# get dataframes
# specify s3 as source with s3a://
#df = spark.read.json("s3a://amazon-review-data/user_dedup.json.gz")
#df_meta = spark.read.json("s3a://amazon-review-data/metadata.json.gz")
# get a single-category shard (the 5-core Musical Instruments subset)
df_raw_data = spark.read.json("s3a://amazon-review-data/reviews_Musical_Instruments_5.json.gz")
# subset asin, reviewText
df_subset = df_raw_data.select("asin", "reviewText")
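A quick shape check on the shard before tokenizing; nothing here is from the original notebook, just a schema and row-count probe:
In [ ]:
# confirm the projection kept only asin and reviewText, and see how many reviews the shard holds
df_subset.printSchema()
df_subset.count()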
In [14]:
import nltk.data
# instantiate punkt object
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
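Loading tokenizers/punkt/english.pickle raises a LookupError on a machine that has never fetched the model; a guarded download, assuming the default nltk_data path:
In [ ]:
import nltk
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # fetch the pre-trained Punkt sentence tokenizer into the default nltk_data directory
    nltk.download('punkt')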
In [9]:
# get test row
test_row = df_subset.first()
test_row
Out[9]:
In [15]:
# test the punkt sentence tokenizer on the sample review
sent_detector.tokenize(test_row["reviewText"].strip())
Out[15]:
In [17]:
from pyspark.sql.functions import udf
In [20]:
# create udf; tokenize returns a list of sentences, so declare ArrayType(StringType())
# (the default StringType would stringify the list, and encoding to bytes breaks tokenize)
from pyspark.sql.types import ArrayType, StringType
punkt_udf = udf(lambda x: sent_detector.tokenize(x) if x else [], ArrayType(StringType()))
# apply udf, create new column
df_punkt = df_subset.withColumn("sentenceTokens", punkt_udf(df_subset["reviewText"]))
df_punkt.show(3)
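Closing over sent_detector ships the pickled model with every task. A sketch of the usual workaround, loading Punkt lazily once per executor process; the _get_detector helper and lazy_punkt_udf name are hypothetical, not from the original notebook:
In [ ]:
_detector = None
def _get_detector():
    # hypothetical helper: load the Punkt model the first time a worker needs it
    global _detector
    if _detector is None:
        import nltk.data
        _detector = nltk.data.load('tokenizers/punkt/english.pickle')
    return _detector

lazy_punkt_udf = udf(lambda x: _get_detector().tokenize(x) if x else [],
                     ArrayType(StringType()))
df_punkt_lazy = df_subset.withColumn("sentenceTokens", lazy_punkt_udf(df_subset["reviewText"]))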
In [ ]:
# write the tokenized reviews back to s3
#df_punkt.write.json("s3a://amazon-review-data/review-data")