Script Development - addTokensAndCats.py

Development notebook for the script that adds tokens and categories to the review data.


Setup


In [1]:
from pyspark.sql import SparkSession
from sentimentAnalysis import dataProcessing as dp

In [2]:
# create (or reuse) the spark session
# The builder does not rely on a pre-defined `sc` variable being injected by the
# kernel; when a SparkContext is already running, getOrCreate() builds on it
# instead of starting a second one.
spark = SparkSession.builder.getOrCreate()

In [2]:
# get dataframes
# The s3a:// scheme makes Spark read the files straight from S3.
# Development runs against the small Musical Instruments review file; the full
# review dump is at s3a://amazon-review-data/user_dedup.json.gz.
df = spark.read.json("s3a://amazon-review-data/reviews_Musical_Instruments_5.json.gz")
# Metadata dataframe; dp.add_categories takes it as its second argument, so it
# has to be loaded here for that cell to run on a fresh kernel.
df_meta = spark.read.json("s3a://amazon-review-data/metadata.json.gz")

In [3]:
# keep just the asin and reviewText columns for the token step
df_subset = df.select(["asin", "reviewText"])

In [4]:
# add tokens
# dp.add_tokens comes from the project-local sentimentAnalysis.dataProcessing
# module. NOTE(review): the table printed later in this notebook shows
# cleanText, raw_tokens and tokens columns next to asin/reviewText, which
# presumably is this call's result -- confirm against add_tokens itself.
df_tokens = dp.add_tokens(df_subset)

In [6]:
# add categories
# NOTE(review): df_meta must exist before this cell runs; make sure the
# metadata dataframe has been loaded into df_meta earlier in the notebook,
# otherwise this line raises NameError.
df_cats = dp.add_categories(df_tokens, df_meta)


+----------+--------------------+--------------------+--------------------+--------------------+
|      asin|          reviewText|           cleanText|          raw_tokens|              tokens|
+----------+--------------------+--------------------+--------------------+--------------------+
|1384719342|Not much to write...|Not much to write...|[not, much, to, w...|[much, write, , e...|
+----------+--------------------+--------------------+--------------------+--------------------+
only showing top 1 row


In [5]:
# Persist the tokenized reviews to S3 as JSON.
# NOTE(review): this saves df_tokens, not df_cats, so the result of
# dp.add_categories is not part of what gets written -- confirm that is intended.
# NOTE(review): no save mode is set; Spark's default is expected to raise if the
# target path already exists, which would break a re-run -- verify.
df_tokens.write.json("s3a://amazon-review-data/review-data.json")

In [ ]: