Script Test - addTokensAndCats.py

Test notebook for the script that adds tokens and categories to the review data.


Setup


In [1]:
from pyspark.sql import SparkSession
from sentimentAnalysis import dataProcessing as dp

In [2]:
# Create (or reuse) the Spark session via the builder instead of the
# SparkSession(sc) constructor, so this cell does not depend on an `sc`
# global that only exists when the kernel pre-creates a SparkContext.
# getOrCreate() returns the already-active session/context when there is one.
spark = SparkSession.builder.getOrCreate()

In [4]:
# Read the test data back from S3 to confirm the dataframe was written.
test_data_path = "s3a://amazon-review-data/review-data.json/test_data.json"
df = spark.read.json(test_data_path)

In [5]:
# subset asin, reviewText -- the frame passed to dp.add_tokens in a later cell.
# Without this assignment `df_subset` is undefined on a fresh kernel.
df_subset = df.select("asin", "reviewText")

# preview the full dataframe that was read back from S3
df.show(3)


+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      asin|          categories|           cleanText|          raw_tokens|          reviewText|              tokens|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|000727405X|[WrappedArray(Boo...|It really does ha...|[it, really, does...|It really does ha...|[really, happen, ...|
|0007444117|[WrappedArray(Boo...|Great book  Veron...|[great, book, , v...|Great book. Veron...|[great, book, , v...|
|0060294671|[WrappedArray(Boo...|This book is for ...|[this, book, is, ...|This book is for ...|[book, girls, lov...|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows


In [4]:
# Add tokens: run the project's dp.add_tokens helper on the review subset and
# keep the result under a new name so the input frame is left untouched.
# NOTE(review): `df_subset` must already be defined by an earlier cell --
# confirm this holds under Restart & Run All.
# NOTE(review): presumably add_tokens expects a `reviewText` column; verify
# against sentimentAnalysis.dataProcessing.
df_tokens = dp.add_tokens(df_subset)

In [ ]:


In [5]:


In [ ]: