Script Development - addTokensAndCats.py

Development notebook for the script that adds tokens and categories to the review data.


Setup


In [2]:
from pyspark.sql import SparkSession
from nltk.tokenize import TextTilingTokenizer
from sentimentAnalysis import dataProcessing as dp

In [3]:
# create spark session
# Build the session through the builder rather than from a bare `sc`, so the
# cell also runs on a fresh kernel where no SparkContext was pre-defined.
# getOrCreate() reuses an already-running session/context when there is one.
spark = SparkSession.builder.getOrCreate()

In [19]:
# Load the review data into Spark data frames.
# The s3a:// scheme is what selects S3 as the source.
# Alternative inputs, currently disabled:
#df = spark.read.json("s3a://amazon-review-data/user_dedup.json.gz")
#df_meta = spark.read.json("s3a://amazon-review-data/metadata.json.gz")

# read a single shard
df_raw_data = spark.read.format("json").load(
    "s3a://amazon-review-data/reviews_Musical_Instruments_5.json.gz"
)

# keep only the asin and reviewText columns
df_subset = df_raw_data.select(["asin", "reviewText"])


Development

Sentence Tokens

row


In [14]:
import nltk.data

# Load the English Punkt tokenizer from NLTK's data resources;
# the cells below use it to split review text into sentences.
sent_detector = nltk.data.load(
    'tokenizers/punkt/english.pickle'
)

In [9]:
# Pull the first row of the subset to use as a single-row test case.
test_row = df_subset.head()

# display it
test_row


Out[9]:
Row(asin=u'1384719342', reviewText=u"Not much to write about here, but it does exactly what it's supposed to. filters out the pop sounds. now my recordings are much more crisp. it is one of the lowest prices pop filters on amazon so might as well buy it, they honestly work the same despite their pricing,")

In [15]:
# Try the Punkt sentence detector on the test row's review text,
# with leading/trailing whitespace stripped first.
sent_detector.tokenize(
    test_row.reviewText.strip()
)


Out[15]:
[u"Not much to write about here, but it does exactly what it's supposed to.",
 u'filters out the pop sounds.',
 u'now my recordings are much more crisp.',
 u'it is one of the lowest prices pop filters on amazon so might as well buy it, they honestly work the same despite their pricing,']

data frame


In [17]:
from pyspark.sql.functions import udf

In [20]:
# create udf
# The return type is declared explicitly: without it udf() defaults to
# StringType and the token list ends up stored as one stringified value
# instead of an array of sentence strings.
from pyspark.sql.types import ArrayType, StringType


def _sentence_tokens(text):
    """Split one review text into sentence tokens with the Punkt detector.

    Args:
        text: the review text, or None when the column value is null.

    Returns:
        A list of sentence strings, or None when ``text`` is None so that
        null reviews stay null instead of raising inside the UDF.
    """
    if text is None:
        return None
    # Tokenize the text itself (stripped, as in the single-row test) rather
    # than UTF-8 encoded bytes: bytes input makes the tokenizer hand back
    # byte strings and is rejected by str regex patterns on Python 3.
    return sent_detector.tokenize(text.strip())


punkt_udf = udf(_sentence_tokens, ArrayType(StringType()))

# apply udf, create new column
df_punkt = df_subset.withColumn("sentenceTokens", punkt_udf(df_subset["reviewText"]))

df_punkt.show(3)


+----------+--------------------+--------------------+
|      asin|          reviewText|      sentenceTokens|
+----------+--------------------+--------------------+
|1384719342|Not much to write...|[Not much to writ...|
|1384719342|The product does ...|[The product does...|
|1384719342|The primary job o...|[The primary job ...|
+----------+--------------------+--------------------+
only showing top 3 rows

Save


In [ ]:
#df_obj_only.write.json("s3a://amazon-review-data/review-data")