Script Development - addPosTags.py

Development notebook for the script that adds tokens and part-of-speech (POS) categories to the review data.


Setup


In [1]:
import pyspark as ps
from sentimentAnalysis import dataProcessing as dp

In [2]:
# create spark session from the notebook's existing SparkContext (sc)
spark = ps.sql.SparkSession(sc)
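
If no pre-existing SparkContext is available (for example, when the finished addPosTags.py runs outside this notebook), the session can be built directly; a minimal sketch using the builder API (the app name is arbitrary):

from pyspark.sql import SparkSession

# create (or reuse) a session without relying on a notebook-provided sc
spark = SparkSession.builder \
    .appName("addPosTags") \
    .getOrCreate()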

In [7]:
# get dataframes
# specify S3 as the source with the s3a:// prefix
#df = spark.read.json("s3a://amazon-review-data/user_dedup.json.gz")
#df_meta = spark.read.json("s3a://amazon-review-data/metadata.json.gz")

# get shard
df_raw_data = spark.read.json("s3a://amazon-review-data/reviews_Musical_Instruments_5.json.gz")

# subset asin, reviewText
df_subset = df_raw_data.select("asin", "reviewText")

df_tokens = dp.add_tokens(df_subset)
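
dp.add_tokens comes from this project's sentimentAnalysis package. Judging from the columns that appear in the dataframes below (cleanText, rawTokens, tokens), it is assumed to clean the review text, tokenize it, and remove stopwords. A rough, hypothetical sketch of such a pipeline (not the actual implementation):

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
from pyspark.sql import functions as F

def add_tokens_sketch(df):
    # hypothetical stand-in for dp.add_tokens; column names mirror the ones shown below
    df = df.withColumn("cleanText", F.regexp_replace("reviewText", r"[^a-zA-Z' ]", ""))
    tokenizer = RegexTokenizer(inputCol="cleanText", outputCol="rawTokens", pattern="\\s+")
    remover = StopWordsRemover(inputCol="rawTokens", outputCol="tokens")
    return remover.transform(tokenizer.transform(df))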


Development

Add tri-grams


In [8]:
from pyspark.ml.feature import NGram

In [10]:
# instantiate ngram object
ngram = NGram(n=3, inputCol="rawTokens", outputCol="triGrams")

# add ngrams
df_triGrams = ngram.transform(df_tokens)

In [11]:
df_triGrams.show(3)


+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      asin|          reviewText|           cleanText|           rawTokens|              tokens|            triGrams|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|1384719342|Not much to write...|Not much to write...|[not, much, to, w...|[much, write, exa...|[not much to, muc...|
|1384719342|The product does ...|The product does ...|[the, product, do...|[product, exactly...|[the product does...|
|1384719342|The primary job o...|The primary job o...|[the, primary, jo...|[primary, job, de...|[the primary job,...|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows

Add POS Tags

row


In [18]:
import nltk
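
Note: nltk.pos_tag relies on a pre-trained tagger model that has to be available wherever the tagging runs; on a Spark cluster that means every executor, not just the driver. Assuming a recent NLTK release (older releases ship a different default tagger), the one-time download looks like:

import nltk

# fetch the default POS tagger model used by nltk.pos_tag
# (run once per machine; on a cluster, on every worker node)
nltk.download('averaged_perceptron_tagger')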

In [13]:
# get test row
test_row = df_triGrams.first()

type(test_row["triGrams"])


Out[13]:
list

In [7]:
# test the tagger on a single row's tokens
nltk.pos_tag(test_row["tokens"])


Out[7]:
[(u'much', 'JJ'),
 (u'write', 'NN'),
 (u'exactly', 'RB'),
 (u"it's", 'VBZ'),
 (u'supposed', 'VBN'),
 (u'filters', 'NNS'),
 (u'pop', 'VBP'),
 (u'sounds', 'VBZ'),
 (u'recordings', 'NNS'),
 (u'much', 'RB'),
 (u'crisp', 'VBP'),
 (u'one', 'CD'),
 (u'lowest', 'JJS'),
 (u'prices', 'NNS'),
 (u'pop', 'NN'),
 (u'filters', 'NNS'),
 (u'amazon', 'VBP'),
 (u'might', 'MD'),
 (u'well', 'RB'),
 (u'buy', 'VB'),
 (u'honestly', 'RB'),
 (u'work', 'NN'),
 (u'despite', 'IN'),
 (u'pricing', 'VBG')]

data frame


In [32]:
from pyspark.sql.types import ArrayType, StringType

In [17]:
# create udf- nltk.pos_tag returns (word, tag) tuples, which Spark
# serializes as two-element string arrays under this return schema
pos_udf = ps.sql.functions.udf(lambda x: nltk.pos_tag(x), ArrayType(ArrayType(StringType())))

# apply udf, create new column
df_posTag = df_tokens.withColumn("posTags", pos_udf(df_tokens["tokens"]))

df_posTag.show(3)


+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      asin|          reviewText|           cleanText|           rawTokens|              tokens|             posTags|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|1384719342|Not much to write...|Not much to write...|[not, much, to, w...|[much, write, exa...|[WrappedArray(muc...|
|1384719342|The product does ...|The product does ...|[the, product, do...|[product, exactly...|[WrappedArray(pro...|
|1384719342|The primary job o...|The primary job o...|[the, primary, jo...|[primary, job, de...|[WrappedArray(pri...|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows


In [19]:
df_posTag.select("posTags").first()


Out[19]:
Row(posTags=[[u'much', u'JJ'], [u'write', u'NN'], [u'exactly', u'RB'], [u"it's", u'VBZ'], [u'supposed', u'VBN'], [u'filters', u'NNS'], [u'pop', u'VBP'], [u'sounds', u'VBZ'], [u'recordings', u'NNS'], [u'much', u'RB'], [u'crisp', u'VBP'], [u'one', u'CD'], [u'lowest', u'JJS'], [u'prices', u'NNS'], [u'pop', u'NN'], [u'filters', u'NNS'], [u'amazon', u'VBP'], [u'might', u'MD'], [u'well', u'RB'], [u'buy', u'VB'], [u'honestly', u'RB'], [u'work', u'NN'], [u'despite', u'IN'], [u'pricing', u'VBG']])

Tri Gram POS Tags

row


In [15]:
test_row["triGrams"][:10]


Out[15]:
[u'not much to',
 u'much to write',
 u'to write about',
 u'write about here',
 u'about here but',
 u'here but it',
 u'but it does',
 u'it does exactly',
 u'does exactly what',
 u"exactly what it's"]

In [28]:
def tag_triGrams(triGrams):
    # POS-tag each tri-gram independently, splitting the tri-gram string back into its three words
    tagged = []
    for triGram in triGrams:
        tagged.append(nltk.pos_tag(triGram.split()))

    return tagged

In [21]:
test_row["triGrams"][0].split()


Out[21]:
[u'not', u'much', u'to']

In [30]:
tag_triGrams(test_row["triGrams"])[:10]


Out[30]:
[[(u'not', 'RB'), (u'much', 'JJ'), (u'to', 'TO')],
 [(u'much', 'JJ'), (u'to', 'TO'), (u'write', 'VB')],
 [(u'to', 'TO'), (u'write', 'VB'), (u'about', 'IN')],
 [(u'write', 'NN'), (u'about', 'IN'), (u'here', 'RB')],
 [(u'about', 'IN'), (u'here', 'RB'), (u'but', 'CC')],
 [(u'here', 'RB'), (u'but', 'CC'), (u'it', 'PRP')],
 [(u'but', 'CC'), (u'it', 'PRP'), (u'does', 'VBZ')],
 [(u'it', 'PRP'), (u'does', 'VBZ'), (u'exactly', 'RB')],
 [(u'does', 'VBZ'), (u'exactly', 'RB'), (u'what', 'WP')],
 [(u'exactly', 'RB'), (u'what', 'WP'), (u"it's", 'NN')]]

In [38]:
# create udf
pos_triTag_udf = ps.sql.functions.udf(lambda x: tag_triGrams(x), ArrayType(ArrayType(ArrayType(StringType()))))

# apply udf, create new column
df_triPosTags = df_triGrams.withColumn("triPosTags", pos_triTag_udf(df_triGrams["triGrams"]))

df_triPosTags.show(3)


+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      asin|          reviewText|           cleanText|           rawTokens|              tokens|            triGrams|          triPosTags|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|1384719342|Not much to write...|Not much to write...|[not, much, to, w...|[much, write, exa...|[not much to, muc...|[WrappedArray(Wra...|
|1384719342|The product does ...|The product does ...|[the, product, do...|[product, exactly...|[the product does...|[WrappedArray(Wra...|
|1384719342|The primary job o...|The primary job o...|[the, primary, jo...|[primary, job, de...|[the primary job,...|[WrappedArray(Wra...|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows


In [39]:
test_row = df_triPosTags.first()

In [40]:
test_row["triPosTags"]


Out[40]:
[[[u'not', u'RB'], [u'much', u'JJ'], [u'to', u'TO']],
 [[u'much', u'JJ'], [u'to', u'TO'], [u'write', u'VB']],
 [[u'to', u'TO'], [u'write', u'VB'], [u'about', u'IN']],
 [[u'write', u'NN'], [u'about', u'IN'], [u'here', u'RB']],
 [[u'about', u'IN'], [u'here', u'RB'], [u'but', u'CC']],
 [[u'here', u'RB'], [u'but', u'CC'], [u'it', u'PRP']],
 [[u'but', u'CC'], [u'it', u'PRP'], [u'does', u'VBZ']],
 [[u'it', u'PRP'], [u'does', u'VBZ'], [u'exactly', u'RB']],
 [[u'does', u'VBZ'], [u'exactly', u'RB'], [u'what', u'WP']],
 [[u'exactly', u'RB'], [u'what', u'WP'], [u"it's", u'NN']],
 [[u'what', u'WP'], [u"it's", u'NN'], [u'supposed', u'VBD']],
 [[u"it's", u'NN'], [u'supposed', u'VBD'], [u'to', u'TO']],
 [[u'supposed', u'VBN'], [u'to', u'TO'], [u'filters', u'NNS']],
 [[u'to', u'TO'], [u'filters', u'NNS'], [u'out', u'RP']],
 [[u'filters', u'NNS'], [u'out', u'RP'], [u'the', u'DT']],
 [[u'out', u'IN'], [u'the', u'DT'], [u'pop', u'NN']],
 [[u'the', u'DT'], [u'pop', u'NN'], [u'sounds', u'NNS']],
 [[u'pop', u'NN'], [u'sounds', u'NNS'], [u'now', u'RB']],
 [[u'sounds', u'NNS'], [u'now', u'RB'], [u'my', u'PRP$']],
 [[u'now', u'RB'], [u'my', u'PRP$'], [u'recordings', u'NNS']],
 [[u'my', u'PRP$'], [u'recordings', u'NNS'], [u'are', u'VBP']],
 [[u'recordings', u'NNS'], [u'are', u'VBP'], [u'much', u'JJ']],
 [[u'are', u'VBP'], [u'much', u'RB'], [u'more', u'RBR']],
 [[u'much', u'RB'], [u'more', u'RBR'], [u'crisp', u'JJ']],
 [[u'more', u'RBR'], [u'crisp', u'NNS'], [u'it', u'PRP']],
 [[u'crisp', u'NN'], [u'it', u'PRP'], [u'is', u'VBZ']],
 [[u'it', u'PRP'], [u'is', u'VBZ'], [u'one', u'CD']],
 [[u'is', u'VBZ'], [u'one', u'CD'], [u'of', u'IN']],
 [[u'one', u'CD'], [u'of', u'IN'], [u'the', u'DT']],
 [[u'of', u'IN'], [u'the', u'DT'], [u'lowest', u'JJS']],
 [[u'the', u'DT'], [u'lowest', u'JJS'], [u'prices', u'NNS']],
 [[u'lowest', u'JJS'], [u'prices', u'NNS'], [u'pop', u'NN']],
 [[u'prices', u'NNS'], [u'pop', u'VBP'], [u'filters', u'NNS']],
 [[u'pop', u'NN'], [u'filters', u'NNS'], [u'on', u'IN']],
 [[u'filters', u'NNS'], [u'on', u'IN'], [u'amazon', u'NN']],
 [[u'on', u'IN'], [u'amazon', u'NNS'], [u'so', u'RB']],
 [[u'amazon', u'NNS'], [u'so', u'RB'], [u'might', u'MD']],
 [[u'so', u'RB'], [u'might', u'MD'], [u'as', u'IN']],
 [[u'might', u'MD'], [u'as', u'RB'], [u'well', u'RB']],
 [[u'as', u'IN'], [u'well', u'RB'], [u'buy', u'VB']],
 [[u'well', u'RB'], [u'buy', u'VB'], [u'it', u'PRP']],
 [[u'buy', u'VB'], [u'it', u'PRP'], [u'they', u'PRP']],
 [[u'it', u'PRP'], [u'they', u'PRP'], [u'honestly', u'RB']],
 [[u'they', u'PRP'], [u'honestly', u'RB'], [u'work', u'VBP']],
 [[u'honestly', u'RB'], [u'work', u'VBZ'], [u'the', u'DT']],
 [[u'work', u'NN'], [u'the', u'DT'], [u'same', u'JJ']],
 [[u'the', u'DT'], [u'same', u'JJ'], [u'despite', u'IN']],
 [[u'same', u'JJ'], [u'despite', u'IN'], [u'their', u'PRP$']],
 [[u'despite', u'IN'], [u'their', u'PRP$'], [u'pricing', u'NN']]]

Function


In [40]:
# import nltk
# from pyspark.sql.types import ArrayType, StringType

def addPosTags(df_tokens):
    # create udf
    pos_udf = ps.sql.functions.udf(lambda x: nltk.pos_tag(x), ArrayType(ArrayType(StringType())))

    # apply udf, create new column
    df_posTag = df_tokens.withColumn("posTags", pos_udf(df_tokens["tokens"]))
    df_posTag = df_posTag.withColumn("raw_posTags", pos_udf(df_tokens["rawTokens"]))
    
    return df_posTag

In [41]:
# test
df_posTag = addPosTags(df_tokens)

df_posTag.show(3)


+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      asin|          reviewText|           cleanText|           rawTokens|              tokens|             posTags|         raw_posTags|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|1384719342|Not much to write...|Not much to write...|[not, much, to, w...|[much, write, exa...|[WrappedArray(muc...|[WrappedArray(not...|
|1384719342|The product does ...|The product does ...|[the, product, do...|[product, exactly...|[WrappedArray(pro...|[WrappedArray(the...|
|1384719342|The primary job o...|The primary job o...|[the, primary, jo...|[primary, job, de...|[WrappedArray(pri...|[WrappedArray(the...|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows


Filter POS Tags

We are interested in nouns and adjectives: nouns identify product features, and adjectives express customer opinions about those features.

However, we cannot rely on consecutive adjective/noun or noun/adjective pairs alone. Consider the phrase "The chair was not great." If we extracted only the noun "chair" and the adjective "great", the resulting pair "chair great" would not reflect the sentiment of the sentence: the adverb "not" negates the positive connotation of "great". This is one of several ways in which the meaning of an adjective/noun pair is shaped by neighboring words.

We therefore need a set of POS tag sequences that picks out the patterns we are interested in. Thankfully, such a set exists (Turney, 2002), and we can use it here:

Word 1             Word 2                  Word 3
JJ                 NN or NNS               anything
RB, RBR, or RBS    JJ                      not NN or NNS
JJ                 JJ                      not NN or NNS
NN or NNS          JJ                      not NN or NNS
RB, RBR, or RBS    VB, VBN, VBD, or VBG    anything



Citations

Turney, Peter D. 2002. "Thumbs Up or Thumbs Down? Semantic Orientation Applied to Unsupervised Classification of Reviews." Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics (ACL '02), Philadelphia, Pennsylvania, USA, July 8-10, 2002, pp. 417-424. NRC 44946.

Wang, Jingye, and Heng Ren. "Feature-based Customer Review Mining." Department of Computer Science, Stanford University.


Identify Tag Sequences

Sequence Regex Patterns


In [50]:
# regex patterns for the (word 1, word 2, word 3) tag sequences above;
# note that re.match is a prefix match, so "NN" also covers NNS/NNP unless anchored with $
tag_seqs_re = [('JJ', '^(NN|NNS)', '.*'),
               ('^(RB|RBR|RBS)', 'JJ', '^(?!(NN|NNS)).*'),
               ('JJ', 'JJ', '^(?!(NN|NNS)).*'),
               ('^(NN|NNS)', 'JJ', '^(?!(NN|NNS)).*'),
               ('^(RB|RBR|RBS)', '^(VB|VBN|VBD|VBG)', '.*')
              ]

Test on Row


In [52]:
# get python regex
import re

In [42]:
# get test row
test_row = df_triPosTags.first()

In [42]:
# check the tri-gram tags- we want the tagged raw tokens (stopwords not removed)
test_row["triPosTags"][:10]


Out[42]:
[[[u'not', u'RB'], [u'much', u'JJ'], [u'to', u'TO']],
 [[u'much', u'JJ'], [u'to', u'TO'], [u'write', u'VB']],
 [[u'to', u'TO'], [u'write', u'VB'], [u'about', u'IN']],
 [[u'write', u'NN'], [u'about', u'IN'], [u'here', u'RB']],
 [[u'about', u'IN'], [u'here', u'RB'], [u'but', u'CC']],
 [[u'here', u'RB'], [u'but', u'CC'], [u'it', u'PRP']],
 [[u'but', u'CC'], [u'it', u'PRP'], [u'does', u'VBZ']],
 [[u'it', u'PRP'], [u'does', u'VBZ'], [u'exactly', u'RB']],
 [[u'does', u'VBZ'], [u'exactly', u'RB'], [u'what', u'WP']],
 [[u'exactly', u'RB'], [u'what', u'WP'], [u"it's", u'NN']]]

In [66]:
# function to check whether a tagged triGram matches a single tag sequence
def is_match(triPosTag, seq):
    # iterate over the tags in triPosTag
    for i, el in enumerate(triPosTag):
        print(el[1] + " match " + seq[i])
        # return False if the tag does not match the corresponding regex pattern
        if not re.match(seq[i], el[1]):
            return False

    # return True only if every tag matched its pattern
    return True


# check a tagged triGram against all of the target tag sequences
def match_pos_seq(taggedTriGram):
    for seq in tag_seqs_re:
        if is_match(taggedTriGram, seq):
            return True
    return False

In [70]:
# get test tag
test_triPosTag = test_row["triPosTags"][0]

# create test match tag
test_triPosTag_match = [["a", "NN"], ["b", "JJ"], ["c", "RR"]]

In [71]:
# test regex match works
tag_seqs_re[3]


Out[71]:
('^(NN|NNS)', 'JJ', '^(?!(NN|NNS)).*')

In [80]:
re.match(tag_seqs_re[3][0], "NN")


Out[80]:
<_sre.SRE_Match at 0x12147f558>

In [69]:
# test is_match()

is_match(test_triPosTag_match, tag_seqs_re[3])


NN match ^(NN|NNS)
JJ match JJ
RR match ^(?!(NN|NNS)).*
Out[69]:
True
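
A plausible next step (not implemented in this notebook yet) is to wrap the pattern matching in a UDF and keep, for each review, the two-word phrases whose tri-gram tag sequence fits one of the Turney patterns. A rough sketch, reusing re, ps, tag_seqs_re, and df_triPosTags from above; extract_phrases and the phrases column are hypothetical names:

from pyspark.sql.types import ArrayType, StringType

def extract_phrases(triPosTags):
    # keep "word1 word2" for every tagged tri-gram whose tag sequence
    # matches one of the Turney-style patterns in tag_seqs_re
    phrases = []
    for triPosTag in triPosTags:
        for seq in tag_seqs_re:
            if all(re.match(seq[i], pair[1]) for i, pair in enumerate(triPosTag)):
                phrases.append(triPosTag[0][0] + " " + triPosTag[1][0])
                break
    return phrases

extract_phrases_udf = ps.sql.functions.udf(extract_phrases, ArrayType(StringType()))
df_phrases = df_triPosTags.withColumn("phrases", extract_phrases_udf(df_triPosTags["triPosTags"]))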


Save


In [ ]:
#df_obj_only.write.json("s3a://amazon-review-data/review-data")