Script Development - addPosTags.py

Development notebook for the script that adds tokens and part-of-speech (POS) categories to the review data.


Setup


In [1]:
import pyspark as ps
from sentimentAnalysis import dataProcessing as dp

In [2]:
# create spark session from the notebook's existing SparkContext (sc)
spark = ps.sql.SparkSession(sc)
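
If no pre-existing SparkContext is available (for example, when the finished addPosTags.py runs outside this notebook), the session can be built directly; a minimal sketch using the builder API (the app name is arbitrary):

from pyspark.sql import SparkSession

# create (or reuse) a session without relying on a notebook-provided sc
spark = SparkSession.builder \
    .appName("addPosTags") \
    .getOrCreate()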

In [7]:
# get dataframes
# specify S3 as the source with the s3a:// prefix
#df = spark.read.json("s3a://amazon-review-data/user_dedup.json.gz")
#df_meta = spark.read.json("s3a://amazon-review-data/metadata.json.gz")

# get shard
df_raw_data = spark.read.json("s3a://amazon-review-data/reviews_Musical_Instruments_5.json.gz")

# subset asin, reviewText
df_subset = df_raw_data.select("asin", "reviewText")

df_tokens = dp.add_tokens(df_subset)
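
dp.add_tokens comes from this project's sentimentAnalysis package. Judging from the columns that appear in the dataframes below (cleanText, rawTokens, tokens), it is assumed to clean the review text, tokenize it, and remove stopwords. A rough, hypothetical sketch of such a pipeline (not the actual implementation):

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
from pyspark.sql import functions as F

def add_tokens_sketch(df):
    # hypothetical stand-in for dp.add_tokens; column names mirror the ones shown below
    df = df.withColumn("cleanText", F.regexp_replace("reviewText", r"[^a-zA-Z' ]", ""))
    tokenizer = RegexTokenizer(inputCol="cleanText", outputCol="rawTokens", pattern="\\s+")
    remover = StopWordsRemover(inputCol="rawTokens", outputCol="tokens")
    return remover.transform(tokenizer.transform(df))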


Development

Add tri-grams


In [8]:
from pyspark.ml.feature import NGram

In [10]:
# instantiate ngram object
ngram = NGram(n=3, inputCol="rawTokens", outputCol="triGrams")

# add ngrams
df_triGrams = ngram.transform(df_tokens)

In [11]:
df_triGrams.show(3)


+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      asin|          reviewText|           cleanText|           rawTokens|              tokens|            triGrams|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|1384719342|Not much to write...|Not much to write...|[not, much, to, w...|[much, write, exa...|[not much to, muc...|
|1384719342|The product does ...|The product does ...|[the, product, do...|[product, exactly...|[the product does...|
|1384719342|The primary job o...|The primary job o...|[the, primary, jo...|[primary, job, de...|[the primary job,...|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows

Add POS Tags

row


In [18]:
import nltk
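
Note: nltk.pos_tag relies on a pre-trained tagger model that has to be available wherever the tagging runs; on a Spark cluster that means every executor, not just the driver. Assuming a recent NLTK release (older releases ship a different default tagger), the one-time download looks like:

import nltk

# fetch the default POS tagger model used by nltk.pos_tag
# (run once per machine; on a cluster, on every worker node)
nltk.download('averaged_perceptron_tagger')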

In [13]:
# get test row
test_row = df_triGrams.first()

type(test_row["triGrams"])


Out[13]:
list

In [7]:
# test the tagger on a single row's tokens
nltk.pos_tag(test_row["tokens"])


Out[7]:
[(u'much', 'JJ'),
 (u'write', 'NN'),
 (u'exactly', 'RB'),
 (u"it's", 'VBZ'),
 (u'supposed', 'VBN'),
 (u'filters', 'NNS'),
 (u'pop', 'VBP'),
 (u'sounds', 'VBZ'),
 (u'recordings', 'NNS'),
 (u'much', 'RB'),
 (u'crisp', 'VBP'),
 (u'one', 'CD'),
 (u'lowest', 'JJS'),
 (u'prices', 'NNS'),
 (u'pop', 'NN'),
 (u'filters', 'NNS'),
 (u'amazon', 'VBP'),
 (u'might', 'MD'),
 (u'well', 'RB'),
 (u'buy', 'VB'),
 (u'honestly', 'RB'),
 (u'work', 'NN'),
 (u'despite', 'IN'),
 (u'pricing', 'VBG')]

data frame


In [32]:
from pyspark.sql.types import ArrayType, StringType

In [17]:
# create udf- nltk.pos_tag returns (word, tag) tuples, which Spark
# serializes as two-element string arrays under this return schema
pos_udf = ps.sql.functions.udf(lambda x: nltk.pos_tag(x), ArrayType(ArrayType(StringType())))

# apply udf, create new column
df_posTag = df_tokens.withColumn("posTags", pos_udf(df_tokens["tokens"]))

df_posTag.show(3)


+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      asin|          reviewText|           cleanText|           rawTokens|              tokens|             posTags|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|1384719342|Not much to write...|Not much to write...|[not, much, to, w...|[much, write, exa...|[WrappedArray(muc...|
|1384719342|The product does ...|The product does ...|[the, product, do...|[product, exactly...|[WrappedArray(pro...|
|1384719342|The primary job o...|The primary job o...|[the, primary, jo...|[primary, job, de...|[WrappedArray(pri...|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows


In [19]:
df_posTag.select("posTags").first()


Out[19]:
Row(posTags=[[u'much', u'JJ'], [u'write', u'NN'], [u'exactly', u'RB'], [u"it's", u'VBZ'], [u'supposed', u'VBN'], [u'filters', u'NNS'], [u'pop', u'VBP'], [u'sounds', u'VBZ'], [u'recordings', u'NNS'], [u'much', u'RB'], [u'crisp', u'VBP'], [u'one', u'CD'], [u'lowest', u'JJS'], [u'prices', u'NNS'], [u'pop', u'NN'], [u'filters', u'NNS'], [u'amazon', u'VBP'], [u'might', u'MD'], [u'well', u'RB'], [u'buy', u'VB'], [u'honestly', u'RB'], [u'work', u'NN'], [u'despite', u'IN'], [u'pricing', u'VBG']])

Tri Gram POS Tags

row


In [15]:
test_row["triGrams"][:10]


Out[15]:
[u'not much to',
 u'much to write',
 u'to write about',
 u'write about here',
 u'about here but',
 u'here but it',
 u'but it does',
 u'it does exactly',
 u'does exactly what',
 u"exactly what it's"]

In [28]:
def tag_triGrams(triGrams):
    # POS-tag each tri-gram independently, splitting the tri-gram string back into its three words
    tagged = []
    for triGram in triGrams:
        tagged.append(nltk.pos_tag(triGram.split()))

    return tagged

In [21]:
test_row["triGrams"][0].split()


Out[21]:
[u'not', u'much', u'to']

In [30]:
tag_triGrams(test_row["triGrams"])[:10]


Out[30]:
[[(u'not', 'RB'), (u'much', 'JJ'), (u'to', 'TO')],
 [(u'much', 'JJ'), (u'to', 'TO'), (u'write', 'VB')],
 [(u'to', 'TO'), (u'write', 'VB'), (u'about', 'IN')],
 [(u'write', 'NN'), (u'about', 'IN'), (u'here', 'RB')],
 [(u'about', 'IN'), (u'here', 'RB'), (u'but', 'CC')],
 [(u'here', 'RB'), (u'but', 'CC'), (u'it', 'PRP')],
 [(u'but', 'CC'), (u'it', 'PRP'), (u'does', 'VBZ')],
 [(u'it', 'PRP'), (u'does', 'VBZ'), (u'exactly', 'RB')],
 [(u'does', 'VBZ'), (u'exactly', 'RB'), (u'what', 'WP')],
 [(u'exactly', 'RB'), (u'what', 'WP'), (u"it's", 'NN')]]

In [38]:
# create udf
pos_triTag_udf = ps.sql.functions.udf(lambda x: tag_triGrams(x), ArrayType(ArrayType(ArrayType(StringType()))))

# apply udf, create new column
df_triPosTags = df_triGrams.withColumn("triPosTags", pos_triTag_udf(df_triGrams["triGrams"]))

df_triPosTags.show(3)


+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      asin|          reviewText|           cleanText|           rawTokens|              tokens|            triGrams|          triPosTags|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|1384719342|Not much to write...|Not much to write...|[not, much, to, w...|[much, write, exa...|[not much to, muc...|[WrappedArray(Wra...|
|1384719342|The product does ...|The product does ...|[the, product, do...|[product, exactly...|[the product does...|[WrappedArray(Wra...|
|1384719342|The primary job o...|The primary job o...|[the, primary, jo...|[primary, job, de...|[the primary job,...|[WrappedArray(Wra...|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows


In [39]:
test_row = df_triPosTags.first()

In [40]:
test_row["triPosTags"]


Out[40]:
[[[u'not', u'RB'], [u'much', u'JJ'], [u'to', u'TO']],
 [[u'much', u'JJ'], [u'to', u'TO'], [u'write', u'VB']],
 [[u'to', u'TO'], [u'write', u'VB'], [u'about', u'IN']],
 [[u'write', u'NN'], [u'about', u'IN'], [u'here', u'RB']],
 [[u'about', u'IN'], [u'here', u'RB'], [u'but', u'CC']],
 [[u'here', u'RB'], [u'but', u'CC'], [u'it', u'PRP']],
 [[u'but', u'CC'], [u'it', u'PRP'], [u'does', u'VBZ']],
 [[u'it', u'PRP'], [u'does', u'VBZ'], [u'exactly', u'RB']],
 [[u'does', u'VBZ'], [u'exactly', u'RB'], [u'what', u'WP']],
 [[u'exactly', u'RB'], [u'what', u'WP'], [u"it's", u'NN']],
 [[u'what', u'WP'], [u"it's", u'NN'], [u'supposed', u'VBD']],
 [[u"it's", u'NN'], [u'supposed', u'VBD'], [u'to', u'TO']],
 [[u'supposed', u'VBN'], [u'to', u'TO'], [u'filters', u'NNS']],
 [[u'to', u'TO'], [u'filters', u'NNS'], [u'out', u'RP']],
 [[u'filters', u'NNS'], [u'out', u'RP'], [u'the', u'DT']],
 [[u'out', u'IN'], [u'the', u'DT'], [u'pop', u'NN']],
 [[u'the', u'DT'], [u'pop', u'NN'], [u'sounds', u'NNS']],
 [[u'pop', u'NN'], [u'sounds', u'NNS'], [u'now', u'RB']],
 [[u'sounds', u'NNS'], [u'now', u'RB'], [u'my', u'PRP$']],
 [[u'now', u'RB'], [u'my', u'PRP$'], [u'recordings', u'NNS']],
 [[u'my', u'PRP$'], [u'recordings', u'NNS'], [u'are', u'VBP']],
 [[u'recordings', u'NNS'], [u'are', u'VBP'], [u'much', u'JJ']],
 [[u'are', u'VBP'], [u'much', u'RB'], [u'more', u'RBR']],
 [[u'much', u'RB'], [u'more', u'RBR'], [u'crisp', u'JJ']],
 [[u'more', u'RBR'], [u'crisp', u'NNS'], [u'it', u'PRP']],
 [[u'crisp', u'NN'], [u'it', u'PRP'], [u'is', u'VBZ']],
 [[u'it', u'PRP'], [u'is', u'VBZ'], [u'one', u'CD']],
 [[u'is', u'VBZ'], [u'one', u'CD'], [u'of', u'IN']],
 [[u'one', u'CD'], [u'of', u'IN'], [u'the', u'DT']],
 [[u'of', u'IN'], [u'the', u'DT'], [u'lowest', u'JJS']],
 [[u'the', u'DT'], [u'lowest', u'JJS'], [u'prices', u'NNS']],
 [[u'lowest', u'JJS'], [u'prices', u'NNS'], [u'pop', u'NN']],
 [[u'prices', u'NNS'], [u'pop', u'VBP'], [u'filters', u'NNS']],
 [[u'pop', u'NN'], [u'filters', u'NNS'], [u'on', u'IN']],
 [[u'filters', u'NNS'], [u'on', u'IN'], [u'amazon', u'NN']],
 [[u'on', u'IN'], [u'amazon', u'NNS'], [u'so', u'RB']],
 [[u'amazon', u'NNS'], [u'so', u'RB'], [u'might', u'MD']],
 [[u'so', u'RB'], [u'might', u'MD'], [u'as', u'IN']],
 [[u'might', u'MD'], [u'as', u'RB'], [u'well', u'RB']],
 [[u'as', u'IN'], [u'well', u'RB'], [u'buy', u'VB']],
 [[u'well', u'RB'], [u'buy', u'VB'], [u'it', u'PRP']],
 [[u'buy', u'VB'], [u'it', u'PRP'], [u'they', u'PRP']],
 [[u'it', u'PRP'], [u'they', u'PRP'], [u'honestly', u'RB']],
 [[u'they', u'PRP'], [u'honestly', u'RB'], [u'work', u'VBP']],
 [[u'honestly', u'RB'], [u'work', u'VBZ'], [u'the', u'DT']],
 [[u'work', u'NN'], [u'the', u'DT'], [u'same', u'JJ']],
 [[u'the', u'DT'], [u'same', u'JJ'], [u'despite', u'IN']],
 [[u'same', u'JJ'], [u'despite', u'IN'], [u'their', u'PRP$']],
 [[u'despite', u'IN'], [u'their', u'PRP$'], [u'pricing', u'NN']]]

Function


In [40]:
# import nltk
# from pyspark.sql.types import ArrayType, StringType

def addPosTags(df_tokens):
    # create udf
    pos_udf = ps.sql.functions.udf(lambda x: nltk.pos_tag(x), ArrayType(ArrayType(StringType())))

    # apply udf, create new column
    df_posTag = df_tokens.withColumn("posTags", pos_udf(df_tokens["tokens"]))
    df_posTag = df_posTag.withColumn("raw_posTags", pos_udf(df_tokens["rawTokens"]))
    
    return df_posTag

In [41]:
# test
df_posTag = addPosTags(df_tokens)

df_posTag.show(3)


+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      asin|          reviewText|           cleanText|           rawTokens|              tokens|             posTags|         raw_posTags|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|1384719342|Not much to write...|Not much to write...|[not, much, to, w...|[much, write, exa...|[WrappedArray(muc...|[WrappedArray(not...|
|1384719342|The product does ...|The product does ...|[the, product, do...|[product, exactly...|[WrappedArray(pro...|[WrappedArray(the...|
|1384719342|The primary job o...|The primary job o...|[the, primary, jo...|[primary, job, de...|[WrappedArray(pri...|[WrappedArray(the...|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows


Filter POS Tags

We are interested in nouns and adjectives: nouns identify product features, and adjectives express customer opinions about those features.

However, we cannot rely on consecutive adjective/noun or noun/adjective pairs alone. Consider the phrase "The chair was not great." If we extracted only the noun "chair" and the adjective "great", the resulting pair "chair great" would not reflect the sentiment of the sentence: the adverb "not" negates the positive connotation of "great". This is one of several ways in which the meaning of an adjective/noun pair is shaped by neighboring words.

We therefore need a set of POS tag sequences that picks out the patterns we are interested in. Thankfully, such a set exists (Turney, 2002), and we can use it here:

Word 1             Word 2                  Word 3
JJ                 NN or NNS               anything
RB, RBR, or RBS    JJ                      not NN or NNS
JJ                 JJ                      not NN or NNS
NN or NNS          JJ                      not NN or NNS
RB, RBR, or RBS    VB, VBN, VBD, or VBG    anything



Citations

Turney, Peter D. 2002. "Thumbs Up or Thumbs Down? Semantic Orientation Applied to Unsupervised Classification of Reviews." Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics (ACL '02), Philadelphia, Pennsylvania, USA, July 8-10, 2002, pp. 417-424. NRC 44946.

Wang, Jingye, and Heng Ren. "Feature-based Customer Review Mining." Department of Computer Science, Stanford University.


Identify Tag Sequences

Sequence Regex Patterns


In [50]:
# regex patterns for the (word 1, word 2, word 3) tag sequences above;
# note that re.match is a prefix match, so "NN" also covers NNS/NNP unless anchored with $
tag_seqs_re = [('JJ', '^(NN|NNS)', '.*'),
               ('^(RB|RBR|RBS)', 'JJ', '^(?!(NN|NNS)).*'),
               ('JJ', 'JJ', '^(?!(NN|NNS)).*'),
               ('^(NN|NNS)', 'JJ', '^(?!(NN|NNS)).*'),
               ('^(RB|RBR|RBS)', '^(VB|VBN|VBD|VBG)', '.*')
              ]

Test on Row


In [52]:
# get python regex
import re

In [42]:
# get test row
test_row = df_triPosTags.first()

In [42]:
# check the tri-gram tags- we want the tagged raw tokens (stopwords not removed)
test_row["triPosTags"][:10]


Out[42]:
[[[u'not', u'RB'], [u'much', u'JJ'], [u'to', u'TO']],
 [[u'much', u'JJ'], [u'to', u'TO'], [u'write', u'VB']],
 [[u'to', u'TO'], [u'write', u'VB'], [u'about', u'IN']],
 [[u'write', u'NN'], [u'about', u'IN'], [u'here', u'RB']],
 [[u'about', u'IN'], [u'here', u'RB'], [u'but', u'CC']],
 [[u'here', u'RB'], [u'but', u'CC'], [u'it', u'PRP']],
 [[u'but', u'CC'], [u'it', u'PRP'], [u'does', u'VBZ']],
 [[u'it', u'PRP'], [u'does', u'VBZ'], [u'exactly', u'RB']],
 [[u'does', u'VBZ'], [u'exactly', u'RB'], [u'what', u'WP']],
 [[u'exactly', u'RB'], [u'what', u'WP'], [u"it's", u'NN']]]

In [66]:
# function to check whether a tagged triGram matches a single tag sequence
def is_match(triPosTag, seq):
    # iterate over the tags in triPosTag
    for i, el in enumerate(triPosTag):
        print(el[1] + " match " + seq[i])
        # return False if the tag does not match the corresponding regex pattern
        if not re.match(seq[i], el[1]):
            return False

    # return True only if every tag matched its pattern
    return True


# check a tagged triGram against all of the target tag sequences
def match_pos_seq(taggedTriGram):
    for seq in tag_seqs_re:
        if is_match(taggedTriGram, seq):
            return True
    return False

In [70]:
# get test tag
test_triPosTag = test_row["triPosTags"][0]

# create test match tag
test_triPosTag_match = [["a", "NN"], ["b", "JJ"], ["c", "RR"]]

In [71]:
# test regex match works
tag_seqs_re[3]


Out[71]:
('^(NN|NNS)', 'JJ', '^(?!(NN|NNS)).*')

In [80]:
re.match(tag_seqs_re[3][0], "NN")


Out[80]:
<_sre.SRE_Match at 0x12147f558>

In [69]:
# test is_match()

is_match(test_triPosTag_match, tag_seqs_re[3])


NN match ^(NN|NNS)
JJ match JJ
RR match ^(?!(NN|NNS)).*
Out[69]:
True
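
A plausible next step (not implemented in this notebook yet) is to wrap the pattern matching in a UDF and keep, for each review, the two-word phrases whose tri-gram tag sequence fits one of the Turney patterns. A rough sketch, reusing re, ps, tag_seqs_re, and df_triPosTags from above; extract_phrases and the phrases column are hypothetical names:

from pyspark.sql.types import ArrayType, StringType

def extract_phrases(triPosTags):
    # keep "word1 word2" for every tagged tri-gram whose tag sequence
    # matches one of the Turney-style patterns in tag_seqs_re
    phrases = []
    for triPosTag in triPosTags:
        for seq in tag_seqs_re:
            if all(re.match(seq[i], pair[1]) for i, pair in enumerate(triPosTag)):
                phrases.append(triPosTag[0][0] + " " + triPosTag[1][0])
                break
    return phrases

extract_phrases_udf = ps.sql.functions.udf(extract_phrases, ArrayType(StringType()))
df_phrases = df_triPosTags.withColumn("phrases", extract_phrases_udf(df_triPosTags["triPosTags"]))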


Save


In [ ]:
#df_obj_only.write.json("s3a://amazon-review-data/review-data")