Computing Word Frequency for Brand Perception Interviews amongst key stakeholders NYCAASC

Importing Libraries


In [1]:
import re
from nltk import word_tokenize, pos_tag
import math
import string
from collections import Counter
import pandas as pd

In [3]:
# only need to run once
nltk.download()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-60474dd24e0a> in <module>()
      1 # only need to run once
----> 2 nltk.download()

NameError: name 'nltk' is not defined

Reading in Data


In [3]:
df = pd.read_csv("/Users/chuamelia/Downloads/act_reviews.tsv",sep="\t")

Looking at Structure of Data


In [4]:
df.head(1)


Out[4]:
1 2 3 4
0 Everything! It worked out smoothly Asian American Media The speakers were interes... Nothing, its great! More workshops? More entertainment, like performers during sna...

Prepare Functions for Analysis


In [14]:
#Write function to append all tokens to one list.
def stuff_tokenizer(column, list):
    discard = ['IN', ',','.','TO', 'DT', 'PRP', 'CC', 'are', 'is', 'um', u'it\u2019s', 'PRP$']
    end_num = len(column)
    temp = []
    for i in range(end_num): #append to one list
        temp.extend(word_tokenize(str(column[i]).decode('utf-8')))
    temp2 = pos_tag(temp) #tag words
    for i in temp2: #discard prepositions, articles, etc.
        if i[1] not in discard and i[0] not in discard: 
            list.append(i)

In [6]:
#add decode('utf-8') bc "\xe2\x80\x99" interprtation
#ascii' codec can't decode byte
#example: df['first_Time'][0]

Create empty lists for stuffing.


In [10]:
q1 = []
q2 = []
q3 = []
q4 = []

Stuff Away!


In [11]:
stuff_tokenizer(df['1'],q1)
stuff_tokenizer(df['2'],q2)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-11-332fe8393c09> in <module>()
      1 stuff_tokenizer(df['1'],q1)
      2 stuff_tokenizer(df['2'],q2)
----> 3 stuff_tokenizer(df['3'],q3)
      4 stuff_tokenizer(df['4'],q4)

<ipython-input-5-53e0456a0fae> in stuff_tokenizer(column, list)
      5     temp = []
      6     for i in range(end_num): #append to one list
----> 7         temp.extend(word_tokenize(column[i].decode('utf-8')))
      8     temp2 = pos_tag(temp) #tag words
      9     for i in temp2: #discard prepositions, articles, etc.

AttributeError: 'float' object has no attribute 'decode'

In [16]:
stuff_tokenizer(df['3'],q3)

In [15]:
stuff_tokenizer(df['4'],q4)

Checking if stuffing worked...


In [12]:
print q1[:3]


[(u'Everything', 'NN'), (u'worked', 'VBD'), (u'out', 'RP')]

In [17]:
dataset = [q1,q2,q3,q4]

In [18]:
for i in dataset:
    common = Counter(i)
    print common.most_common(5)
# Need to remove prepositions
# How can we control for one person repeating the same word?
# select distinct words: my_list = list(set(my_list))
# compare word counts


[((u'people', 'NNS'), 9), ((u'workshops', 'NNS'), 9), ((u'liked', 'VBD'), 8), ((u'was', 'VBD'), 7), ((u'were', 'VBD'), 7)]
[((u'was', 'VBD'), 9), ((u'-', ':'), 8), ((u'Media', 'NNP'), 7), ((u'very', 'RB'), 6), ((u':', ':'), 6)]
[((u'More', 'JJR'), 11), ((u'workshops', 'NNS'), 7), ((u'time', 'NN'), 6), ((u'breakout', 'NN'), 5), ((u'more', 'RBR'), 4)]
[((u'More', 'JJR'), 10), ((u'More', 'RBR'), 8), ((u'Asian', 'JJ'), 7), ((u'workshops', 'NNS'), 6), ((u'people', 'NNS'), 4)]

In [19]:
Counter(q4).most_common()


Out[19]:
[((u'More', 'JJR'), 10),
 ((u'More', 'RBR'), 8),
 ((u'Asian', 'JJ'), 7),
 ((u'workshops', 'NNS'), 6),
 ((u'people', 'NNS'), 4),
 ((u'panels', 'NNS'), 3),
 ((u'interactive', 'JJ'), 3),
 ((u'more', 'JJR'), 3),
 ((u'would', 'MD'), 3),
 ((u'see', 'VB'), 3),
 ((u'topics', 'NNS'), 3),
 ((u'AA', 'NNP'), 2),
 ((u'interaction', 'NN'), 2),
 ((u'performers', 'NNS'), 2),
 ((u'More', 'NNP'), 2),
 ((u'group', 'NN'), 2),
 ((u'Different', 'NNP'), 2),
 ((u'more', 'RBR'), 2),
 ((u'like', 'VB'), 2),
 ((u'culture', 'NN'), 2),
 ((u'sure', 'JJ'), 2),
 ((u'different', 'JJ'), 2),
 ((u'nan', 'RB'), 2),
 ((u'NYU', 'NNP'), 2),
 ((u'nan', 'JJ'), 2),
 ((u'time', 'NN'), 2),
 ((u'auditorium', 'NN'), 1),
 ((u'Form', 'NNP'), 1),
 ((u'acitivites', 'NNS'), 1),
 ((u'Jin', 'NNP'), 1),
 ((u'issues', 'NNS'), 1),
 ((u'media', 'NNS'), 1),
 ((u'dance', 'NN'), 1),
 ((u'track', 'NN'), 1),
 ((u'3rd', 'CD'), 1),
 ((u'form', 'VBP'), 1),
 ((u'Bubble', 'NNP'), 1),
 ((u'Water', 'NNP'), 1),
 ((u'broad', 'JJ'), 1),
 ((u'Hirano', 'NNP'), 1),
 ((u'effect', 'NN'), 1),
 ((u'come', 'VB'), 1),
 ((u'sucessful', 'JJ'), 1),
 ((u'``', '``'), 1),
 ((u'Women', 'NNP'), 1),
 ((u'same', 'JJ'), 1),
 ((u'economy', 'NN'), 1),
 ((u'Invite', 'NNP'), 1),
 ((u'fun', 'NN'), 1),
 ((u'who', 'WP'), 1),
 ((u'diversity', 'JJ'), 1),
 ((u'come', 'VBP'), 1),
 ((u'that', 'WDT'), 1),
 ((u'have', 'VB'), 1),
 ((u'great', 'JJ'), 1),
 ((u'enjoyd', 'VBP'), 1),
 ((u'snacktime', 'JJ'), 1),
 ((u'entertainment', 'NN'), 1),
 ((u'sit', 'VB'), 1),
 ((u'Same', 'NNP'), 1),
 ((u'Indian', 'JJ'), 1),
 ((u'particpants', 'NNS'), 1),
 ((u'schoolers', 'NNS'), 1),
 ((u'Tea', 'NNP'), 1),
 ((u'love', 'VB'), 1),
 ((u'interesting', 'JJ'), 1),
 ((u'high', 'JJ'), 1),
 ((u'Dancing', 'NNP'), 1),
 ((u'other', 'JJ'), 1),
 ((u'Show', 'VB'), 1),
 ((u'workshop', 'NN'), 1),
 ((u'Similar', 'JJ'), 1),
 ((u'schedule', 'VBZ'), 1),
 ((u'eat', 'VB'), 1),
 ((u'could', 'MD'), 1),
 ((u'catered', 'VBD'), 1),
 ((u'year', 'NN'), 1),
 ((u'came', 'VBD'), 1),
 ((u'workshps', 'VBD'), 1),
 ((u'dialogue', 'NN'), 1),
 ((u'application', 'NN'), 1),
 ((u'changing', 'VBG'), 1),
 ((u'think', 'VBP'), 1),
 ((u'crews', 'NNS'), 1),
 ((u'Call', 'NNP'), 1),
 ((u'change', 'VB'), 1),
 ((u'music', 'NN'), 1),
 ((u'tables', 'NNS'), 1),
 ((u'better', 'JJR'), 1),
 ((u'abuility', 'NN'), 1),
 ((u'becoming', 'VBG'), 1),
 ((u'anything', 'NN'), 1),
 ((u'audience', 'NN'), 1),
 ((u'should', 'MD'), 1),
 ((u'members', 'NNS'), 1),
 ((u'quality', 'NN'), 1),
 ((u'trafficing', 'VBG'), 1),
 ((u'has', 'VBZ'), 1),
 ((u'endure', 'VB'), 1),
 ((u'clubs', 'NNS'), 1),
 ((u'make', 'VB'), 1),
 ((u'campus', 'NN'), 1),
 ((u'cultural', 'JJ'), 1),
 ((u'identities', 'NNS'), 1),
 ((u'Society', 'NNP'), 1),
 ((u'struggles', 'NNS'), 1),
 ((u'experiences', 'NNS'), 1),
 ((u'activites', 'NNS'), 1),
 ((u'effort', 'NN'), 1),
 ((u'Empowerment', 'NN'), 1),
 ((u'large', 'JJ'), 1),
 ((u'roles', 'NNS'), 1),
 ((u')', ')'), 1),
 ((u'women', 'NNS'), 1),
 ((u'Bottled', 'NNP'), 1),
 ((u"''", "''"), 1),
 ((u'Americans', 'NNPS'), 1),
 ((u'Sex', 'NNP'), 1),
 ((u'(', '('), 1),
 ((u'specific', 'JJ'), 1),
 ((u'school', 'NN'), 1),
 ((u'groups', 'NNS'), 1),
 ((u'Feminity', 'NNP'), 1),
 ((u'Trafficking', 'NNP'), 1),
 ((u'enough', 'JJ'), 1),
 ((u'speakers', 'NNS'), 1),
 ((u'country', 'NN'), 1),
 ((u'Ex', 'NNP'), 1),
 ((u'things', 'NNS'), 1),
 ((u'Aya', 'NNP'), 1),
 ((u'general', 'JJ'), 1),
 ((u'bonding', 'JJ'), 1),
 ((u'art', 'NN'), 1),
 ((u'workshop', 'VBD'), 1),
 ((u'industry', 'NN'), 1),
 ((u'profession', 'NN'), 1),
 ((u'one', 'NN'), 1),
 ((u'Not', 'RB'), 1),
 ((u'Better', 'VBP'), 1),
 ((u'Cant', 'JJ'), 1),
 ((u'controversial', 'JJ'), 1),
 ((u'Maybe', 'RB'), 1),
 ((u'performances', 'NNS'), 1),
 ((u'Build', 'NNP'), 1),
 ((u'panelists', 'NNS'), 1),
 ((u'workshops', 'VBZ'), 1),
 ((u'hip/hop', 'NN'), 1)]

In [20]:
cmn = Counter(q4)

In [21]:
cmn = {k: v for k, v in cmn.iteritems() if  v > 10}

In [22]:
cmn


Out[22]:
{}