Computing Word Frequency for Brand Perception Interviews among Key NYCAASC Stakeholders
In [1]:
import math
import re
import string
from collections import Counter

import nltk
from nltk import word_tokenize, pos_tag
import pandas as pd
In [3]:
# Only need to run once per machine: fetch the NLTK data used below --
# 'punkt' for word_tokenize and 'averaged_perceptron_tagger' for pos_tag.
# Targeted downloads avoid the interactive downloader window that a bare
# nltk.download() call opens.  (Resource names may differ on very old/new
# NLTK releases -- TODO confirm against the installed version.)
import nltk  # local import keeps this cell self-contained
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
In [3]:
# NOTE(review): user-specific absolute path -- move the file into the repo
# or a configurable data directory so the notebook runs on other machines.
ACT_REVIEWS_PATH = "/Users/chuamelia/Downloads/act_reviews.tsv"
df = pd.read_csv(ACT_REVIEWS_PATH, sep="\t")
Looking at Structure of Data
In [4]:
# Preview the first row to inspect the column layout of the survey data.
df.head(1)
Out[4]:
In [14]:
#Write function to append all tokens to one list.
def stuff_tokenizer(column, list):
discard = ['IN', ',','.','TO', 'DT', 'PRP', 'CC', 'are', 'is', 'um', u'it\u2019s', 'PRP$']
end_num = len(column)
temp = []
for i in range(end_num): #append to one list
temp.extend(word_tokenize(str(column[i]).decode('utf-8')))
temp2 = pos_tag(temp) #tag words
for i in temp2: #discard prepositions, articles, etc.
if i[1] not in discard and i[0] not in discard:
list.append(i)
In [6]:
#Added .decode('utf-8') because bytes like "\xe2\x80\x99" (a curly apostrophe)
#raise "'ascii' codec can't decode byte" when interpreted as ASCII.
#example: df['first_Time'][0]
Create empty lists for stuffing.
In [10]:
# One empty bucket per interview question; stuff_tokenizer fills them in place.
q1, q2, q3, q4 = [], [], [], []
Stuff Away!
In [11]:
# Tokenize and tag the responses for questions 1 and 2.
for question_col, bucket in (('1', q1), ('2', q2)):
    stuff_tokenizer(df[question_col], bucket)
In [16]:
# Tokenize and tag the responses for question 3.
stuff_tokenizer(df['3'],q3)
In [15]:
# Tokenize and tag the responses for question 4.
stuff_tokenizer(df['4'],q4)
Checking if stuffing worked...
In [12]:
# Sanity check: peek at the first few (token, tag) pairs.  Function-call
# form prints identically on Python 2 and also works on Python 3.
print(q1[:3])
In [17]:
# Collect the per-question token lists so they can be processed together.
dataset = [q1, q2, q3, q4]
In [18]:
# For each question, show the five most frequent (token, tag) pairs.
# print(...) with a single argument is identical output on Python 2 and 3.
for question_tokens in dataset:
    common = Counter(question_tokens)
    print(common.most_common(5))
# TODO: remove prepositions that slipped past the tag filter.
# TODO: control for one person repeating the same word -- e.g. dedupe per
#       respondent with list(set(my_list)) before counting, then compare counts.
In [19]:
# Cap the dump at the top 25 pairs -- the unbounded most_common() call
# printed the entire frequency table and bloated the notebook output.
Counter(q4).most_common(25)
Out[19]:
In [20]:
# Frequency table of (token, tag) pairs for question 4.
cmn = Counter(q4)
In [21]:
# Keep only tokens mentioned more than 10 times.  .items() (rather than the
# Python-2-only .iteritems()) behaves the same on both Python 2 and 3.
# NOTE(review): this cell reassigns cmn over itself, so it only makes sense
# run once after the Counter cell above (re-running is a harmless no-op).
cmn = {k: v for k, v in cmn.items() if v > 10}
In [22]:
# Display the filtered high-frequency tokens for question 4.
cmn
Out[22]: