Computing Word Frequency for NYCAASC Brand Perception Interviews with Key Stakeholders
In [64]:
import re
import nltk
import math
import string
from collections import Counter
import pandas as pd
In [71]:
from __future__ import unicode_literals
In [68]:
# only need to run once; this opens NLTK's interactive data downloader
nltk.download()
Out[68]:
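If you'd rather skip the interactive downloader, the two resources this notebook actually relies on can be fetched directly. A minimal sketch, assuming a recent NLTK where word_tokenize uses 'punkt' and pos_tag uses 'averaged_perceptron_tagger':
In [ ]:
# Non-interactive alternative: download only the resources used below.
nltk.download('punkt')                       # tokenizer models for nltk.word_tokenize
nltk.download('averaged_perceptron_tagger')  # POS tagger model for nltk.pos_tag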
In [72]:
df = pd.read_csv("/Users/chuamelia/Downloads/Brand_IDI_Qual.tsv",sep="\t")
Looking at the Structure of the Data
In [73]:
df.head(1)
Out[73]:
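Beyond the first row, a quick check of the shape and column names confirms which response columns will be tokenized later; a small sketch:
In [ ]:
# Structural check: number of interviews and the question columns available.
print(df.shape)
print(df.columns.tolist())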
In [168]:
# Tokenize every response in a column, POS-tag the tokens, and append
# the (word, tag) pairs to out_list, skipping unwanted parts of speech.
def stuff_tokenizer(column, out_list):
    # POS tags to drop: prepositions, punctuation, 'to', articles, pronouns, conjunctions
    discard = ['IN', ',', '.', 'TO', 'DT', 'PRP', 'CC']
    temp = []
    for i in range(len(column)):  # append all tokens from the column to one list
        temp.extend(nltk.word_tokenize(column[i].decode('utf-8')))
    temp2 = nltk.pos_tag(temp)  # tag each token with its part of speech
    for pair in temp2:  # discard prepositions, articles, etc.
        if pair[1] not in discard:
            out_list.append(pair)
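Before running this on the survey columns, the tokenize, POS-tag, and filter pipeline can be sanity-checked on a single made-up response (the sentence below is purely illustrative, not interview data):
In [ ]:
# Illustrative check: a hypothetical one-response column, passed as a byte string
# to mirror what read_csv returns in Python 2.
sample = [b"The conference felt welcoming to first-time attendees."]
check = []
stuff_tokenizer(sample, check)
print(check)  # expect (word, tag) pairs with determiners and prepositions dropped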
In [74]:
# Added .decode('utf-8') because byte sequences like "\xe2\x80\x99" (a curly apostrophe)
# otherwise trigger "'ascii' codec can't decode byte" errors during tokenization.
# Example of an affected value: df['first_Time'][0]
Out[74]:
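For reference, this is the kind of failure the decode step avoids; the byte string here is a made-up stand-in for a response containing the "\xe2\x80\x99" bytes noted above:
In [ ]:
# Hypothetical UTF-8 byte string containing a right single quotation mark (U+2019).
raw = b"NYCAASC\xe2\x80\x99s community"
print(raw.decode('utf-8'))  # decodes cleanly; the default ASCII codec would raise UnicodeDecodeError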
Create empty lists for stuffing.
In [163]:
q1a = []
q1b = []
q2 = []
q3 = []
q4 = []
q5 = []
Stuff Away!
In [170]:
stuff_tokenizer(df['a_position_toNYCAASC'],q1a)
stuff_tokenizer(df['b_position_toNYCAASC'],q1b)
stuff_tokenizer(df['embody_Mission'],q2)
stuff_tokenizer(df['future_Direction'],q3)
stuff_tokenizer(df['first_Time'],q4)
stuff_tokenizer(df['logo'],q5)
Checking if stuffing worked...
In [131]:
print(q1a[:3])
In [152]:
dataset = [q1a,q1b,q2,q3,q4,q5]
In [153]:
for i in dataset:
    common = Counter(i)
    print(common.most_common(5))
# To do: some filler words still need to be removed from the counts.
# How can we control for one person repeating the same word?
# One option: select distinct words per respondent (my_list = list(set(my_list)))
# and then compare the word counts across respondents.
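One way to act on the notes above is to count each (word, tag) pair at most once per respondent, so the totals reflect how many people used a word rather than how often one person repeated it. A sketch of that variant (not the function used above; the column name is one of the existing ones):
In [ ]:
# Sketch: document-frequency style counting, one vote per respondent per (word, tag) pair.
discard = ['IN', ',', '.', 'TO', 'DT', 'PRP', 'CC']

def distinct_per_respondent(column):
    counts = Counter()
    for response in column:
        tokens = nltk.word_tokenize(response.decode('utf-8'))
        tagged = nltk.pos_tag(tokens)
        kept = set(pair for pair in tagged if pair[1] not in discard)  # dedupe within one response
        counts.update(kept)
    return counts

print(distinct_per_respondent(df['embody_Mission']).most_common(5))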
In [ ]: