In [18]:
#!/bin/env python
# Feb16_Tokenizing.py
# IEOR242 Applications in Data Analytics 
# Feb 14 2016

In [11]:
import io

import nltk
import pandas as pd
from nltk.corpus import stopwords

In [9]:
# Load the Loughran-McDonald master dictionary (2014 release) into a DataFrame.
# pd.read_csv already returns a DataFrame, so it is not re-wrapped in
# pd.DataFrame(...).
# NOTE(review): with pandas' default NA handling, word entries spelled like
# "NA" or "NULL" would be parsed as missing values -- confirm whether the
# word column needs keep_default_na=False.
Library = pd.read_csv("../data/LoughranMcDonald_MasterDictionary_2014.csv")

In [6]:
# Paths to our scraped 10-K MD&A text files.
# Every path is built from one base directory so the prefix cannot drift.
# The base is the directory of the CP2012 file, which is the one path this
# notebook actually opens; the other entries previously carried duplicated
# "company10k/" / "../data/company10k/" segments.
# NOTE(review): confirm that all five files really live in this directory.
MDNA_DIR = "../data/company10k/"

CP2012_MDNA = MDNA_DIR + "CP2012_MDNA.txt"
CP2013_MDNA = MDNA_DIR + "CP2013_MDNA.txt"
CP2014_MDNA = MDNA_DIR + "CP2014_MDNA.txt"
PG2012_MDNA = MDNA_DIR + "PG2012_MDNA.txt"
PG2013_MDNA = MDNA_DIR + "PG2013_MDNA.txt"

In [13]:
# Read the whole CP2012 MD&A file as unicode text.
# io.open decodes UTF-8 while reading (on both Python 2 and 3), and the
# `with` block closes the file handle instead of leaking it.
with io.open(CP2012_MDNA, encoding="utf8") as mdna_file:
    MDNA = mdna_file.read()

In [15]:
# Split the MD&A text into word and punctuation tokens.
# NOTE: word_tokenize relies on NLTK's "punkt" tokenizer data being installed.
WORDTOKEN = nltk.tokenize.word_tokenize(MDNA)

In [17]:
# Preview the first 50 tokens as a sanity check on the tokenization.
WORDTOKEN[:50]


Out[17]:
[u'ITEM',
 u'7',
 u'.',
 u'MANAGEMENT\u2019S',
 u'DISCUSSION',
 u'AND',
 u'ANALYSIS',
 u'OF',
 u'FINANCIAL',
 u'CONDITION',
 u'AND',
 u'RESULTS',
 u'OF',
 u'OPERATIONS',
 u'Executive',
 u'Overview',
 u'and',
 u'Outlook',
 u'Colgate-Palmolive',
 u'Company',
 u'seeks',
 u'to',
 u'deliver',
 u'strong',
 u',',
 u'consistent',
 u'business',
 u'results',
 u'and',
 u'superior',
 u'shareholder',
 u'returns',
 u'by',
 u'providing',
 u'consumers',
 u'globally',
 u'with',
 u'products',
 u'that',
 u'make',
 u'their',
 u'lives',
 u'healthier',
 u'and',
 u'more',
 u'enjoyable',
 u'.',
 u'To',
 u'this',
 u'end']

In [ ]: