notebook.community

Edit and run



In [18]:

    
#!/usr/bin/env python
# Feb16_Tokenizing.py
# IEOR242 Applications in Data Analytics 
# Feb 14 2016



In [11]:

    
import io

import nltk
from nltk.corpus import stopwords
import pandas as pd



In [9]:

    
# Load the word dictionary CSV (Loughran-McDonald master dictionary, 2014
# release, going by the file name). pd.read_csv already returns a DataFrame,
# so no extra pd.DataFrame(...) wrapper is needed.
Library = pd.read_csv("../data/LoughranMcDonald_MasterDictionary_2014.csv")



In [6]:

    
# Paths to our scraped 10-K MD&A text files.
# All files are addressed through one shared directory prefix so the paths
# cannot drift apart. The prefix matches the CP2012 path, which is the only
# one this notebook actually opens.
# NOTE(review): confirm the other four files sit in the same directory.
MDNA_DIR = "../data/company10k/"

CP2012_MDNA = MDNA_DIR + "CP2012_MDNA.txt"
CP2013_MDNA = MDNA_DIR + "CP2013_MDNA.txt"
CP2014_MDNA = MDNA_DIR + "CP2014_MDNA.txt"
PG2012_MDNA = MDNA_DIR + "PG2012_MDNA.txt"
PG2013_MDNA = MDNA_DIR + "PG2013_MDNA.txt"



In [13]:

    
# Read the CP2012 MD&A file into a single unicode string.
# io.open decodes UTF-8 while reading and behaves the same on Python 2 and 3
# (str.decode does not exist on Python 3); the with-block guarantees the file
# handle is closed instead of being left to the garbage collector.
with io.open(CP2012_MDNA, encoding='utf8') as mdna_file:
    MDNA = mdna_file.read()



In [15]:

    
# Split the MD&A text into a flat list of word and punctuation tokens.
WORDTOKEN = nltk.word_tokenize(text=MDNA)



In [17]:

    
# Preview the first 50 tokens as a sanity check on the tokenizer output.
WORDTOKEN[:50]









    Out[17]:





[u'ITEM',
 u'7',
 u'.',
 u'MANAGEMENT\u2019S',
 u'DISCUSSION',
 u'AND',
 u'ANALYSIS',
 u'OF',
 u'FINANCIAL',
 u'CONDITION',
 u'AND',
 u'RESULTS',
 u'OF',
 u'OPERATIONS',
 u'Executive',
 u'Overview',
 u'and',
 u'Outlook',
 u'Colgate-Palmolive',
 u'Company',
 u'seeks',
 u'to',
 u'deliver',
 u'strong',
 u',',
 u'consistent',
 u'business',
 u'results',
 u'and',
 u'superior',
 u'shareholder',
 u'returns',
 u'by',
 u'providing',
 u'consumers',
 u'globally',
 u'with',
 u'products',
 u'that',
 u'make',
 u'their',
 u'lives',
 u'healthier',
 u'and',
 u'more',
 u'enjoyable',
 u'.',
 u'To',
 u'this',
 u'end']



In [ ]: