In [1]:
#!/usr/bin/env python
# coding: utf-8
# company_10K_tokenization.py
# IEOR242 Applications in Data Analytics Group 06
# Feb 14 2016

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk import stem
import re
from collections import Counter

In [58]:
# Earlier draft of word_tokenization, kept for reference (superseded below)
#def word_tokenization( text_file ):
#    raw_text = open(text_file).read().decode('utf8')
    # Stop-word removal via regex
#    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
#    non_stop_text = pattern.sub('', raw_text)
    # Tokenization
#    word_token = nltk.word_tokenize(non_stop_text)
    # Count the word frequency
#    word_count = Counter(word_token)
#    return word_count

In [12]:
def word_tokenization(text_file):
    # Read the filing and drop any characters that cannot be ASCII-encoded
    with open(text_file) as f:
        raw_text = f.read().decode('utf8')
    raw_text = raw_text.encode('ascii', 'ignore')
    # Keep letters only: replace digits and punctuation with spaces
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)
    # Convert to lower case
    letters_only = letters_only.lower()
    # Tokenization
    word_token = nltk.word_tokenize(letters_only)
    # Remove stop words (build the set once rather than once per token)
    stop_words = set(stopwords.words("english"))
    word_token = [w for w in word_token if w not in stop_words]
    # Stemming; fall back to the raw token if the stemmer fails on it
    word_token_final = []
    for word in word_token:
        try:
            word_token_final.append(stemmer.stem(word))
        except Exception:
            word_token_final.append(word)
    # Word frequency
    word_count_final = Counter(word_token_final)
    return word_count_final
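
In [ ]:
# Illustrative sketch of the cleaning steps above on a made-up sentence
# (the sample text is hypothetical, not from an actual filing): digits
# and punctuation become spaces, and the text is lower-cased.
sample = "Revenues increased 12% in 2012, driven by operations."
print(re.sub("[^a-zA-Z]", " ", sample).lower())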

In [13]:
# Porter stemmer used by word_tokenization above
stemmer = stem.PorterStemmer()
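
In [ ]:
# Quick check of the stemmer on typical 10-K vocabulary (illustrative
# words chosen by hand, not drawn from the filings).
for w in ['operations', 'revenues', 'increased']:
    print(w + ' -> ' + stemmer.stem(w))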

In [14]:
# Company tickers and filing years in use
company_years = ['CP2012', 'CP2013', 'CP2014', 'PG2012', 'PG2013']

In [15]:
# Paths to the MD&A sections extracted from the scraped company 10-K filings
MDNA_files = ["../data/company10k/" + cy + "_MDNA.txt" for cy in company_years]

In [16]:
# Word counts for all the MD&A files
word_counts = [word_tokenization(f) for f in MDNA_files]
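
In [ ]:
# Sanity check (sketch): the ten most frequent stems in the first filing.
# The actual output depends on the scraped MD&A text.
word_counts[0].most_common(10)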