In [1]:
import sys
import string
import time
import timeit
import re
import nltk
import os
from nltk.stem import *
from nltk.corpus import stopwords
from nltk.stem.porter import *
from collections import Counter
# Stemming
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
stemmer = PorterStemmer()
# Stopwords
dict_stop = stopwords.words('english')
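If the stopword list or the Punkt tokenizer are not installed locally, they can be fetched once with `nltk.download`. The cell below is only a quick sanity check of the stemmer and stopword filter on a made-up sentence; the sample text and the expected output are illustrative assumptions, not part of the pipeline.

In [ ]:
# One-time downloads, in case the corpora are not present locally:
# nltk.download('stopwords')
# nltk.download('punkt')

# Sanity check on a made-up sentence (illustration only)
sample = "Running faster than the other runners"
tokens = [w for w in nltk.word_tokenize(sample.lower()) if w not in dict_stop]
print(stem_tokens(tokens, stemmer))   # roughly ['run', 'faster', 'runner']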
In [ ]:
def processing(line):
    line = line.lower().strip()
    line = re.sub(r'<(.*?)>', ' ', line)      # strip HTML tags
    line = re.sub(r'[\w\d]{20,}', '', line)   # drop very long tokens (likely junk)
    line = re.sub(r'[\d]', '', line)          # drop digits
    line = re.sub(r'[.]', '', line)           # drop periods
    line = re.sub(r'- the new york times', '', line)
    # Remove punctuation; relies on the global `punc` built before the main loop
    line = line.translate(dict.fromkeys(map(ord, punc)))
    line = line.replace('"', '').replace('“', '').replace('”', '').replace('’', '') \
               .replace('•', '').replace('‘', '').replace('■', '').replace('—', '')
    words = [word for word in nltk.word_tokenize(line) if word not in dict_stop]
    line = ' '.join(map(stemmer.stem, words))
    if len(line) != 0:
        return line.strip()
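A quick way to see what `processing` does is to run it on a single made-up line. Because the function looks up the global `punc`, which is only built in the next cell, this illustrative check defines a throwaway `punc` first; the sample sentence and the expected output are assumptions for demonstration only.

In [ ]:
# Illustration only: define a minimal `punc` so processing() can run standalone
punc = ''.join(ch for ch in string.punctuation if ch != '*')
sample = "The <b>Markets</b> rallied 3% on Tuesday. - The New York Times"
print(processing(sample))   # something like 'market ralli tuesday'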
In [ ]:
# Insert directory:
dirname = '/Users/mueric35/Box Sync/nytimes_articles'

# Punctuation to strip; drop the '*+' token so the article-separator
# lines ('************') survive the cleaning step
punc = nltk.word_tokenize(string.punctuation)
punc.remove('*+')
punc = ''.join(punc)

for filename in os.listdir(dirname):
    print("Loading: %s" % filename, file=sys.stderr)

    ### Read data as lines
    with open(os.path.join(dirname, filename)) as f:
        lines = f.readlines()

    ### Words processing pipeline
    print('Data Tidying', file=sys.stderr)
    start = timeit.default_timer()
    L = list(map(processing, lines))
    L = [l for l in L if l is not None]
    elapsed = timeit.default_timer() - start
    print('Time elapsed ' + str(round(elapsed/60, 2)) + ' minutes', file=sys.stderr)

    print('Export Text list' + '\n', file=sys.stderr)
    # Split the cleaned lines into articles at the '************' separators
    text = []
    index = [idx for idx, value in enumerate(L) if value == '************']
    j = 0
    for i in index:
        text.append(' '.join(L[j:i-1]))
        j = i + 1

    # Collect the distinct characters appearing in each article
    collect_letters = []
    for i in text:
        collect_letters.append(list(set(i)))

    # Any character outside a-z and space is treated as a symbol to delete
    normal_letters = list(map(ord, 'abcdefghijklmnopqrstuvwxyz '))
    symbol_ord = []
    for i in range(len(text)):
        symbol_ord += [x for x in map(ord, collect_letters[i]) if x not in normal_letters]
    symbols = dict.fromkeys(set(symbol_ord))

    Text = []
    for i in text:
        Text.append(i.translate(symbols))

    # Write one tidied article per line
    with open('tidy_' + filename, 'w') as f:
        for i in Text:
            f.write(i + '\n')

    print('Completed' + '\n', file=sys.stderr)
In [ ]:
start = timeit.default_timer()
# Bigrams per article, then flattened into one list over all articles
text_bigram = list(map(lambda x: list(nltk.bigrams(nltk.word_tokenize(x))), text))
bigrm = [bi for Bi in text_bigram for bi in Bi]
elapsed = timeit.default_timer() - start
print(elapsed)
In [ ]:
# Frequency table of all bigrams
bigrm_table = Counter(bigrm)
In [ ]:
bigrm_table.most_common()
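Called with no argument, `Counter.most_common()` returns every bigram sorted by frequency, which can be very long; passing a count keeps the output manageable, for example:

In [ ]:
# The 20 most frequent bigrams, printed one per line
for (w1, w2), count in bigrm_table.most_common(20):
    print(w1, w2, count)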