In [1]:
import sys
import string
import time
import timeit
import re
import nltk
import os
from nltk.stem import *
from nltk.corpus import stopwords
from nltk.stem.porter import *
from collections import Counter
# Stemming
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
stemmer = PorterStemmer()
# Stopwords
dict_stop = stopwords.words('english')
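If the stopword list or the Punkt tokenizer are not installed locally, they can be fetched once with `nltk.download`. The cell below is only a quick sanity check of the stemmer and stopword filter on a made-up sentence; the sample text and the expected output are illustrative assumptions, not part of the pipeline.

In [ ]:
# One-time downloads, in case the corpora are not present locally:
# nltk.download('stopwords')
# nltk.download('punkt')

# Sanity check on a made-up sentence (illustration only)
sample = "Running faster than the other runners"
tokens = [w for w in nltk.word_tokenize(sample.lower()) if w not in dict_stop]
print(stem_tokens(tokens, stemmer))   # roughly ['run', 'faster', 'runner']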
In [ ]:
def processing(line):
    line = line.lower().strip()
    line = re.sub(r'<(.*?)>', ' ', line)      # strip HTML tags
    line = re.sub(r'[\w\d]{20,}', '', line)   # drop very long tokens (likely junk)
    line = re.sub(r'[\d]', '', line)          # drop digits
    line = re.sub(r'[.]', '', line)           # drop periods
    line = re.sub(r'- the new york times', '', line)
    # Remove punctuation; relies on the global `punc` built before the main loop
    line = line.translate(dict.fromkeys(map(ord, punc)))
    line = line.replace('"', '').replace('“', '').replace('”', '').replace('’', '') \
               .replace('•', '').replace('‘', '').replace('■', '').replace('—', '')
    words = [word for word in nltk.word_tokenize(line) if word not in dict_stop]
    line = ' '.join(map(stemmer.stem, words))
    if len(line) != 0:
        return line.strip()
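A quick way to see what `processing` does is to run it on a single made-up line. Because the function looks up the global `punc`, which is only built in the next cell, this illustrative check defines a throwaway `punc` first; the sample sentence and the expected output are assumptions for demonstration only.

In [ ]:
# Illustration only: define a minimal `punc` so processing() can run standalone
punc = ''.join(ch for ch in string.punctuation if ch != '*')
sample = "The <b>Markets</b> rallied 3% on Tuesday. - The New York Times"
print(processing(sample))   # something like 'market ralli tuesday'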
In [ ]:
# Insert directory:
dirname = '/Users/mueric35/Box Sync/nytimes_articles'

# Punctuation to strip; drop the '*+' token so the article-separator
# lines ('************') survive the cleaning step
punc = nltk.word_tokenize(string.punctuation)
punc.remove('*+')
punc = ''.join(punc)

for filename in os.listdir(dirname):
    print("Loading: %s" % filename, file=sys.stderr)

    ### Read data as lines
    with open(os.path.join(dirname, filename)) as f:
        lines = f.readlines()

    ### Words processing pipeline
    print('Data Tidying', file=sys.stderr)
    start = timeit.default_timer()
    L = list(map(processing, lines))
    L = [l for l in L if l is not None]
    elapsed = timeit.default_timer() - start
    print('Time elapsed ' + str(round(elapsed/60, 2)) + ' minutes', file=sys.stderr)

    print('Export Text list' + '\n', file=sys.stderr)
    # Split the cleaned lines into articles at the '************' separators
    text = []
    index = [idx for idx, value in enumerate(L) if value == '************']
    j = 0
    for i in index:
        text.append(' '.join(L[j:i-1]))
        j = i + 1

    # Collect the distinct characters appearing in each article
    collect_letters = []
    for i in text:
        collect_letters.append(list(set(i)))

    # Any character outside a-z and space is treated as a symbol to delete
    normal_letters = list(map(ord, 'abcdefghijklmnopqrstuvwxyz '))
    symbol_ord = []
    for i in range(len(text)):
        symbol_ord += [x for x in map(ord, collect_letters[i]) if x not in normal_letters]
    symbols = dict.fromkeys(set(symbol_ord))

    Text = []
    for i in text:
        Text.append(i.translate(symbols))

    # Write one tidied article per line
    with open('tidy_' + filename, 'w') as f:
        for i in Text:
            f.write(i + '\n')

    print('Completed' + '\n', file=sys.stderr)
In [ ]:
start = timeit.default_timer()
# Bigrams per article, then flattened into one list over all articles
text_bigram = list(map(lambda x: list(nltk.bigrams(nltk.word_tokenize(x))), text))
bigrm = [bi for Bi in text_bigram for bi in Bi]
elapsed = timeit.default_timer() - start
print(elapsed)
In [ ]:
# Frequency table of all bigrams
bigrm_table = Counter(bigrm)
In [ ]:
bigrm_table.most_common()
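Called with no argument, `Counter.most_common()` returns every bigram sorted by frequency, which can be very long; passing a count keeps the output manageable, for example:

In [ ]:
# The 20 most frequent bigrams, printed one per line
for (w1, w2), count in bigrm_table.most_common(20):
    print(w1, w2, count)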