In [1]:
import sys
import string
import time
import timeit
import re
import nltk
import os
from nltk.stem import *
from nltk.corpus import stopwords
from nltk.stem.porter import *
from collections import Counter
# Stemming
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
stemmer = PorterStemmer()
# Stopwords
dict_stop = stopwords.words('english')
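In [ ]:
# Hedged sanity check (not part of the original pipeline): show what the
# stemmer and stopword list do to a made-up sentence.
sample = "The reporters were covering several breaking stories"
tokens = [w for w in nltk.word_tokenize(sample.lower()) if w not in dict_stop]
print(stem_tokens(tokens, stemmer))
# e.g. ['report', 'cover', 'sever', 'break', 'stori']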
In [ ]:
def processing(line):
    # Normalize case and whitespace.
    line = line.lower().strip()
    # Drop HTML-style tags, overly long tokens, digits, and periods.
    line = re.sub(r'<(.*?)>', ' ', line)
    line = re.sub(r'[\w\d]{20,}', '', line)
    line = re.sub(r'\d', '', line)
    line = re.sub(r'\.', '', line)
    line = re.sub(r'- the new york times', '', line)
    # Strip punctuation (the global punc is built in the loop below) and leftover typographic symbols.
    line = line.translate(dict.fromkeys(map(ord, punc)))
    line = line.replace('"','').replace('“','').replace('”','').replace('’','').replace('•','').replace('‘','').replace('■','').replace('—','')
    # Tokenize, remove stopwords, and stem.
    words = [word for word in nltk.word_tokenize(line) if word not in dict_stop]
    line = ' '.join(map(stemmer.stem, words))
    if len(line) != 0:
        return line.strip()
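In [ ]:
# Hedged example (not from the original data): run processing() on a made-up
# line.  processing() reads a global `punc`, which the next cell builds inside
# the file loop, so a simplified version is defined here first.
punc = ''.join(c for c in string.punctuation if c not in '*+')
print(processing('<p>The Mayor’s 3 new “plans” reviewed - the new york times</p>'))
# roughly: 'mayor new plan review'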
In [ ]:
# Insert directory:
dirname = '/Users/mueric35/Box Sync/nytimes_articles'
for filename in os.listdir(dirname):
    print("Loading: %s" % filename, file=sys.stderr)
    ### Read data as lines
    lines = open(dirname + '/' + filename).readlines()
    ### Word-processing pipeline
    # Build the punctuation string to strip; '*+' is removed from it
    # (presumably so the '*' article-separator lines survive processing).
    punc = nltk.word_tokenize(string.punctuation)
    punc.remove('*+')
    punc = ''.join(punc)
    print('Data Tidying', file=sys.stderr)
    start = timeit.default_timer()
    L = list(map(processing, lines))
    L = [l for l in L if l is not None]
    elapsed = timeit.default_timer() - start
    print('Time elapsed ' + str(round(elapsed/60, 2)) + ' minutes', file=sys.stderr)
    print('Export Text list' + '\n', file=sys.stderr)
    # Split the processed lines into articles at the '************' separators.
    text = []
    index = [index for index, value in enumerate(L) if value == '************']
    j = 0
    for i in index:
        text.append(' '.join(L[j:i-1]))
        j = i + 1
    # Collect the distinct characters appearing in each article.
    collect_letters = []
    for i in text:
        collect_letters.append(list(set(i)))
    # Any character outside a-z and space is a stray symbol to delete.
    normal_letters = list(map(ord, 'abcdefghijklmnopqrstuvwxyz '))
    symbol_ord = []
    for i in range(len(text)):
        symbol_ord += [x for x in map(ord, collect_letters[i]) if x not in normal_letters]
    symbols = {}
    for i in set(symbol_ord):
        symbols.update({i: None})
    Text = []
    for i in text:
        Text.append(i.translate(symbols))
    # Write one tidied article per line.
    f = open('tidy_' + filename, 'w')
    for i in Text:
        f.write(i + '\n')
    f.close()
    print('Completed' + '\n', file=sys.stderr)
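In [ ]:
# Hedged toy illustration (made-up data): how the loop above splits processed
# lines into articles at the '************' separator.
toy = ['article one line a', 'article one line b', 'byline one', '************',
       'article two line a', 'article two line b', 'byline two', '************']
idx = [i for i, v in enumerate(toy) if v == '************']
articles, j = [], 0
for i in idx:
    articles.append(' '.join(toy[j:i-1]))   # the slice drops the line just before each separator, as above
    j = i + 1
print(articles)
# ['article one line a article one line b', 'article two line a article two line b']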
In [6]:
# Read one month of tidied articles (one article per line).
text = open('/Users/zhanghengqian/Dropbox/Duke/fall-2017/IDS/Tidy_data/tidy_article_201607.txt').readlines()
In [ ]:
# Build bigrams within each article, then flatten into a single list.
start = timeit.default_timer()
text_bigram = list(map(lambda x: list(nltk.bigrams(nltk.word_tokenize(x))), text))
bigrm = [bi for Bi in text_bigram for bi in Bi]
elapsed = timeit.default_timer() - start
print(elapsed)
In [ ]:
# Count bigram frequencies.
bigrm_table = Counter(bigrm)
In [ ]:
# Rank bigrams by frequency.
bigrm_table.most_common()
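In [ ]:
# Hedged toy example: how Counter turns a bigram list into a frequency table.
toy_bigrams = list(nltk.bigrams('new york city new york state'.split()))
print(Counter(toy_bigrams).most_common(3))
# [(('new', 'york'), 2), (('york', 'city'), 1), (('city', 'new'), 1)] -- tie order may vary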
In [2]:
from nltk.collocations import *
In [3]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
In [4]:
path = '/Users/zhanghengqian/Dropbox/Duke/fall-2017/IDS/Tidy_data'
os.listdir(path)
Out[4]:
In [5]:
# Use '+' to connect the articles so later bigrams never span two articles.
text = ''
for loc in os.listdir(path):
    lines = open(path + '/' + loc).readlines()
    new_lines = '+ '.join(lines)
    text = text + '+ ' + new_lines
In [6]:
#tokenize the text
word = nltk.word_tokenize(text)
In [167]:
#construct bigram table
finder = BigramCollocationFinder.from_words(word)
In [168]:
# Filter out bigrams containing '+', since we do not want bigrams that span two different articles.
finder.apply_word_filter(lambda w: w == '+')
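In [ ]:
# Hedged toy example: apply_word_filter drops every bigram containing '+',
# so no candidate bigram crosses the '+' boundary between two articles.
toy_finder = BigramCollocationFinder.from_words(['end', 'of', 'one', '+', 'start', 'of', 'two'])
toy_finder.apply_word_filter(lambda w: w == '+')
print(sorted(toy_finder.ngram_fd.items()))
# ('one', '+') and ('+', 'start') are gone; the remaining bigrams stay within one article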
In [117]:
# Optional: filter out low-frequency bigrams.
#finder.apply_freq_filter(50)
In [176]:
# Score each bigram by pointwise mutual information (PMI).
scored = finder.score_ngrams(bigram_measures.pmi)
In [170]:
# Sort the raw frequency table (descending count, then alphabetically).
sorted_freq = sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))
In [177]:
# Turn the score list into a dictionary.
scored_dict = dict(scored)
In [180]:
scored[1:10]
Out[180]:
In [175]:
sorted_freq[0:10]
Out[175]:
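In [ ]:
# Hedged note (toy data, not from the corpus): bigram_measures.pmi scores a pair as
#   PMI(w1, w2) = log2( p(w1, w2) / (p(w1) * p(w2)) ),
# so rare words that always co-occur get high scores, which is why the optional
# frequency filter above can help before ranking by PMI.
toy_finder = BigramCollocationFinder.from_words('a b a b a c'.split())
print(toy_finder.score_ngrams(bigram_measures.pmi))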
In [7]:
# Repeat the workflow with trigrams.
trigram_measures = nltk.collocations.TrigramAssocMeasures()
In [8]:
finder_tri = TrigramCollocationFinder.from_words(word)
In [9]:
# Drop trigrams containing the '+' article separator.
finder_tri.apply_word_filter(lambda w: w == '+')
In [11]:
# Score each trigram by PMI.
scored_tri = finder_tri.score_ngrams(trigram_measures.pmi)
In [ ]:
# Sort the raw trigram frequency table.
sorted_freq_tri = sorted(finder_tri.ngram_fd.items(), key=lambda t: (-t[1], t[0]))
In [ ]:
# Turn the trigram score list into a dictionary.
scored_dict_tri = dict(scored_tri)
In [ ]:
scored_tri[1:10]
In [ ]:
sorted_freq_tri[0:10]