In [1]:
import sys
import string
import time
import timeit
import re
import nltk
import os
from nltk.stem import *
from nltk.corpus import stopwords
from nltk.stem.porter import *
from collections import Counter
# Stemming
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
stemmer = PorterStemmer()
# Stopwords
dict_stop = stopwords.words('english')
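In [ ]:
# Hedged sanity check (not part of the original pipeline): show what the
# stemmer and stopword list do to a made-up sentence.
sample = "The reporters were covering several breaking stories"
tokens = [w for w in nltk.word_tokenize(sample.lower()) if w not in dict_stop]
print(stem_tokens(tokens, stemmer))
# e.g. ['report', 'cover', 'sever', 'break', 'stori']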
In [ ]:
def processing(line):
    # Normalize case and whitespace.
    line = line.lower().strip()
    # Drop HTML-style tags, overly long tokens, digits, and periods.
    line = re.sub(r'<(.*?)>', ' ', line)
    line = re.sub(r'[\w\d]{20,}', '', line)
    line = re.sub(r'\d', '', line)
    line = re.sub(r'\.', '', line)
    line = re.sub(r'- the new york times', '', line)
    # Strip punctuation (the global punc is built in the loop below) and leftover typographic symbols.
    line = line.translate(dict.fromkeys(map(ord, punc)))
    line = line.replace('"','').replace('“','').replace('”','').replace('’','').replace('•','').replace('‘','').replace('■','').replace('—','')
    # Tokenize, remove stopwords, and stem.
    words = [word for word in nltk.word_tokenize(line) if word not in dict_stop]
    line = ' '.join(map(stemmer.stem, words))
    if len(line) != 0:
        return line.strip()
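In [ ]:
# Hedged example (not from the original data): run processing() on a made-up
# line.  processing() reads a global `punc`, which the next cell builds inside
# the file loop, so a simplified version is defined here first.
punc = ''.join(c for c in string.punctuation if c not in '*+')
print(processing('<p>The Mayor’s 3 new “plans” reviewed - the new york times</p>'))
# roughly: 'mayor new plan review'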
In [ ]:
# Insert directory:
dirname = '/Users/mueric35/Box Sync/nytimes_articles'
for filename in os.listdir(dirname):
    print("Loading: %s" % filename, file=sys.stderr)
    ### Read data as lines
    lines = open(dirname + '/' + filename).readlines()
    ### Word-processing pipeline
    # Build the punctuation string to strip; '*+' is removed from it
    # (presumably so the '*' article-separator lines survive processing).
    punc = nltk.word_tokenize(string.punctuation)
    punc.remove('*+')
    punc = ''.join(punc)
    print('Data Tidying', file=sys.stderr)
    start = timeit.default_timer()
    L = list(map(processing, lines))
    L = [l for l in L if l is not None]
    elapsed = timeit.default_timer() - start
    print('Time elapsed ' + str(round(elapsed/60, 2)) + ' minutes', file=sys.stderr)
    print('Export Text list' + '\n', file=sys.stderr)
    # Split the processed lines into articles at the '************' separators.
    text = []
    index = [index for index, value in enumerate(L) if value == '************']
    j = 0
    for i in index:
        text.append(' '.join(L[j:i-1]))
        j = i + 1
    # Collect the distinct characters appearing in each article.
    collect_letters = []
    for i in text:
        collect_letters.append(list(set(i)))
    # Any character outside a-z and space is a stray symbol to delete.
    normal_letters = list(map(ord, 'abcdefghijklmnopqrstuvwxyz '))
    symbol_ord = []
    for i in range(len(text)):
        symbol_ord += [x for x in map(ord, collect_letters[i]) if x not in normal_letters]
    symbols = {}
    for i in set(symbol_ord):
        symbols.update({i: None})
    Text = []
    for i in text:
        Text.append(i.translate(symbols))
    # Write one tidied article per line.
    f = open('tidy_' + filename, 'w')
    for i in Text:
        f.write(i + '\n')
    f.close()
    print('Completed' + '\n', file=sys.stderr)
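In [ ]:
# Hedged toy illustration (made-up data): how the loop above splits processed
# lines into articles at the '************' separator.
toy = ['article one line a', 'article one line b', 'byline one', '************',
       'article two line a', 'article two line b', 'byline two', '************']
idx = [i for i, v in enumerate(toy) if v == '************']
articles, j = [], 0
for i in idx:
    articles.append(' '.join(toy[j:i-1]))   # the slice drops the line just before each separator, as above
    j = i + 1
print(articles)
# ['article one line a article one line b', 'article two line a article two line b']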
In [6]:
# Read one month of tidied articles (one article per line).
text = open('/Users/zhanghengqian/Dropbox/Duke/fall-2017/IDS/Tidy_data/tidy_article_201607.txt').readlines()
In [ ]:
# Build bigrams within each article, then flatten into a single list.
start = timeit.default_timer()
text_bigram = list(map(lambda x: list(nltk.bigrams(nltk.word_tokenize(x))), text))
bigrm = [bi for Bi in text_bigram for bi in Bi]
elapsed = timeit.default_timer() - start
print(elapsed)
In [ ]:
# Count bigram frequencies.
bigrm_table = Counter(bigrm)
In [ ]:
# Rank bigrams by frequency.
bigrm_table.most_common()
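In [ ]:
# Hedged toy example: how Counter turns a bigram list into a frequency table.
toy_bigrams = list(nltk.bigrams('new york city new york state'.split()))
print(Counter(toy_bigrams).most_common(3))
# [(('new', 'york'), 2), (('york', 'city'), 1), (('city', 'new'), 1)] -- tie order may vary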
In [2]:
from nltk.collocations import *
In [3]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
In [4]:
path = '/Users/zhanghengqian/Dropbox/Duke/fall-2017/IDS/Tidy_data'
os.listdir(path)
Out[4]:
In [5]:
# Use '+' to connect the articles so later bigrams never span two articles.
text = ''
for loc in os.listdir(path):
    lines = open(path + '/' + loc).readlines()
    new_lines = '+ '.join(lines)
    text = text + '+ ' + new_lines
In [6]:
#tokenize the text
word = nltk.word_tokenize(text)
In [167]:
#construct bigram table
finder = BigramCollocationFinder.from_words(word)
In [168]:
# Filter out bigrams containing '+', since we do not want bigrams that span two different articles.
finder.apply_word_filter(lambda w: w == '+')
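In [ ]:
# Hedged toy example: apply_word_filter drops every bigram containing '+',
# so no candidate bigram crosses the '+' boundary between two articles.
toy_finder = BigramCollocationFinder.from_words(['end', 'of', 'one', '+', 'start', 'of', 'two'])
toy_finder.apply_word_filter(lambda w: w == '+')
print(sorted(toy_finder.ngram_fd.items()))
# ('one', '+') and ('+', 'start') are gone; the remaining bigrams stay within one article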
In [117]:
# Optional: filter out low-frequency bigrams.
#finder.apply_freq_filter(50)
In [176]:
# Score each bigram by pointwise mutual information (PMI).
scored = finder.score_ngrams(bigram_measures.pmi)
In [170]:
# Sort the raw frequency table (descending count, then alphabetically).
sorted_freq = sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))
In [177]:
# Turn the score list into a dictionary.
scored_dict = dict(scored)
In [180]:
scored[1:10]
Out[180]:
In [175]:
sorted_freq[0:10]
Out[175]:
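In [ ]:
# Hedged note (toy data, not from the corpus): bigram_measures.pmi scores a pair as
#   PMI(w1, w2) = log2( p(w1, w2) / (p(w1) * p(w2)) ),
# so rare words that always co-occur get high scores, which is why the optional
# frequency filter above can help before ranking by PMI.
toy_finder = BigramCollocationFinder.from_words('a b a b a c'.split())
print(toy_finder.score_ngrams(bigram_measures.pmi))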
In [7]:
# Repeat the workflow with trigrams.
trigram_measures = nltk.collocations.TrigramAssocMeasures()
In [8]:
finder_tri = TrigramCollocationFinder.from_words(word)
In [9]:
# Drop trigrams containing the '+' article separator.
finder_tri.apply_word_filter(lambda w: w == '+')
In [11]:
# Score each trigram by PMI.
scored_tri = finder_tri.score_ngrams(trigram_measures.pmi)
In [ ]:
# Sort the raw trigram frequency table.
sorted_freq_tri = sorted(finder_tri.ngram_fd.items(), key=lambda t: (-t[1], t[0]))
In [ ]:
# Turn the trigram score list into a dictionary.
scored_dict_tri = dict(scored_tri)
In [ ]:
scored_tri[1:10]
In [ ]:
sorted_freq_tri[0:10]