In [1]:
import sys
import string
import time
import timeit
import re
import nltk
import os

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import Counter

# Stemming helper: stem every token in a list with the given stemmer
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

stemmer = PorterStemmer()

# English stopword list
dict_stop = stopwords.words('english')
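
A quick check of the stemming helper on a few illustrative words (made up, not from the corpus):

In [ ]:
# Porter stemming reduces inflected forms to their stems,
# yielding roughly ['run', 'articl', 'hous']
stem_tokens(['running', 'articles', 'houses'], stemmer)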

Text Tidying Pipeline


In [ ]:
def processing(line):
    """Clean one raw line and return it stemmed, or None if nothing is left.

    Relies on the module-level `punc` string built in the next cell.
    """
    line = line.lower().strip()
    line = re.sub(r'<(.*?)>', ' ', line)                   # drop HTML-style tags
    line = re.sub(r'[\w\d]{20,}', '', line)                # drop absurdly long tokens
    line = re.sub(r'\d', '', line)                         # drop digits
    line = re.sub(r'\.', '', line)                         # drop periods
    line = re.sub(r'- the new york times', '', line)       # drop the byline suffix
    line = line.translate(dict.fromkeys(map(ord, punc)))   # strip punctuation listed in `punc`
    # strip typographic quotes, bullets and dashes left over after the translate step
    line = line.replace('"', '').replace('“', '').replace('”', '').replace('’', '').replace('•', '').replace('‘', '').replace('■', '').replace('—', '')
    words = [word for word in nltk.word_tokenize(line) if word not in dict_stop]
    line = ' '.join(map(stemmer.stem, words))
    if len(line) != 0:
        return line.strip()
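
A quick illustration of what `processing` produces on a made-up input line (`punc` must already be built as in the next cell):

In [ ]:
# Illustrative only: expects the module-level `punc` from the next cell
processing('The <b>White House</b> issued 3 statements today. - The New York Times')
# roughly: 'white hous issu statement today'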

In [ ]:
# Insert directory:
dirname = '/Users/mueric35/Box Sync/nytimes_articles'

for filename in os.listdir(dirname):
    
    print("Loading: %s" % filename,file=sys.stderr)
    
    ### Read Data as lines
    lines = open(dirname + '/' + filename).readlines()
    
    ### Words Processing Pipeline
    punc = nltk.word_tokenize(string.punctuation)
    punc.remove('*+')   # keep '*' (and '+') so the asterisk separator lines survive
    punc = ''.join(punc)
    
        
    print('Data Tidying', file = sys.stderr)
    start = timeit.default_timer()
    L = list(map(processing,lines))
    L = [l for l in L if l is not None]

    elapsed = timeit.default_timer() - start
    print('Time elapsed ' + str(round(elapsed/60,2)) + ' minutes', file = sys.stderr)

    print('Export Text list'+'\n', file = sys.stderr)
    text = []
    # Articles are delimited by lines of asterisks; join the lines of each article
    index = [idx for idx, value in enumerate(L) if value == '************']
    j = 0
    for i in index:
        text.append(' '.join(L[j:i-1]))
        j = i+1
    
    # Record the set of characters appearing in each tidy article
    collect_letters = []
    for i in text:
        collect_letters.append(list(set(i)))

    # Code points we want to keep: lowercase letters and space
    normal_letters = list(map(ord, 'abcdefghijklmnopqrstuvwxyz '))

    # Collect the code points of every other character that slipped through
    symbol_ord = []
    for i in range(len(text)):
        symbol_ord += [x for x in list(map(ord,collect_letters[i])) if x not in normal_letters]

    # Map each leftover symbol to None so str.translate deletes it
    symbols = {}
    for i in set(symbol_ord):
        symbols.update({i:None})

    Text = []
    for i in text:
        Text.append(i.translate(symbols))

    
    with open('tidy_' + filename, 'w') as f:
        for i in Text:
            f.write(i + '\n')

    print('Completed' + '\n', file = sys.stderr)
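
A quick sanity check (illustrative): preview the first article of one of the tidy files written above.

In [ ]:
# Peek at the first tidy article of the first output file (written to the working directory)
with open('tidy_' + os.listdir(dirname)[0]) as f:
    print(f.readline()[:200])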

Bigrams


In [6]:
# Read one month of tidy articles back in; each line is one article
text = open('/Users/zhanghengqian/Dropbox/Duke/fall-2017/IDS/Tidy_data/tidy_article_201607.txt').readlines()

In [ ]:
start = timeit.default_timer()
text_bigram = list(map(lambda x: list(nltk.bigrams(nltk.word_tokenize(x))),text))
bigrm = [bi for Bi in text_bigram for bi in Bi]
elapsed = timeit.default_timer() - start
print(elapsed)

Count the bigrams


In [ ]:
bigrm_table = Counter(bigrm)

In [ ]:
bigrm_table.most_common()

Bigrams using NLTK


In [2]:
from nltk.collocations import *

In [3]:
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [4]:
path = '/Users/zhanghengqian/Dropbox/Duke/fall-2017/IDS/Tidy_data'
os.listdir(path)


Out[4]:
['tidy_article_201607.txt',
 'tidy_article_201608.txt',
 'tidy_article_201609.txt',
 'tidy_article_201610.txt',
 'tidy_article_201611.txt',
 'tidy_article_201612.txt',
 'tidy_article_201701.txt',
 'tidy_article_201702.txt',
 'tidy_article_201703.txt',
 'tidy_article_201704.txt',
 'tidy_article_201705.txt',
 'tidy_article_201706.txt']

In [5]:
# Join all tidy files into one string, inserting '+' between lines so that
# no bigram can span two different articles
text = ''
for loc in os.listdir(path):
    lines = open(path + '/' + loc).readlines()
    new_lines = '+ '.join(lines)
    text = text + '+ ' + new_lines

In [6]:
#tokenize the text
word = nltk.word_tokenize(text)

In [167]:
#construct bigram table
finder = BigramCollocationFinder.from_words(word)

In [168]:
# Filter out bigrams containing '+', since we do not want bigrams that span two different articles
finder.apply_word_filter(lambda w: w == '+')

In [117]:
# Optional: filter out low-frequency bigrams
#finder.apply_freq_filter(50)

In [176]:
# Score each bigram by its pointwise mutual information (PMI)
scored = finder.score_ngrams(bigram_measures.pmi)

In [170]:
# Sort the raw frequency table (descending count, ties broken alphabetically)
sorted_freq = sorted(finder.ngram_fd.items(),key = lambda t: (-t[1],t[0]))

In [177]:
# Turn the score list into a dictionary for direct lookup
scored_dict = dict(scored)
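
With the dictionary, the PMI of a specific stemmed bigram can be looked up directly; ('new', 'york'), for example, appears in the frequency table further below.

In [ ]:
# Direct PMI lookup for one stemmed bigram
scored_dict[('new', 'york')]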

In [180]:
scored[1:10]


Out[180]:
[(('aakaash', 'israni'), 24.601038623241983),
 (('aamodt', 'kild'), 24.601038623241983),
 (('aaronel', 'deroy'), 24.601038623241983),
 (('aatish', 'taseer'), 24.601038623241983),
 (('abdelkhad', 'elmouaziz'), 24.601038623241983),
 (('abdulsattar', 'alhabu'), 24.601038623241983),
 (('abdussam', 'ambzhthing'), 24.601038623241983),
 (('abdygani', 'shakirov'), 24.601038623241983),
 (('abenaa', 'afrakoma'), 24.601038623241983)]

In [175]:
sorted_freq[0:10]


Out[175]:
[(('mr', 'trump'), 55049),
 (('new', 'york'), 46078),
 (('unit', 'state'), 35635),
 (('last', 'year'), 15741),
 (('white', 'hous'), 11798),
 (('york', 'time'), 11144),
 (('said', 'mr'), 11022),
 (('mr', 'clinton'), 11018),
 (('year', 'ago'), 10195),
 (('last', 'week'), 8100)]

Trigrams using NLTK


In [7]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [8]:
finder_tri = TrigramCollocationFinder.from_words(word)

In [9]:
finder_tri.apply_word_filter(lambda w: w == '+')

In [11]:
scored_tri = finder_tri.score_ngrams(trigram_measures.pmi)

In [ ]:
sorted_freq_tri = sorted(finder_tri.ngram_fd.items(),key = lambda t: (-t[1],t[0]))

In [ ]:
scored_dict_tri = dict(scored_tri)

In [ ]:
scored_tri[1:10]

In [ ]:
sorted_freq_tri[0:10]
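
As the bigram PMI output above suggests, raw PMI ranks one-off name pairs at the top. A possible follow-up (a sketch, not run here) is to drop rare trigrams and rescore with the likelihood-ratio measure, which is less dominated by rare events:

In [ ]:
# Sketch: filter out trigrams seen fewer than 50 times, then rescore
finder_tri.apply_freq_filter(50)
scored_tri_lr = finder_tri.score_ngrams(trigram_measures.likelihood_ratio)
scored_tri_lr[0:10]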