In [58]:
import psycopg2
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
import re
from __future__ import division
from nltk.tag import StanfordNERTagger

In [59]:
conn = psycopg2.connect("dbname='cap' user='postgres' host='ec2-35-163-99-253.us-west-2.compute.amazonaws.com' port=9000 password ='secret'")
df = pd.read_sql_query("SELECT * FROM articles limit 5", conn)

In [60]:
df


Out[60]:
site title author secondary_authors published_on accessed_on url body html newspaper_keywords newspaper_summary id
0 USAToday Seahawks looking at Colin Kaepernick, Robert G... Michael Middlehurst-Schwartz ['P.M. Et May'] 2017-05-15 2017-05-16 10:04:25.859536 http://www.usatoday.com/story/sports/nfl/2017/... CLOSE Skip in Skip x Embed x Share Colin Kaepe... <div><p class="js-video-placeholder video-plac... {backup,iii,robert,team,x,seahawks,looking,tod... CLOSE Skip in Skip x Embed x Share Colin Kaepe... 64766
1 USAToday LaVar Ball shed light on telling Lonzo about h... Andrew Joseph 2017-05-16 2017-05-16 10:04:32.185578 http://ftw.usatoday.com/2017/05/lavar-ball-tin... When the UCLA Bruins were in the heart of Pac-... <div><p>When the UCLA Bruins were in the heart... {telling,son,text,ucla,light,stroke,sons,ball,... When the UCLA Bruins were in the heart of Pac-... 64767
2 USAToday USC's tab for firing Lane Kiffin rose to $6 mi... Steve Berkowitz ['Published P.M. Et May'] 2017-05-15 2017-05-16 10:04:38.461586 http://www.usatoday.com/story/sports/ncaaf/201... CLOSE Skip in Skip x Embed x Share The college... <div><p class="js-video-placeholder video-plac... {2015,firing,million,kiffin,school,total,retur... (Photo: Matt Kartozian, USA TODAY Sports)The U... 64768
3 USAToday 'Dancing with the Stars:' Simone Biles goes ho... Justin Kirkland ['Special To Usa Today', 'Published P.M. Et May'] 2017-05-15 2017-05-16 10:04:41.895141 http://www.usatoday.com/story/life/entertainth... There's something about Dancing with the Stars... <div><p id="module-position-P9JlHC7Wa4I" class... {david,perfect,goes,dancing,rumba,ross,challen... There's something about Dancing with the Stars... 64769
4 USAToday Conservative media not sold on story of Trump ... William Cummings ['Published P.M. Et May'] 2017-05-15 2017-05-16 10:04:52.081025 http://www.usatoday.com/story/news/politics/on... CLOSE Skip in Skip x Embed x Share A bombshell... <div><p class="js-video-placeholder video-plac... {president,sources,youre,report,info,headline,... Just under an hour later, Fox News ran a banne... 64770

Tokenize the article body


In [61]:
tokenized_body = []
for body in df['body']:
    body = body.decode('utf-8')
    tokens = nltk.word_tokenize(body)
    tokenized_body.append(tokens)

In [62]:
se = pd.Series(tokenized_body)
df['tokenized_body'] = se.values

Simple word count


In [63]:
word_count = []
for body in df['tokenized_body']:
    word_count.append(len(body))

In [64]:
se = pd.Series(word_count)
df['word_count'] = se.values

Stopword Removal


In [65]:
stop_words = stopwords.words('english')
stop_words = stop_words + [',', '.', '!', '?', '"','\'', '/', '\\', '-', '--', '—', '(', ')', '[', ']', '\'s', '\'t', '\'ve', '\'d', '\'ll', '\'re']
stop_words = set(stop_words) # making this a set increases performance for large documents

In [66]:
stopworded_body = []
for body in df['tokenized_body']:
    stopworded_body.append([w.lower() for w in body if w not in stop_words])

In [67]:
se = pd.Series(stopworded_body)
df['stopworded_body'] = se.values

Lemmatization: Get the root words for the tokenized and stopworded body text


In [68]:
wnl = nltk.WordNetLemmatizer()
lemmatized_words = []
lemmatized_body = []
for body in df['stopworded_body']:
    # We need to tag words with their parts of speech before the WordNet lemmatizer will work properly
    pos_tagged_body = nltk.pos_tag(body)
    lemmatized_words = []
    for word, tag in pos_tagged_body:
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
        if not wntag:
            lemma = word
        else:
            lemma = wnl.lemmatize(word, wntag)
        lemmatized_words.append(lemma)
    lemmatized_body.append(lemmatized_words)

In [69]:
se = pd.Series(lemmatized_body)
df['lemmatized_body'] = se.values

Bag of Words/Frequency Distribution: Get word count from lemmatized text


In [70]:
word_bag = []
for body in df['lemmatized_body']:
    fdist = FreqDist(body)
    # FreqDist returns a special nltk.probability.FreqDist type
    # This is a list of tuples
    # Here is an example of how to access the elements for future reference
#     print(fdist.most_common())
    # Access an individual tuple
#     print(fdist.most_common()[0])
    # Access the word from the tuple
#     print(fdist.most_common()[0][0])
    # Access the count from the tuple
#     print(fdist.most_common()[0][1])
    # Append to list as ordered frequency distribution
    word_bag.append(fdist.most_common())

In [71]:
se = pd.Series(word_bag)
df['word_bag'] = se.values

Named Entity Extraction using StanfordNLP Classification Model

Stanford NLP named entity extractor requires that you download the jar from https://nlp.stanford.edu/software/CRF-NER.shtml#Download, unzip and extract english.all.3class.distsim.crf.ser.gz and stanford-ner.jar, then provide their file paths to StanfordNERTagger below and you may need to install java8 on ubuntu: https://tecadmin.net/install-oracle-java-8-ubuntu-via-ppa/

In [72]:
st = StanfordNERTagger('/media/justin/Data/Google Drive/Assignments and Projects/Machine Learning/NLP/english.all.3class.distsim.crf.ser.gz',
					   '/media/justin/Data/Google Drive/Assignments and Projects/Machine Learning/NLP/stanford-ner.jar',
					   encoding='utf-8')

In [73]:
classified_texts = []
for body in df['tokenized_body']:
    classified_texts.append(st.tag(body))

# print(classified_text)

Now, if we want to parse the list of tuples returned by the standford classifier into a more easily usable list form, we can take that output, convert it to the standard IOB tag format with stanfordNE2BIO, then parse that into a tree, and traverse the tree to rearrange into a list


In [74]:
from nltk import pos_tag
from nltk.chunk import conlltags2tree
from nltk.tree import Tree

def stanfordNE2BIO(tagged_sent):
    bio_tagged_sent = []
    prev_tag = "O"
    for token, tag in tagged_sent:
        if tag == "O": #O
            bio_tagged_sent.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O": # Begin NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag: # Inside NE
            bio_tagged_sent.append((token, "I-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag: # Adjacent NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag

    return bio_tagged_sent

Now convert the IOB tagged tuples into a tree (this can be called with the original stanfordNERTagger output, skipping the explicit call to convert to IOB format)


In [75]:
def stanfordNE2tree(ne_tagged_sent):
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

    sent_conlltags = [(token, pos, ne) for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)
    return ne_tree

In [76]:
ne_trees = []
for text in classified_texts:
    ne_trees.append(stanfordNE2tree(text))

Finally, join the leaves into a formated list of tuples


In [77]:
ne_in_sent = []
ne_in_sents = []
for tree in ne_trees:
    ne_in_sent = []
    for subtree in tree:
        if type(subtree) == Tree: # If subtree is a noun chunk, i.e. NE != "O"
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            ne_in_sent.append((ne_string, ne_label))
    ne_in_sents.append(ne_in_sent)

In [78]:
se = pd.Series(ne_in_sents)
df['named_entities'] = se.values

Lexical diversity is a measure of the complexity, or sophistication, of a text. A higher number means the text has a richer vocabulary and less repetition of words. If the calculation returns 65.23, for example, that means 65.23% of the total words are distinct.


In [79]:
def lexical_diversity(text):
    return len(set(text)) / len(text) * 100

In [80]:
lex_div = []
for body in df['stopworded_body']:
    lex_div.append(lexical_diversity(body))
    print("lexical diversity: " + str(lexical_diversity(body)))


lexical diversity: 75.0
lexical diversity: 68.9119170984
lexical diversity: 59.3360995851
lexical diversity: 68.3918669131
lexical diversity: 55.6541019956

In [81]:
se = pd.Series(lex_div)
df['lexical_diversity'] = se.values

In [82]:
df


Out[82]:
site title author secondary_authors published_on accessed_on url body html newspaper_keywords newspaper_summary id tokenized_body word_count stopworded_body lemmatized_body word_bag named_entities lexical_diversity
0 USAToday Seahawks looking at Colin Kaepernick, Robert G... Michael Middlehurst-Schwartz ['P.M. Et May'] 2017-05-15 2017-05-16 10:04:25.859536 http://www.usatoday.com/story/sports/nfl/2017/... CLOSE Skip in Skip x Embed x Share Colin Kaepe... <div><p class="js-video-placeholder video-plac... {backup,iii,robert,team,x,seahawks,looking,tod... CLOSE Skip in Skip x Embed x Share Colin Kaepe... 64766 [CLOSE, Skip, in, Skip, x, Embed, x, Share, Co... 376 [close, skip, skip, x, embed, x, share, colin,... [close, skip, skip, x, embed, x, share, colin,... [(kaepernick, 6), (seahawks, 6), (quarterback,... [(Colin Kaepernick, PERSON), (NFL, ORGANIZATIO... 75.000000
1 USAToday LaVar Ball shed light on telling Lonzo about h... Andrew Joseph 2017-05-16 2017-05-16 10:04:32.185578 http://ftw.usatoday.com/2017/05/lavar-ball-tin... When the UCLA Bruins were in the heart of Pac-... <div><p>When the UCLA Bruins were in the heart... {telling,son,text,ucla,light,stroke,sons,ball,... When the UCLA Bruins were in the heart of Pac-... 64767 [When, the, UCLA, Bruins, were, in, the, heart... 375 [when, ucla, bruins, heart, pac-12, play, lonz... [when, ucla, bruin, heart, pac-12, play, lonzo... [(lonzo, 8), (lavar, 5), (tell, 5), (text, 4),... [(UCLA Bruins, ORGANIZATION), (Lonzo Ball, PER... 68.911917
2 USAToday USC's tab for firing Lane Kiffin rose to $6 mi... Steve Berkowitz ['Published P.M. Et May'] 2017-05-15 2017-05-16 10:04:38.461586 http://www.usatoday.com/story/sports/ncaaf/201... CLOSE Skip in Skip x Embed x Share The college... <div><p class="js-video-placeholder video-plac... {2015,firing,million,kiffin,school,total,retur... (Photo: Matt Kartozian, USA TODAY Sports)The U... 64768 [CLOSE, Skip, in, Skip, x, Embed, x, Share, Th... 820 [close, skip, skip, x, embed, x, share, the, c... [close, skip, skip, x, embed, x, share, the, c... [($, 19), (year, 13), (million, 12), (pay, 11)... [(USA, LOCATION), (USC Trojans, ORGANIZATION),... 59.336100
3 USAToday 'Dancing with the Stars:' Simone Biles goes ho... Justin Kirkland ['Special To Usa Today', 'Published P.M. Et May'] 2017-05-15 2017-05-16 10:04:41.895141 http://www.usatoday.com/story/life/entertainth... There's something about Dancing with the Stars... <div><p id="module-position-P9JlHC7Wa4I" class... {david,perfect,goes,dancing,rumba,ross,challen... There's something about Dancing with the Stars... 64769 [There, 's, something, about, Dancing, with, t... 923 [there, something, dancing, stars, semi-finals... [there, something, dance, star, semi-finals, w... [(:, 14), (simone, 9), (dance, 8), (david, 8),... [(David Ross, PERSON), (Chmerkovskiy, PERSON),... 68.391867
4 USAToday Conservative media not sold on story of Trump ... William Cummings ['Published P.M. Et May'] 2017-05-15 2017-05-16 10:04:52.081025 http://www.usatoday.com/story/news/politics/on... CLOSE Skip in Skip x Embed x Share A bombshell... <div><p class="js-video-placeholder video-plac... {president,sources,youre,report,info,headline,... Just under an hour later, Fox News ran a banne... 64770 [CLOSE, Skip, in, Skip, x, Embed, x, Share, A,... 708 [close, skip, skip, x, embed, x, share, a, bom... [close, skip, skip, x, embed, x, share, a, bom... [(trump, 14), (:, 13), (``, 13), ('', 13), (st... [(Washington Post, ORGANIZATION), (USA, LOCATI... 55.654102

TF-IDF


In [ ]: