In [58]:

    
import psycopg2
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
import re
from __future__ import division
from nltk.tag import StanfordNERTagger



In [59]:

    
conn = psycopg2.connect("dbname='cap' user='postgres' host='ec2-35-163-99-253.us-west-2.compute.amazonaws.com' port=9000 password ='secret'")
df = pd.read_sql_query("SELECT * FROM articles limit 5", conn)



In [60]:

    
df









    Out[60]:






  
    
      
      site
      title
      author
      secondary_authors
      published_on
      accessed_on
      url
      body
      html
      newspaper_keywords
      newspaper_summary
      id
    
  
  
    
      0
      USAToday
      Seahawks looking at Colin Kaepernick, Robert G...
      Michael Middlehurst-Schwartz
      ['P.M. Et May']
      2017-05-15
      2017-05-16 10:04:25.859536
      http://www.usatoday.com/story/sports/nfl/2017/...
      CLOSE Skip in Skip x Embed x Share Colin Kaepe...
      <div><p class="js-video-placeholder video-plac...
      {backup,iii,robert,team,x,seahawks,looking,tod...
      CLOSE Skip in Skip x Embed x Share Colin Kaepe...
      64766
    
    
      1
      USAToday
      LaVar Ball shed light on telling Lonzo about h...
      Andrew Joseph
      
      2017-05-16
      2017-05-16 10:04:32.185578
      http://ftw.usatoday.com/2017/05/lavar-ball-tin...
      When the UCLA Bruins were in the heart of Pac-...
      <div><p>When the UCLA Bruins were in the heart...
      {telling,son,text,ucla,light,stroke,sons,ball,...
      When the UCLA Bruins were in the heart of Pac-...
      64767
    
    
      2
      USAToday
      USC's tab for firing Lane Kiffin rose to $6 mi...
      Steve Berkowitz
      ['Published P.M. Et May']
      2017-05-15
      2017-05-16 10:04:38.461586
      http://www.usatoday.com/story/sports/ncaaf/201...
      CLOSE Skip in Skip x Embed x Share The college...
      <div><p class="js-video-placeholder video-plac...
      {2015,firing,million,kiffin,school,total,retur...
      (Photo: Matt Kartozian, USA TODAY Sports)The U...
      64768
    
    
      3
      USAToday
      'Dancing with the Stars:' Simone Biles goes ho...
      Justin Kirkland
      ['Special To Usa Today', 'Published P.M. Et May']
      2017-05-15
      2017-05-16 10:04:41.895141
      http://www.usatoday.com/story/life/entertainth...
      There's something about Dancing with the Stars...
      <div><p id="module-position-P9JlHC7Wa4I" class...
      {david,perfect,goes,dancing,rumba,ross,challen...
      There's something about Dancing with the Stars...
      64769
    
    
      4
      USAToday
      Conservative media not sold on story of Trump ...
      William Cummings
      ['Published P.M. Et May']
      2017-05-15
      2017-05-16 10:04:52.081025
      http://www.usatoday.com/story/news/politics/on...
      CLOSE Skip in Skip x Embed x Share A bombshell...
      <div><p class="js-video-placeholder video-plac...
      {president,sources,youre,report,info,headline,...
      Just under an hour later, Fox News ran a banne...
      64770

Tokenize the article body



In [61]:

    
tokenized_body = []
for body in df['body']:
    body = body.decode('utf-8')
    tokens = nltk.word_tokenize(body)
    tokenized_body.append(tokens)



In [62]:

    
se = pd.Series(tokenized_body)
df['tokenized_body'] = se.values

Simple word count



In [63]:

    
word_count = []
for body in df['tokenized_body']:
    word_count.append(len(body))



In [64]:

    
se = pd.Series(word_count)
df['word_count'] = se.values

Stopword Removal



In [65]:

    
stop_words = stopwords.words('english')
stop_words = stop_words + [',', '.', '!', '?', '"','\'', '/', '\\', '-', '--', '—', '(', ')', '[', ']', '\'s', '\'t', '\'ve', '\'d', '\'ll', '\'re']
stop_words = set(stop_words) # making this a set increases performance for large documents



In [66]:

    
stopworded_body = []
for body in df['tokenized_body']:
    stopworded_body.append([w.lower() for w in body if w not in stop_words])



In [67]:

    
se = pd.Series(stopworded_body)
df['stopworded_body'] = se.values

Lemmatization: Get the root words for the tokenized and stopworded body text



In [68]:

    
wnl = nltk.WordNetLemmatizer()
lemmatized_words = []
lemmatized_body = []
for body in df['stopworded_body']:
    # We need to tag words with their parts of speech before the WordNet lemmatizer will work properly
    pos_tagged_body = nltk.pos_tag(body)
    lemmatized_words = []
    for word, tag in pos_tagged_body:
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
        if not wntag:
            lemma = word
        else:
            lemma = wnl.lemmatize(word, wntag)
        lemmatized_words.append(lemma)
    lemmatized_body.append(lemmatized_words)



In [69]:

    
se = pd.Series(lemmatized_body)
df['lemmatized_body'] = se.values

Bag of Words/Frequency Distribution: Get word count from lemmatized text



In [70]:

    
word_bag = []
for body in df['lemmatized_body']:
    fdist = FreqDist(body)
    # FreqDist returns a special nltk.probability.FreqDist type
    # This is a list of tuples
    # Here is an example of how to access the elements for future reference
#     print(fdist.most_common())
    # Access an individual tuple
#     print(fdist.most_common()[0])
    # Access the word from the tuple
#     print(fdist.most_common()[0][0])
    # Access the count from the tuple
#     print(fdist.most_common()[0][1])
    # Append to list as ordered frequency distribution
    word_bag.append(fdist.most_common())



In [71]:

    
se = pd.Series(word_bag)
df['word_bag'] = se.values

Named Entity Extraction using StanfordNLP Classification Model

Stanford NLP named entity extractor requires that you download the jar from https://nlp.stanford.edu/software/CRF-NER.shtml#Download, unzip and extract english.all.3class.distsim.crf.ser.gz and stanford-ner.jar, then provide their file paths to StanfordNERTagger below and you may need to install java8 on ubuntu: https://tecadmin.net/install-oracle-java-8-ubuntu-via-ppa/



In [72]:

    
st = StanfordNERTagger('/media/justin/Data/Google Drive/Assignments and Projects/Machine Learning/NLP/english.all.3class.distsim.crf.ser.gz',
					   '/media/justin/Data/Google Drive/Assignments and Projects/Machine Learning/NLP/stanford-ner.jar',
					   encoding='utf-8')



In [73]:

    
classified_texts = []
for body in df['tokenized_body']:
    classified_texts.append(st.tag(body))

# print(classified_text)

Now, if we want to parse the list of tuples returned by the standford classifier into a more easily usable list form, we can take that output, convert it to the standard IOB tag format with stanfordNE2BIO, then parse that into a tree, and traverse the tree to rearrange into a list



In [74]:

    
from nltk import pos_tag
from nltk.chunk import conlltags2tree
from nltk.tree import Tree

def stanfordNE2BIO(tagged_sent):
    bio_tagged_sent = []
    prev_tag = "O"
    for token, tag in tagged_sent:
        if tag == "O": #O
            bio_tagged_sent.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O": # Begin NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag: # Inside NE
            bio_tagged_sent.append((token, "I-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag: # Adjacent NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag

    return bio_tagged_sent

Now convert the IOB tagged tuples into a tree (this can be called with the original stanfordNERTagger output, skipping the explicit call to convert to IOB format)



In [75]:

    
def stanfordNE2tree(ne_tagged_sent):
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

    sent_conlltags = [(token, pos, ne) for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)
    return ne_tree



In [76]:

    
ne_trees = []
for text in classified_texts:
    ne_trees.append(stanfordNE2tree(text))

Finally, join the leaves into a formated list of tuples



In [77]:

    
ne_in_sent = []
ne_in_sents = []
for tree in ne_trees:
    ne_in_sent = []
    for subtree in tree:
        if type(subtree) == Tree: # If subtree is a noun chunk, i.e. NE != "O"
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            ne_in_sent.append((ne_string, ne_label))
    ne_in_sents.append(ne_in_sent)



In [78]:

    
se = pd.Series(ne_in_sents)
df['named_entities'] = se.values

Lexical diversity is a measure of the complexity, or sophistication, of a text. A higher number means the text has a richer vocabulary and less repetition of words. If the calculation returns 65.23, for example, that means 65.23% of the total words are distinct.



In [79]:

    
def lexical_diversity(text):
    return len(set(text)) / len(text) * 100



In [80]:

    
lex_div = []
for body in df['stopworded_body']:
    lex_div.append(lexical_diversity(body))
    print("lexical diversity: " + str(lexical_diversity(body)))









    



lexical diversity: 75.0
lexical diversity: 68.9119170984
lexical diversity: 59.3360995851
lexical diversity: 68.3918669131
lexical diversity: 55.6541019956



In [81]:

    
se = pd.Series(lex_div)
df['lexical_diversity'] = se.values



In [82]:

    
df









    Out[82]:






  
    
      
      site
      title
      author
      secondary_authors
      published_on
      accessed_on
      url
      body
      html
      newspaper_keywords
      newspaper_summary
      id
      tokenized_body
      word_count
      stopworded_body
      lemmatized_body
      word_bag
      named_entities
      lexical_diversity
    
  
  
    
      0
      USAToday
      Seahawks looking at Colin Kaepernick, Robert G...
      Michael Middlehurst-Schwartz
      ['P.M. Et May']
      2017-05-15
      2017-05-16 10:04:25.859536
      http://www.usatoday.com/story/sports/nfl/2017/...
      CLOSE Skip in Skip x Embed x Share Colin Kaepe...
      <div><p class="js-video-placeholder video-plac...
      {backup,iii,robert,team,x,seahawks,looking,tod...
      CLOSE Skip in Skip x Embed x Share Colin Kaepe...
      64766
      [CLOSE, Skip, in, Skip, x, Embed, x, Share, Co...
      376
      [close, skip, skip, x, embed, x, share, colin,...
      [close, skip, skip, x, embed, x, share, colin,...
      [(kaepernick, 6), (seahawks, 6), (quarterback,...
      [(Colin Kaepernick, PERSON), (NFL, ORGANIZATIO...
      75.000000
    
    
      1
      USAToday
      LaVar Ball shed light on telling Lonzo about h...
      Andrew Joseph
      
      2017-05-16
      2017-05-16 10:04:32.185578
      http://ftw.usatoday.com/2017/05/lavar-ball-tin...
      When the UCLA Bruins were in the heart of Pac-...
      <div><p>When the UCLA Bruins were in the heart...
      {telling,son,text,ucla,light,stroke,sons,ball,...
      When the UCLA Bruins were in the heart of Pac-...
      64767
      [When, the, UCLA, Bruins, were, in, the, heart...
      375
      [when, ucla, bruins, heart, pac-12, play, lonz...
      [when, ucla, bruin, heart, pac-12, play, lonzo...
      [(lonzo, 8), (lavar, 5), (tell, 5), (text, 4),...
      [(UCLA Bruins, ORGANIZATION), (Lonzo Ball, PER...
      68.911917
    
    
      2
      USAToday
      USC's tab for firing Lane Kiffin rose to $6 mi...
      Steve Berkowitz
      ['Published P.M. Et May']
      2017-05-15
      2017-05-16 10:04:38.461586
      http://www.usatoday.com/story/sports/ncaaf/201...
      CLOSE Skip in Skip x Embed x Share The college...
      <div><p class="js-video-placeholder video-plac...
      {2015,firing,million,kiffin,school,total,retur...
      (Photo: Matt Kartozian, USA TODAY Sports)The U...
      64768
      [CLOSE, Skip, in, Skip, x, Embed, x, Share, Th...
      820
      [close, skip, skip, x, embed, x, share, the, c...
      [close, skip, skip, x, embed, x, share, the, c...
      [($, 19), (year, 13), (million, 12), (pay, 11)...
      [(USA, LOCATION), (USC Trojans, ORGANIZATION),...
      59.336100
    
    
      3
      USAToday
      'Dancing with the Stars:' Simone Biles goes ho...
      Justin Kirkland
      ['Special To Usa Today', 'Published P.M. Et May']
      2017-05-15
      2017-05-16 10:04:41.895141
      http://www.usatoday.com/story/life/entertainth...
      There's something about Dancing with the Stars...
      <div><p id="module-position-P9JlHC7Wa4I" class...
      {david,perfect,goes,dancing,rumba,ross,challen...
      There's something about Dancing with the Stars...
      64769
      [There, 's, something, about, Dancing, with, t...
      923
      [there, something, dancing, stars, semi-finals...
      [there, something, dance, star, semi-finals, w...
      [(:, 14), (simone, 9), (dance, 8), (david, 8),...
      [(David Ross, PERSON), (Chmerkovskiy, PERSON),...
      68.391867
    
    
      4
      USAToday
      Conservative media not sold on story of Trump ...
      William Cummings
      ['Published P.M. Et May']
      2017-05-15
      2017-05-16 10:04:52.081025
      http://www.usatoday.com/story/news/politics/on...
      CLOSE Skip in Skip x Embed x Share A bombshell...
      <div><p class="js-video-placeholder video-plac...
      {president,sources,youre,report,info,headline,...
      Just under an hour later, Fox News ran a banne...
      64770
      [CLOSE, Skip, in, Skip, x, Embed, x, Share, A,...
      708
      [close, skip, skip, x, embed, x, share, a, bom...
      [close, skip, skip, x, embed, x, share, a, bom...
      [(trump, 14), (:, 13), (``, 13), ('', 13), (st...
      [(Washington Post, ORGANIZATION), (USA, LOCATI...
      55.654102

TF-IDF



In [ ]:

	site	title	author	secondary_authors	published_on	accessed_on	url	body	html	newspaper_keywords	newspaper_summary	id
0	USAToday	Seahawks looking at Colin Kaepernick, Robert G...	Michael Middlehurst-Schwartz	['P.M. Et May']	2017-05-15	2017-05-16 10:04:25.859536	http://www.usatoday.com/story/sports/nfl/2017/...	CLOSE Skip in Skip x Embed x Share Colin Kaepe...	<div><p class="js-video-placeholder video-plac...	{backup,iii,robert,team,x,seahawks,looking,tod...	CLOSE Skip in Skip x Embed x Share Colin Kaepe...	64766
1	USAToday	LaVar Ball shed light on telling Lonzo about h...	Andrew Joseph		2017-05-16	2017-05-16 10:04:32.185578	http://ftw.usatoday.com/2017/05/lavar-ball-tin...	When the UCLA Bruins were in the heart of Pac-...	<div><p>When the UCLA Bruins were in the heart...	{telling,son,text,ucla,light,stroke,sons,ball,...	When the UCLA Bruins were in the heart of Pac-...	64767
2	USAToday	USC's tab for firing Lane Kiffin rose to $6 mi...	Steve Berkowitz	['Published P.M. Et May']	2017-05-15	2017-05-16 10:04:38.461586	http://www.usatoday.com/story/sports/ncaaf/201...	CLOSE Skip in Skip x Embed x Share The college...	<div><p class="js-video-placeholder video-plac...	{2015,firing,million,kiffin,school,total,retur...	(Photo: Matt Kartozian, USA TODAY Sports)The U...	64768
3	USAToday	'Dancing with the Stars:' Simone Biles goes ho...	Justin Kirkland	['Special To Usa Today', 'Published P.M. Et May']	2017-05-15	2017-05-16 10:04:41.895141	http://www.usatoday.com/story/life/entertainth...	There's something about Dancing with the Stars...	<div><p id="module-position-P9JlHC7Wa4I" class...	{david,perfect,goes,dancing,rumba,ross,challen...	There's something about Dancing with the Stars...	64769
4	USAToday	Conservative media not sold on story of Trump ...	William Cummings	['Published P.M. Et May']	2017-05-15	2017-05-16 10:04:52.081025	http://www.usatoday.com/story/news/politics/on...	CLOSE Skip in Skip x Embed x Share A bombshell...	<div><p class="js-video-placeholder video-plac...	{president,sources,youre,report,info,headline,...	Just under an hour later, Fox News ran a banne...	64770