In [2]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, cm
from matplotlib.path import Path

matplotlib.style.use('ggplot')

import math
import time
import datetime
import json

from pandas import DataFrame, Series
import pandas as pd
import numpy as np

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import re
import string
# matches any single punctuation character; used to strip punctuation from tokens
regex = re.compile('[%s]' % re.escape(string.punctuation))


[nltk_data] Downloading package punkt to /Users/mrpozzi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mrpozzi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mrpozzi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!

In [3]:
#version = ""
version = "_07_07_2016"
with open('amis_articles{0}.jsonl'.format(version)) as f:
    articles = pd.DataFrame(json.loads(line) for line in f)

articles['date'] = pd.to_datetime(articles['date'])
articles['timestamp'] = articles['date'].apply(lambda d: time.mktime(d.timetuple()))
articles = articles.sort('date', ascending=1)

articles['raw_article'] = articles['article'] 

sources = list(articles['source'].unique())
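
A quick sanity check on the loaded data helps catch scraping issues early. The cell below is only an illustrative sketch using the columns created above: it counts articles per source and prints the date range covered.

In [ ]:
# sanity check: article counts per source and the date range covered
print(articles['source'].value_counts())
print(articles['date'].min(), articles['date'].max())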


In [12]:
# process the articles
# precompute the stopword set once; looking it up per token is what makes this slow otherwise
stop_words = set(stopwords.words('english'))

def clean_and_tokenize_article(article):
    tokenized_article = word_tokenize(article)
    # strip punctuation and lowercase each token
    tokenized_article = [regex.sub(u'', token).lower() for token in tokenized_article]
    # drop stopwords and tokens that became empty after stripping punctuation
    tokenized_article = [token for token in tokenized_article if token not in stop_words and token != u'']
    return tokenized_article

articles['article'] = articles['raw_article'].apply(clean_and_tokenize_article)
articles['article'].head(5)


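With the tokenized articles in place, a cheap frequency count is a useful check that the cleaning step behaved as intended. This is an illustrative sketch, not part of the original analysis, and assumes the 'article' column produced above.

In [ ]:
# illustrative check: most common tokens across the whole corpus
from collections import Counter

token_counts = Counter()
for tokens in articles['article']:
    token_counts.update(tokens)
token_counts.most_common(20)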


In [11]:
# score each article by averaging VADER sentence-level sentiment
sid = SentimentIntensityAnalyzer()

def define_sentiment(article):
    sentences = nltk.tokenize.sent_tokenize(article)
    cumulative = {'compound': 0.0, 'neg': 0.0, 'neu': 0.0, 'pos': 0.0}
    # guard against empty articles to avoid division by zero
    if not sentences:
        return cumulative
    for sentence in sentences:
        ss = sid.polarity_scores(sentence)
        for key in cumulative:
            cumulative[key] += ss[key]
    for key in cumulative:
        cumulative[key] /= len(sentences)
    return cumulative

articles['sentiment'] = articles['raw_article'].apply(define_sentiment)
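
The 'sentiment' column now holds one dict of VADER scores per article. A possible next step, sketched below rather than taken from the original notebook, is to expand those dicts into separate columns so they can be grouped and plotted, e.g. the mean compound score by source.

In [ ]:
# expand the per-article sentiment dicts into their own columns (compound, neg, neu, pos)
sentiment_df = articles['sentiment'].apply(pd.Series)
articles = pd.concat([articles, sentiment_df], axis=1)

# example: mean compound sentiment by source
articles.groupby('source')['compound'].mean().plot(kind='bar')
plt.ylabel('mean compound sentiment')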

In [14]:
ss


Out[14]:
{'compound': 0.4404, 'neg': 0.0, 'neu': 0.923, 'pos': 0.077}
