In [2]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, cm
from matplotlib.path import Path
matplotlib.style.use('ggplot')
import math
import time
import datetime
import json
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
import string
regex = re.compile('[%s]' % re.escape(string.punctuation))
In [3]:
#version = ""
version = "_07_07_2016"
with open('amis_articles{0}.jsonl'.format(version)) as f:
    # one JSON object per line; build a DataFrame of articles
    articles = pd.DataFrame([json.loads(line) for line in f])
articles['date'] = pd.to_datetime(articles['date'])
articles['timestamp'] = articles['date'].apply(lambda d: time.mktime(d.timetuple()))
articles = articles.sort_values('date', ascending=True)
articles['raw_article'] = articles['article']
sources = list(articles['source'].unique())
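A quick sanity check on the parsed frame (an illustrative sketch; it relies only on the source, date, and article columns created above).
In [ ]:
# how many articles were loaded, and from which sources
print(articles.shape)
print(articles['source'].value_counts())
articles[['source', 'date', 'article']].head()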
In [12]:
# process the articles: tokenize, strip punctuation, lowercase, drop stopwords
english_stopwords = set(stopwords.words('english'))

def clean_and_tokenize_article(article):
    tokenized_article = word_tokenize(article)
    tokenized_article = [regex.sub(u'', token).lower() for token in tokenized_article]
    tokenized_article = [token for token in tokenized_article
                         if token not in english_stopwords and token != u'']
    return tokenized_article

articles['article'] = articles['raw_article'].apply(clean_and_tokenize_article)
articles['article'].head(5)
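A small sketch to spot-check the cleaning step: count the most frequent tokens across the cleaned articles (collections.Counter is used here only for inspection).
In [ ]:
from collections import Counter

# tally token frequencies over all cleaned articles and show the top 20
token_counts = Counter()
for tokens in articles['article']:
    token_counts.update(tokens)
token_counts.most_common(20)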
In [11]:
def define_sentiment(article, sid=SentimentIntensityAnalyzer()):
    # score each sentence with VADER, then average the scores over the article
    sentences = nltk.tokenize.sent_tokenize(article)
    cumulative = {'compound': 0.0, 'neg': 0.0, 'neu': 0.0, 'pos': 0.0}
    for sentence in sentences:
        ss = sid.polarity_scores(sentence)
        for key in cumulative.keys():
            cumulative[key] += ss[key]
    for key in cumulative.keys():
        cumulative[key] /= max(len(sentences), 1)  # guard against empty articles
    return cumulative
articles['sentiment'] = articles['raw_article'].apply(define_sentiment)
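One way to inspect the sentiment output (a sketch, assuming the compound/neg/neu/pos keys returned by VADER's polarity_scores): expand the per-article dicts into columns and compare the mean compound score per source.
In [ ]:
# expand the sentiment dicts into separate numeric columns
sentiment_scores = articles['sentiment'].apply(pd.Series)
articles = pd.concat([articles, sentiment_scores], axis=1)

# mean compound sentiment per source
articles.groupby('source')['compound'].mean().plot(kind='bar')
plt.ylabel('mean compound sentiment')
plt.show()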