In [109]:
%matplotlib inline

#import required packages
import sys
import datetime
import csv
import math
import pandas as pd 
import numpy as np 
from scipy import stats 
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from mpltools import style
from mpltools import layout
from pandas.tools.plotting import autocorrelation_plot

In [110]:
#import news from pig
news = pd.read_csv('extracted_topics', sep='	', names=['CountryID', 'SequenceID', 'Timestamp','Title','Story','Keywords','Country','Region'])

In [111]:
#parse the date
news['Timestamp'] = pd.to_datetime(news['Timestamp'].str[:10], format = '%Y-%m-%d')
news['Timestamp'] = news['Timestamp'].values.astype('M8[D]')
news.head()


Out[111]:
CountryID SequenceID Timestamp Title Story Keywords Country Region
0 74 -22068 1939-08-01 World donors urge power-sharing deal for Somalia International donors urged Somalia's interim g... {(enjoyed broad public support,14.5),(host nat... Somalia ESARO
1 27 -16464 1954-12-04 NaN In Northern Bahr El Ghazal and Warrap States, ... {(northern bahr el ghazal,16.0),(increasing ci... Sudan MENA
2 27 -3866 1989-06-01 NaN A joint UN and AU team led by Jean-Marie Guehe... {(au team led,7.0),(secretary general,4.0),(je... Sudan MENA
3 60 -582 1998-05-29 NaN Eritrea on May 28, 1998, mobilised thousands o... {(border area,4.0),(reinforce troops,4.0),(mob... Eritrea ESARO
4 145 298 2000-10-26 NaN militants du FPI." {(militants du fpi,9.0)} Cote d'Ivoire WCARO

In [112]:
#for each topic, get the dates where it occurs and its country
def getDateCount(topic) :
    filteredNews = news[news['Keywords'].str.contains(topic)]
    dates = filteredNews['Timestamp'].tolist()
    datesCount = {}
    for date in dates :
        if date not in datesCount:
            datesCount[date] = 1
        else :
            datesCount[date] += 1
    return datesCount

In [113]:
#generate graph
def drawDateCount(datesCount):
    x = []
    y = []
    
    for dates in datesCount:
        x.append(np.datetime64(dates))
        y.append(datesCount[dates])
        
    style.use('ggplot')
    plt.figure(figsize=(12,8))
    plt.plot(x, y, color = "SteelBlue",label = 'Taxi Speed')
    plt.xlabel('Date')
    plt.ylabel('Volume')
    plt.title('Taxi Volume per Hour in 2010')
    plt.legend()
    plt.show()

In [129]:
#load treasury data (and percent changes)

treasury = pd.read_csv('treasury.csv', names=['Date', 'PercentChange'],header=True, parse_dates=True)
treasury['Date'] = pd.to_datetime(treasury['Date'].str[:10], format = '%Y-%m-%d')
treasury['PercentChange'] = treasury['PercentChange'].convert_objects(convert_numeric=True)
treasury = treasury.set_index(pd.DatetimeIndex(treasury['Date']))
treasury.head()


Out[129]:
Date PercentChange
2000-01-04 2000-01-04 -1.36778
2000-01-05 2000-01-05 2.00308
2000-01-06 2000-01-06 -0.75529
2000-01-07 2000-01-07 -0.76104
2000-01-10 2000-01-10 0.76687

In [ ]:
frequentTopics = ["minister","people","government","police","security"]

def getChangeAvg(dates):
    valueList = []
    for date in dates :
        newDate = date.strftime('%Y-%m-%d')
        indexList = treasury[treasury['Date']== newDate].index.tolist()
        newTre = treasury.loc[indexList]
        newlist = newTre['PercentChange'].tolist()
        print np.mean(newlist)
        

for topic in frequentTopics :
    dates = getDateCount(topic)
    getChangeAvg(dates)

In [ ]: