In [55]:
#*****************PROJECT YARK*********************/
#* Ariel Boris Dexter bad225@nyu.edu */
#* Kania Azrina ka1531@nyu.edu */
#* Michael Rawson mr4209 */
#* Yixue Wang yw1819@nyu.edu */
#**************************************************/
%matplotlib inline
#import required packages
import sys
import datetime
import csv
import math
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from mpltools import style
from mpltools import layout
from pandas.tools.plotting import autocorrelation_plot
import json
In [234]:
# Load the per-article topic extraction (the original note says it comes "from pig",
# presumably an Apache Pig job output -- TODO confirm).
# NOTE(review): the Keywords column is split on single spaces further down, so a
# single-space column separator looks suspicious here -- confirm the real delimiter
# (possibly a tab that was lost when the notebook was exported).
# NOTE(review): header=True is a bool rather than a row number; how pandas interprets
# it is version dependent (newer releases appear to reject it) -- confirm whether
# header=0 was intended.
news = pd.read_csv('../data/extracted_topics_refined', sep=' ', names=['CountryID', 'SequenceID', 'Timestamp',
'Title','Story','Keywords','Country','Region', 'Count'],header=True)
# Keep only the first 10 characters of the timestamp (the YYYY-MM-DD part) and parse them as dates.
news['Timestamp'] = pd.to_datetime(news['Timestamp'].str[:10], format = '%Y-%m-%d')
# Cast to day-resolution datetime64 values.
news['Timestamp'] = news['Timestamp'].values.astype('M8[D]')
# Turn the keyword field into a list of string tokens; after astype(str) a missing
# value becomes the token 'nan', which the conversion cell below checks for.
news['Keywords'] = news['Keywords'].astype(str)
news['Keywords'] = news['Keywords'].apply(lambda x :x.split(" "))
# Load the topic table (ID, Topic, Count); presumably sorted by frequency, judging by the file name.
news_count = pd.read_csv('../data/keyword_list_sorted.csv', sep=',', names=['ID', 'Topic', 'Count'],header=True)
# Load the treasury series: one date and one percent-change value per row.
treasury = pd.read_csv('../data/treasury.csv', names=['Date', 'PercentChange'],header=True, parse_dates=True)
# Same normalisation as for the news timestamps: keep the YYYY-MM-DD prefix and parse it.
treasury['Date'] = pd.to_datetime(treasury['Date'].str[:10], format = '%Y-%m-%d')
# Coerce the percent change to numeric.
# NOTE(review): convert_objects is deprecated in later pandas versions in favour of
# pd.to_numeric -- confirm the pandas version this notebook targets before changing it.
treasury['PercentChange'] = treasury['PercentChange'].convert_objects(convert_numeric=True)
# Index the frame by date; the 'Date' column is kept as well and is used for lookups later.
treasury = treasury.set_index(pd.DatetimeIndex(treasury['Date']))
In [235]:
# Preview the first rows of the treasury frame to sanity-check the load.
treasury.head()
Out[235]:
In [236]:
# Preview the first rows of the news frame to sanity-check the load.
news.head()
Out[236]:
In [251]:
# Build, for every article, the list of its integer topic ids.
# Only the first five keyword tokens of an article are used; the 'nan'
# token (a missing keyword field) is encoded as topic id 0.
keywords_list = news['Keywords'].tolist()
keywords_list_int = [
    [0 if token == 'nan' else int(token) for token in tokens[:5]]
    for tokens in keywords_list
]
In [251]:
In [252]:
#for each topic, count how many articles mention it on each date
def getDateCount(topic):
    """Count, per publication date, the articles tagged with ``topic``.

    Reads two notebook-level objects: ``keywords_list_int`` (one list of
    integer topic ids per article) and ``news`` (whose ``Timestamp`` column
    holds each article's date, in the same row order as the keyword lists).

    Parameters
    ----------
    topic : int
        Topic id to look for in each article's keyword list.

    Returns
    -------
    dict
        Maps each date on which the topic occurs to the number of articles
        mentioning it on that date. Empty if the topic never occurs.
    """
    timestamps = news['Timestamp']
    datesCount = {}
    # enumerate() yields each article's own position. Looking the position up
    # with list.index() would return the FIRST article with an equal keyword
    # list, crediting every duplicate to the wrong date (and costing O(n^2)).
    for position, keywords in enumerate(keywords_list_int):
        if topic in keywords:
            date = timestamps.iloc[position]
            datesCount[date] = datesCount.get(date, 0) + 1
    return datesCount
In [253]:
def getChangeAvg(dates):
    """Average the treasury percent change over the given dates.

    Reads the notebook-level ``treasury`` frame (columns ``Date`` and
    ``PercentChange``).

    Parameters
    ----------
    dates : iterable of date-like objects supporting ``strftime``
        Typically the dict returned by ``getDateCount``; only its keys are
        iterated, so each date contributes once regardless of its count.

    Returns
    -------
    float
        Mean of the first ``PercentChange`` found for each date that has a
        treasury row, ignoring NaN values. NaN when no date matches or every
        matched value is NaN.
    """
    valueList = []
    for date in dates:
        # Compare on the calendar day only, as a YYYY-MM-DD string.
        newDate = date.strftime('%Y-%m-%d')
        # .values avoids index alignment on the boolean mask (the frame's index may repeat dates).
        matches = treasury.loc[(treasury['Date'] == newDate).values, 'PercentChange']
        if len(matches) != 0:
            valueList.append(matches.iloc[0])
    values = np.asarray(valueList, dtype=float)
    # np.nanmean warns ("Mean of empty slice") on empty or all-NaN input;
    # return the same NaN result explicitly instead.
    if values.size == 0 or np.isnan(values).all():
        return np.nan
    return np.nanmean(values)
In [ ]:
# Topic ids in the order they appear in news_count.
frequentTopics = news_count['ID'].tolist()
#for each topic, get the average percent change over the dates on which it occurs
topic_avg = [getChangeAvg(getDateCount(topic)) for topic in frequentTopics]
In [249]:
In [249]:
#store to csv
# Keep the first 100 rows of news_count (presumably the most frequent topics,
# given the input file is named "sorted"). Copy the slice so that adding a
# column does not write into a view of news_count (SettingWithCopy).
news_count_change = news_count[:100].copy()
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(len(news_count_change))
# topic_avg holds one average per row of news_count, in the same order, so
# trim it to the rows kept above; assigning the full list would raise a
# length-mismatch error whenever news_count has more than 100 rows.
news_count_change['Changes'] = topic_avg[:len(news_count_change)]
news_count_change.to_csv('../data/keyword_list_count.csv',index=False)
In [ ]: