In [28]:
#*****************PROJECT YARK*********************/
#* Ariel Boris Dexter bad225@nyu.edu */
#* Kania Azrina ka1531@nyu.edu       */
#* Michael Rawson mr4209             */
#* Yixue Wang yw1819@nyu.edu         */
#**************************************************/

%matplotlib inline

#import required packages
import sys
import datetime
import csv
import math
import pandas as pd 
import numpy as np 
from scipy import stats 
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from mpltools import style
from mpltools import layout
from pandas.tools.plotting import autocorrelation_plot
import json

In [31]:
# Load the Pig-extracted topics dump (tab-separated, headerless) into a DataFrame.
news_columns = ['CountryID', 'SequenceID', 'Timestamp', 'Title',
                'Story', 'Keywords', 'Country', 'Region']
news = pd.read_csv("../data/extracted_topics", sep='\t', names=news_columns)

In [32]:
#parse the date
# Keep only the 'YYYY-MM-DD' prefix of the raw timestamp string and parse it.
news['Timestamp'] = pd.to_datetime(news['Timestamp'].str[:10], format = '%Y-%m-%d')
# Truncate to day precision via numpy datetime64[D].
# NOTE(review): after the parse above the values are already at midnight,
# so this cast looks redundant — confirm before removing.
news['Timestamp'] = news['Timestamp'].values.astype('M8[D]')

In [33]:
#for each topic, get the dates where it occurs and its country
def getDateCount(topic):
    """Count, per day, how many news stories mention `topic`.

    Parameters
    ----------
    topic : str
        Literal substring searched for in the 'Keywords' column of the
        module-level `news` DataFrame.

    Returns
    -------
    dict
        Maps each Timestamp value to the number of matching stories on that day.
    """
    # regex=False: treat the topic as a literal substring so regex
    # metacharacters (e.g. '.', '(') cannot alter or break the match.
    # na=False: rows with missing keywords simply don't match instead of
    # producing NaN in the boolean mask.
    filteredNews = news[news['Keywords'].str.contains(topic, regex=False, na=False)]
    datesCount = {}
    for date in filteredNews['Timestamp']:
        datesCount[date] = datesCount.get(date, 0) + 1
    return datesCount

In [34]:
#load treasury data (and percent changes)
# header=0 (not the invalid header=True) skips the file's own header row so
# the supplied column names take effect.
treasury = pd.read_csv('../data/treasury.csv', names=['Date', 'PercentChange'], header=0, parse_dates=True)
treasury['Date'] = pd.to_datetime(treasury['Date'].str[:10], format='%Y-%m-%d')
# convert_objects() was removed from pandas; to_numeric(errors='coerce')
# gives the same "convert what you can, NaN the rest" behaviour.
treasury['PercentChange'] = pd.to_numeric(treasury['PercentChange'], errors='coerce')
# Index by date so the series can be aligned/resampled against the news data.
treasury = treasury.set_index(pd.DatetimeIndex(treasury['Date']))

In [35]:
# Global keyword registry: keyword -> {'count': occurrences, 'id': stable int id}.
keyword_dict = {}
# Next id to hand out; ids are assigned in first-seen order.
id_counter = 0

sample_keywords = "{(enjoyed broad public support,14.5),(host nation sweden acknowledged,13.5),(international donors urged somalia,12.833333333333334),(international contact group,8.166666666666668),(rebuild nation,5.0),(formal donors,5.0),(contact group,4.666666666666667),(large swathes,4.0),(planned talks,4.0),(us-backed warlords,4.0),(interim government,4.0),(european nations,4.0),(strong signal,4.0),(african states,4.0),(capital mogadishu,4.0),(arab league,4.0),(power-sharing deal,4.0),(somalia meeting,3.8333333333333335),(sweden,2.5),(support,2.5),(somalia,2.3333333333333335),(meeting,1.5),(seized,1.0),(rome,1.0),(year,1.0),(strike,1.0),(includes,1.0),(stockholm,1.0),(give,1.0),(wanted,1.0),(hold,1.0),(led,1.0),(south,1.0),(soder,1.0),(tuesday,1.0),(ready,1.0),(islamists,1.0),(information,1.0),(sides,1.0),(send,1.0),(distributed,1.0),(money,1.0),(khartoum,1.0),(italy,1.0),(table,1.0),(vital,1.0),(reconstruction,1.0),(week,1.0)}"

#refine keywords by specific parameter
def refine_keywords(sample_keywords):
    """Filter a "{(kw,score),...}" string and return the kept keywords' ids.

    Keeps keywords that are 2-3 words long, longer than 5 characters, and
    mostly alphabetic (more letters than digits).  Each kept keyword gets a
    stable integer id from the module-level `keyword_dict` / `id_counter`
    registry (created on first sight, count incremented on every sight).

    Parameters
    ----------
    sample_keywords : str
        Serialized set of (keyword, score) pairs as produced upstream.

    Returns
    -------
    str
        Space-separated ids of the kept keywords, in input order.
    """
    global id_counter
    max_words_length = 4   # keep strictly fewer than 4 words
    min_words_length = 1   # keep strictly more than 1 word (single words dropped)
    min_char_length = 5    # keep strictly longer than 5 characters

    # Strip the set braces and a stray "000 " artifact before splitting pairs.
    sample_keywords = sample_keywords.replace('{', '').replace('}', '').replace("000 ", "")

    final_keywords = []
    for keyword_score in sample_keywords.split("),("):
        keyword_score = keyword_score.replace('(', '').replace(')', '')
        if keyword_score == "":
            continue
        # The score after the comma was never used; keep only the keyword.
        keyword = keyword_score.split(",")[0]

        if len(keyword) <= min_char_length:
            continue
        word_count = len(keyword.split(" "))
        if not (min_words_length < word_count < max_words_length):
            continue

        # Require some letters, and more letters than digits.
        digits = sum(ch.isdigit() for ch in keyword)
        alpha = sum(ch.isalpha() for ch in keyword)
        if alpha == 0 or digits >= alpha:
            continue

        # Normalize punctuation artifacts before registering the keyword.
        keyword = keyword.replace("/", "").replace("-", "").replace(" _ ", "")
        if keyword == "000":
            continue

        if keyword in keyword_dict:
            keyword_dict[keyword]['count'] += 1
        else:
            keyword_dict[keyword] = {'count': 1, 'id': id_counter}
            id_counter += 1
        final_keywords.append(keyword_dict[keyword]['id'])

    return " ".join(str(x) for x in final_keywords)

In [36]:
#refine keywords for each row, and generate the count for each keywords
# Work on an explicit copy so the raw `news` frame is not silently mutated
# through an alias.
refined_news = news.copy()
refined_news['Keywords'] = refined_news['Keywords'].apply(refine_keywords)
# BUG FIX: len(x) counted characters of the id string (e.g. "12 7" -> 4);
# the number of keywords is the number of whitespace-separated ids.
refined_news['Count'] = refined_news['Keywords'].apply(lambda x: len(x.split()))

In [37]:
# Persist the refined dataset (tab-separated, no index) for downstream jobs.
refined_news.to_csv("../data/extracted_topics_refined",sep='	',index=None)

In [17]:
#export the table csv to be used by R to mine the pattern
r_input_refined = refined_news[['CountryID','SequenceID','Count','Keywords']]
# Keep only rows that belong to a real (positive) sequence.
r_input_refined = r_input_refined[r_input_refined['SequenceID'] > 0]
# DataFrame.sort() was removed from pandas; sort_values is the replacement.
r_input_refined = r_input_refined.sort_values(['CountryID', 'SequenceID'], ascending=[True, True])

In [18]:
# Tab-separated, no header/index: the exact layout the R pattern miner expects.
r_input_refined.to_csv("r_input_refined", index=False, sep="	", header=False)

In [19]:
# Materialize the keyword registry as a two-column frame
# (column 0 = keyword string, column 1 = {'count', 'id'} dict).
keyword_list = pd.DataFrame(list(keyword_dict.items()))

In [20]:
#generate keyword id and map
# Unpack the per-keyword metadata dicts into flat 'count' and 'id' columns.
keyword_list['count'] = keyword_list[1].apply(lambda x: x['count'])
keyword_list['id'] = keyword_list[1].apply(lambda x: x['id'])
keyword_list['keyword'] = keyword_list[0]
# DataFrame.sort() was removed from pandas; sort_values is the replacement.
keyword_list = keyword_list.sort_values(['id'], ascending=True)
keyword_list_final = keyword_list[['id','keyword','count']]

In [21]:
# Most frequent keywords first (DataFrame.sort() was removed; use sort_values).
keyword_list_final = keyword_list_final.sort_values(['count'], ascending=False)

In [21]:
# Persist the id/keyword/count map, most frequent first, for later joins.
keyword_list_final.to_csv('keyword_list_sorted.csv',index=False)