In [1]:
%matplotlib inline

#import required packages
import sys
import datetime
import csv
import math
import pandas as pd 
import numpy as np 
from scipy import stats 
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from mpltools import style
from mpltools import layout
from pandas.tools.plotting import autocorrelation_plot
import json


/Library/Python/2.7/site-packages/pytz/__init__.py:29: UserWarning: Module pyparsing was already imported from /System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/pyparsing.pyc, but /Library/Python/2.7/site-packages is being added to sys.path
  from pkg_resources import resource_stream
/Library/Python/2.7/site-packages/pytz/__init__.py:29: UserWarning: Module six was already imported from /System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/six.pyc, but /Library/Python/2.7/site-packages is being added to sys.path
  from pkg_resources import resource_stream

In [2]:
#import news from pig
news = pd.read_csv('extracted_topics', sep='	', names=['CountryID', 'SequenceID', 'Timestamp','Title','Story','Keywords','Country','Region'])

In [3]:
#parse the date
news['Timestamp'] = pd.to_datetime(news['Timestamp'].str[:10], format = '%Y-%m-%d')
news['Timestamp'] = news['Timestamp'].values.astype('M8[D]')
news.head()


Out[3]:
CountryID SequenceID Timestamp Title Story Keywords Country Region
0 74 -22068 1939-08-01 World donors urge power-sharing deal for Somalia International donors urged Somalia's interim g... {(enjoyed broad public support,14.5),(host nat... Somalia ESARO
1 27 -16464 1954-12-04 NaN In Northern Bahr El Ghazal and Warrap States, ... {(northern bahr el ghazal,16.0),(increasing ci... Sudan MENA
2 27 -3866 1989-06-01 NaN A joint UN and AU team led by Jean-Marie Guehe... {(au team led,7.0),(secretary general,4.0),(je... Sudan MENA
3 60 -582 1998-05-29 NaN Eritrea on May 28, 1998, mobilised thousands o... {(border area,4.0),(reinforce troops,4.0),(mob... Eritrea ESARO
4 145 298 2000-10-26 NaN militants du FPI." {(militants du fpi,9.0)} Cote d'Ivoire WCARO

In [4]:
#for each topic, get the dates where it occurs and its country
def getDateCount(topic) :
    filteredNews = news[news['Keywords'].str.contains(topic)]
    dates = filteredNews['Timestamp'].tolist()
    datesCount = {}
    for date in dates :
        if date not in datesCount:
            datesCount[date] = 1
        else :
            datesCount[date] += 1
    return datesCount

In [8]:
keyword_dict = {}
id_counter = 0

sample_keywords = "{(enjoyed broad public support,14.5),(host nation sweden acknowledged,13.5),(international donors urged somalia,12.833333333333334),(international contact group,8.166666666666668),(rebuild nation,5.0),(formal donors,5.0),(contact group,4.666666666666667),(large swathes,4.0),(planned talks,4.0),(us-backed warlords,4.0),(interim government,4.0),(european nations,4.0),(strong signal,4.0),(african states,4.0),(capital mogadishu,4.0),(arab league,4.0),(power-sharing deal,4.0),(somalia meeting,3.8333333333333335),(sweden,2.5),(support,2.5),(somalia,2.3333333333333335),(meeting,1.5),(seized,1.0),(rome,1.0),(year,1.0),(strike,1.0),(includes,1.0),(stockholm,1.0),(give,1.0),(wanted,1.0),(hold,1.0),(led,1.0),(south,1.0),(soder,1.0),(tuesday,1.0),(ready,1.0),(islamists,1.0),(information,1.0),(sides,1.0),(send,1.0),(distributed,1.0),(money,1.0),(khartoum,1.0),(italy,1.0),(table,1.0),(vital,1.0),(reconstruction,1.0),(week,1.0)}"

def refine_keywords(sample_keywords):
    global id_counter
    max_words_length = 4
    min_words_length = 1
    min_char_length = 5
    
    keywords = {}
    sample_keywords = sample_keywords.replace('{','')
    sample_keywords = sample_keywords.replace('}','')
    sample_keywords = sample_keywords.replace("000 ","")

    keywords_list = sample_keywords.split("),(")
    
    final_keywords = []
    
    for keyword_score in keywords_list :
        
        keyword_score = keyword_score.replace('(','')
        keyword_score = keyword_score.replace(')','')
        
        if keyword_score != ("") :
            keyword = keyword_score.split(",")[0]
            score = keyword_score.split(",")[1]
        
            keyword_length = len(keyword.split(" "))
            

            if len(keyword) > min_char_length:
                if ((keyword_length < max_words_length) and (keyword_length > min_words_length)):
                    digits = 0
                    alpha = 0
                    for i in range(0, len(keyword)):
                        if keyword[i].isdigit():
                            digits += 1
                        elif keyword[i].isalpha():
                            alpha += 1
                    if alpha != 0:
                        if digits < alpha:
                            keyword = keyword.replace("/","")
                            keyword = keyword.replace("-","")
                            
                            keyword = keyword.replace(" _ ","")
                            
                            if keyword != "000":
                                if keyword in keyword_dict:
                                    keyword_dict[keyword]['count'] += 1
                                else :
                                    keyword_dict[keyword] = {}
                                    keyword_dict[keyword]['count'] = 1
                                    
                                    keyword_dict[keyword]['id'] = id_counter
                                    id_counter += 1
                                final_keywords.append(keyword_dict[keyword]['id'])
    string = " ".join(str(x) for x in final_keywords)
    
    return string

#print refine_keywords(sample_keywords)

In [9]:
refined_news = news
refined_news['Keywords'] = refined_news['Keywords'].apply(refine_keywords)

In [10]:
refined_news['Count'] = refined_news['Keywords'].apply(lambda x : len(x))
refined_news.head()


Out[10]:
CountryID SequenceID Timestamp Title Story Keywords Country Region Count
0 74 -22068 1939-08-01 World donors urge power-sharing deal for Somalia International donors urged Somalia's interim g... 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 Somalia ESARO 34
1 27 -16464 1954-12-04 NaN In Northern Bahr El Ghazal and Warrap States, ... 15 16 17 18 19 20 21 22 23 24 25 26 27 Sudan MENA 38
2 27 -3866 1989-06-01 NaN A joint UN and AU team led by Jean-Marie Guehe... 28 29 30 31 32 33 Sudan MENA 17
3 60 -582 1998-05-29 NaN Eritrea on May 28, 1998, mobilised thousands o... 34 35 36 Eritrea ESARO 8
4 145 298 2000-10-26 NaN militants du FPI." 37 Cote d'Ivoire WCARO 2

In [11]:
r_input_refined = refined_news[['CountryID','SequenceID','Count','Keywords']]
r_input_refined = r_input_refined[r_input_refined['SequenceID'] >0]
r_input_refined = r_input_refined.sort(['CountryID', 'SequenceID'], ascending=[1, 1])
r_input_refined.head()


Out[11]:
CountryID SequenceID Count Keywords
36 1 1989 35 302 303 304 305 306 307 187 308 309
79 1 2095 67 673 674 675 676 677 678 679 680 681 682 683 68...
90 1 2108 47 761 763 764 765 773 683 774 187 767 775 772 771
91 1 2108 19 776 777 778 779 780
92 1 2108 51 761 763 764 765 773 683 774 187 767 781 775 77...

In [13]:
r_input_refined.to_csv("r_input_refined", index=False, sep="	", header=False)

In [15]:
keyword_list = pd.DataFrame(keyword_dict.items()) 
keyword_list.head()


Out[15]:
0 1
0 10 refugee children {u'count': 1, u'id': 234597}
1 military order prohibiting {u'count': 1, u'id': 262355}
2 personal effects {u'count': 2, u'id': 85320}
3 signoghin area {u'count': 1, u'id': 101866}
4 transitional civilian government {u'count': 1, u'id': 215498}

In [16]:
keyword_list['count'] = keyword_list[1].apply(lambda x : x['count']) 
keyword_list['id'] = keyword_list[1].apply(lambda x : x['id']) 
keyword_list['keyword'] = keyword_list[0]
keyword_list = keyword_list.sort(['id'], ascending=1)
keyword_list_final = keyword_list[['id','keyword','count']]

In [20]:
keyword_list_final= keyword_list_final.sort(['count'], ascending=0)
keyword_list_final


Out[20]:
id keyword count
185368 74 security forces 1133
3956 3119 death toll 759
261312 585 human rights 639
169039 2938 sri lanka 611
32488 1084 heavy rains 523
202482 98 security council 519
162745 1148 prime minister 513
113378 13926 south sudan 509
17274 225 united nations 492
102555 4349 shot dead 480
53357 3047 government forces 477
105307 3205 civil war 476
249605 2606 united states 471
122916 2873 international community 408
190347 2030 peace talks 386
62016 2236 recent weeks 368
192219 3659 gaza strip 349
233683 2058 bird flu 346
144060 246 aid workers 346
188127 3968 police officers 328
216438 5943 recent months 302
21338 305 government troops 301
208618 3853 red cross 299
157241 3273 interior ministry 293
48874 3702 human rights watch 292
263464 70 burkina faso 287
237797 724 people died 283
15467 187 armed groups 281
203271 163196 syrian observatory 275
227484 351 south africa 270
... ... ... ...
61404 252805 turkistan islamic party 1
54019 252822 28eu member countries 1
128783 252823 extraordinary moldovaeu summit 1
241448 252824 visiting european council 1
253001 252825 killing carlos castano 1
127266 252826 pueblo bello village 1
142564 252827 largest paramilitary group 1
227749 252828 senior paramilitary 1
211976 252829 auc demobilised 1
148541 252830 movil 5 1
113738 252831 training includes instructions 1
264181 252832 combat tactics 1
58679 252833 french patrol stopped 1
158078 252821 ec official told 1
44120 252804 rostov area 1
147250 252788 venue location 1
148714 252803 ukraine border 1
65547 252775 reelection battle 1
6692 252776 shiite 1
261785 252777 schoolteachers 1
4897 252778 joined students 1
139445 252779 double murder 1
12210 252780 earlier tentatively agreed 1
253060 252782 small hospital 1
95814 252783 including 144 1
64753 252784 commission official 1
217070 252785 radical demonstrators 1
158638 252786 government spokesmen 1
125565 252787 government tomorrow 1
115337 69134 eve bombings 1

269013 rows × 3 columns


In [21]:
keyword_list_final.to_csv('keyword_list_sorted.csv',index=False)

In [145]:
with open('keyword_dict.json', 'w') as fp:
    json.dump(keyword_dict, fp)