notebook.community

Edit and run



In [1]:

    
%matplotlib inline

#import required packages
import sys
import datetime
import csv
import math
import pandas as pd 
import numpy as np 
from scipy import stats 
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from mpltools import style
from mpltools import layout
from pandas.tools.plotting import autocorrelation_plot
import json









    



/Library/Python/2.7/site-packages/pytz/__init__.py:29: UserWarning: Module pyparsing was already imported from /System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/pyparsing.pyc, but /Library/Python/2.7/site-packages is being added to sys.path
  from pkg_resources import resource_stream
/Library/Python/2.7/site-packages/pytz/__init__.py:29: UserWarning: Module six was already imported from /System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/six.pyc, but /Library/Python/2.7/site-packages is being added to sys.path
  from pkg_resources import resource_stream



In [2]:

    
#import news from pig
news = pd.read_csv('extracted_topics', sep='	', names=['CountryID', 'SequenceID', 'Timestamp','Title','Story','Keywords','Country','Region'])



In [3]:

    
#parse the date
news['Timestamp'] = pd.to_datetime(news['Timestamp'].str[:10], format = '%Y-%m-%d')
news['Timestamp'] = news['Timestamp'].values.astype('M8[D]')
news.head()









    Out[3]:






  
    
      
      CountryID
      SequenceID
      Timestamp
      Title
      Story
      Keywords
      Country
      Region
    
  
  
    
      0
        74
      -22068
      1939-08-01
       World donors urge power-sharing deal for Somalia
       International donors urged Somalia's interim g...
       {(enjoyed broad public support,14.5),(host nat...
             Somalia
       ESARO
    
    
      1
        27
      -16464
      1954-12-04
                                                    NaN
       In Northern Bahr El Ghazal and Warrap States, ...
       {(northern bahr el ghazal,16.0),(increasing ci...
               Sudan
        MENA
    
    
      2
        27
       -3866
      1989-06-01
                                                    NaN
       A joint UN and AU team led by Jean-Marie Guehe...
       {(au team led,7.0),(secretary general,4.0),(je...
               Sudan
        MENA
    
    
      3
        60
        -582
      1998-05-29
                                                    NaN
       Eritrea on May 28, 1998, mobilised thousands o...
       {(border area,4.0),(reinforce troops,4.0),(mob...
             Eritrea
       ESARO
    
    
      4
       145
         298
      2000-10-26
                                                    NaN
                                      militants du FPI."
                                {(militants du fpi,9.0)}
       Cote d'Ivoire
       WCARO



In [4]:

    
#for each topic, get the dates where it occurs and its country
def getDateCount(topic) :
    filteredNews = news[news['Keywords'].str.contains(topic)]
    dates = filteredNews['Timestamp'].tolist()
    datesCount = {}
    for date in dates :
        if date not in datesCount:
            datesCount[date] = 1
        else :
            datesCount[date] += 1
    return datesCount



In [8]:

    
keyword_dict = {}
id_counter = 0

sample_keywords = "{(enjoyed broad public support,14.5),(host nation sweden acknowledged,13.5),(international donors urged somalia,12.833333333333334),(international contact group,8.166666666666668),(rebuild nation,5.0),(formal donors,5.0),(contact group,4.666666666666667),(large swathes,4.0),(planned talks,4.0),(us-backed warlords,4.0),(interim government,4.0),(european nations,4.0),(strong signal,4.0),(african states,4.0),(capital mogadishu,4.0),(arab league,4.0),(power-sharing deal,4.0),(somalia meeting,3.8333333333333335),(sweden,2.5),(support,2.5),(somalia,2.3333333333333335),(meeting,1.5),(seized,1.0),(rome,1.0),(year,1.0),(strike,1.0),(includes,1.0),(stockholm,1.0),(give,1.0),(wanted,1.0),(hold,1.0),(led,1.0),(south,1.0),(soder,1.0),(tuesday,1.0),(ready,1.0),(islamists,1.0),(information,1.0),(sides,1.0),(send,1.0),(distributed,1.0),(money,1.0),(khartoum,1.0),(italy,1.0),(table,1.0),(vital,1.0),(reconstruction,1.0),(week,1.0)}"

def refine_keywords(sample_keywords):
    global id_counter
    max_words_length = 4
    min_words_length = 1
    min_char_length = 5
    
    keywords = {}
    sample_keywords = sample_keywords.replace('{','')
    sample_keywords = sample_keywords.replace('}','')
    sample_keywords = sample_keywords.replace("000 ","")

    keywords_list = sample_keywords.split("),(")
    
    final_keywords = []
    
    for keyword_score in keywords_list :
        
        keyword_score = keyword_score.replace('(','')
        keyword_score = keyword_score.replace(')','')
        
        if keyword_score != ("") :
            keyword = keyword_score.split(",")[0]
            score = keyword_score.split(",")[1]
        
            keyword_length = len(keyword.split(" "))
            

            if len(keyword) > min_char_length:
                if ((keyword_length < max_words_length) and (keyword_length > min_words_length)):
                    digits = 0
                    alpha = 0
                    for i in range(0, len(keyword)):
                        if keyword[i].isdigit():
                            digits += 1
                        elif keyword[i].isalpha():
                            alpha += 1
                    if alpha != 0:
                        if digits < alpha:
                            keyword = keyword.replace("/","")
                            keyword = keyword.replace("-","")
                            
                            keyword = keyword.replace(" _ ","")
                            
                            if keyword != "000":
                                if keyword in keyword_dict:
                                    keyword_dict[keyword]['count'] += 1
                                else :
                                    keyword_dict[keyword] = {}
                                    keyword_dict[keyword]['count'] = 1
                                    
                                    keyword_dict[keyword]['id'] = id_counter
                                    id_counter += 1
                                final_keywords.append(keyword_dict[keyword]['id'])
    string = " ".join(str(x) for x in final_keywords)
    
    return string

#print refine_keywords(sample_keywords)



In [9]:

    
refined_news = news
refined_news['Keywords'] = refined_news['Keywords'].apply(refine_keywords)



In [10]:

    
refined_news['Count'] = refined_news['Keywords'].apply(lambda x : len(x))
refined_news.head()









    Out[10]:






  
    
      
      CountryID
      SequenceID
      Timestamp
      Title
      Story
      Keywords
      Country
      Region
      Count
    
  
  
    
      0
        74
      -22068
      1939-08-01
       World donors urge power-sharing deal for Somalia
       International donors urged Somalia's interim g...
           0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
             Somalia
       ESARO
       34
    
    
      1
        27
      -16464
      1954-12-04
                                                    NaN
       In Northern Bahr El Ghazal and Warrap States, ...
       15 16 17 18 19 20 21 22 23 24 25 26 27
               Sudan
        MENA
       38
    
    
      2
        27
       -3866
      1989-06-01
                                                    NaN
       A joint UN and AU team led by Jean-Marie Guehe...
                            28 29 30 31 32 33
               Sudan
        MENA
       17
    
    
      3
        60
        -582
      1998-05-29
                                                    NaN
       Eritrea on May 28, 1998, mobilised thousands o...
                                     34 35 36
             Eritrea
       ESARO
        8
    
    
      4
       145
         298
      2000-10-26
                                                    NaN
                                      militants du FPI."
                                           37
       Cote d'Ivoire
       WCARO
        2



In [11]:

    
r_input_refined = refined_news[['CountryID','SequenceID','Count','Keywords']]
r_input_refined = r_input_refined[r_input_refined['SequenceID'] >0]
r_input_refined = r_input_refined.sort(['CountryID', 'SequenceID'], ascending=[1, 1])
r_input_refined.head()









    Out[11]:






  
    
      
      CountryID
      SequenceID
      Count
      Keywords
    
  
  
    
      36
       1
       1989
       35
                     302 303 304 305 306 307 187 308 309
    
    
      79
       1
       2095
       67
       673 674 675 676 677 678 679 680 681 682 683 68...
    
    
      90
       1
       2108
       47
         761 763 764 765 773 683 774 187 767 775 772 771
    
    
      91
       1
       2108
       19
                                     776 777 778 779 780
    
    
      92
       1
       2108
       51
       761 763 764 765 773 683 774 187 767 781 775 77...



In [13]:

    
r_input_refined.to_csv("r_input_refined", index=False, sep="	", header=False)



In [15]:

    
keyword_list = pd.DataFrame(keyword_dict.items()) 
keyword_list.head()









    Out[15]:






  
    
      
      0
      1
    
  
  
    
      0
                    10 refugee children
       {u'count': 1, u'id': 234597}
    
    
      1
             military order prohibiting
       {u'count': 1, u'id': 262355}
    
    
      2
                       personal effects
        {u'count': 2, u'id': 85320}
    
    
      3
                         signoghin area
       {u'count': 1, u'id': 101866}
    
    
      4
       transitional civilian government
       {u'count': 1, u'id': 215498}



In [16]:

    
keyword_list['count'] = keyword_list[1].apply(lambda x : x['count']) 
keyword_list['id'] = keyword_list[1].apply(lambda x : x['id']) 
keyword_list['keyword'] = keyword_list[0]
keyword_list = keyword_list.sort(['id'], ascending=1)
keyword_list_final = keyword_list[['id','keyword','count']]



In [20]:

    
keyword_list_final= keyword_list_final.sort(['count'], ascending=0)
keyword_list_final









    Out[20]:






  
    
      
      id
      keyword
      count
    
  
  
    
      185368
           74
                      security forces
       1133
    
    
      3956  
         3119
                           death toll
        759
    
    
      261312
          585
                         human rights
        639
    
    
      169039
         2938
                            sri lanka
        611
    
    
      32488 
         1084
                          heavy rains
        523
    
    
      202482
           98
                     security council
        519
    
    
      162745
         1148
                       prime minister
        513
    
    
      113378
        13926
                          south sudan
        509
    
    
      17274 
          225
                       united nations
        492
    
    
      102555
         4349
                            shot dead
        480
    
    
      53357 
         3047
                    government forces
        477
    
    
      105307
         3205
                            civil war
        476
    
    
      249605
         2606
                        united states
        471
    
    
      122916
         2873
              international community
        408
    
    
      190347
         2030
                          peace talks
        386
    
    
      62016 
         2236
                         recent weeks
        368
    
    
      192219
         3659
                           gaza strip
        349
    
    
      233683
         2058
                             bird flu
        346
    
    
      144060
          246
                          aid workers
        346
    
    
      188127
         3968
                      police officers
        328
    
    
      216438
         5943
                        recent months
        302
    
    
      21338 
          305
                    government troops
        301
    
    
      208618
         3853
                            red cross
        299
    
    
      157241
         3273
                    interior ministry
        293
    
    
      48874 
         3702
                   human rights watch
        292
    
    
      263464
           70
                         burkina faso
        287
    
    
      237797
          724
                          people died
        283
    
    
      15467 
          187
                         armed groups
        281
    
    
      203271
       163196
                   syrian observatory
        275
    
    
      227484
          351
                         south africa
        270
    
    
      ...
      ...
      ...
      ...
    
    
      61404 
       252805
              turkistan islamic party
          1
    
    
      54019 
       252822
                28eu member countries
          1
    
    
      128783
       252823
       extraordinary moldovaeu summit
          1
    
    
      241448
       252824
            visiting european council
          1
    
    
      253001
       252825
               killing carlos castano
          1
    
    
      127266
       252826
                 pueblo bello village
          1
    
    
      142564
       252827
           largest paramilitary group
          1
    
    
      227749
       252828
                  senior paramilitary
          1
    
    
      211976
       252829
                      auc demobilised
          1
    
    
      148541
       252830
                              movil 5
          1
    
    
      113738
       252831
       training includes instructions
          1
    
    
      264181
       252832
                       combat tactics
          1
    
    
      58679 
       252833
                french patrol stopped
          1
    
    
      158078
       252821
                     ec official told
          1
    
    
      44120 
       252804
                          rostov area
          1
    
    
      147250
       252788
                       venue location
          1
    
    
      148714
       252803
                       ukraine border
          1
    
    
      65547 
       252775
                    reelection battle
          1
    
    
      6692  
       252776
                              shiite 
          1
    
    
      261785
       252777
                      schoolteachers 
          1
    
    
      4897  
       252778
                      joined students
          1
    
    
      139445
       252779
                        double murder
          1
    
    
      12210 
       252780
           earlier tentatively agreed
          1
    
    
      253060
       252782
                       small hospital
          1
    
    
      95814 
       252783
                        including 144
          1
    
    
      64753 
       252784
                  commission official
          1
    
    
      217070
       252785
                radical demonstrators
          1
    
    
      158638
       252786
                 government spokesmen
          1
    
    
      125565
       252787
                  government tomorrow
          1
    
    
      115337
        69134
                         eve bombings
          1
    
  

269013 rows × 3 columns



In [21]:

    
keyword_list_final.to_csv('keyword_list_sorted.csv',index=False)



In [145]:

    
with open('keyword_dict.json', 'w') as fp:
    json.dump(keyword_dict, fp)

	CountryID	SequenceID	Timestamp	Title	Story	Keywords	Country	Region
0	74	-22068	1939-08-01	World donors urge power-sharing deal for Somalia	International donors urged Somalia's interim g...	{(enjoyed broad public support,14.5),(host nat...	Somalia	ESARO
1	27	-16464	1954-12-04	NaN	In Northern Bahr El Ghazal and Warrap States, ...	{(northern bahr el ghazal,16.0),(increasing ci...	Sudan	MENA
2	27	-3866	1989-06-01	NaN	A joint UN and AU team led by Jean-Marie Guehe...	{(au team led,7.0),(secretary general,4.0),(je...	Sudan	MENA
3	60	-582	1998-05-29	NaN	Eritrea on May 28, 1998, mobilised thousands o...	{(border area,4.0),(reinforce troops,4.0),(mob...	Eritrea	ESARO
4	145	298	2000-10-26	NaN	militants du FPI."	{(militants du fpi,9.0)}	Cote d'Ivoire	WCARO

	CountryID	SequenceID	Count	Keywords
36	1	1989	35	302 303 304 305 306 307 187 308 309
79	1	2095	67	673 674 675 676 677 678 679 680 681 682 683 68...
90	1	2108	47	761 763 764 765 773 683 774 187 767 775 772 771
91	1	2108	19	776 777 778 779 780
92	1	2108	51	761 763 764 765 773 683 774 187 767 781 775 77...

	0	1
0	10 refugee children	{u'count': 1, u'id': 234597}
1	military order prohibiting	{u'count': 1, u'id': 262355}
2	personal effects	{u'count': 2, u'id': 85320}
3	signoghin area	{u'count': 1, u'id': 101866}
4	transitional civilian government	{u'count': 1, u'id': 215498}

	id	keyword	count
185368	74	security forces	1133
3956	3119	death toll	759
261312	585	human rights	639
169039	2938	sri lanka	611
32488	1084	heavy rains	523
202482	98	security council	519
162745	1148	prime minister	513
113378	13926	south sudan	509
17274	225	united nations	492
102555	4349	shot dead	480
53357	3047	government forces	477
105307	3205	civil war	476
249605	2606	united states	471
122916	2873	international community	408
190347	2030	peace talks	386
62016	2236	recent weeks	368
192219	3659	gaza strip	349
233683	2058	bird flu	346
144060	246	aid workers	346
188127	3968	police officers	328
216438	5943	recent months	302
21338	305	government troops	301
208618	3853	red cross	299
157241	3273	interior ministry	293
48874	3702	human rights watch	292
263464	70	burkina faso	287
237797	724	people died	283
15467	187	armed groups	281
203271	163196	syrian observatory	275
227484	351	south africa	270
...	...	...	...
61404	252805	turkistan islamic party	1
54019	252822	28eu member countries	1
128783	252823	extraordinary moldovaeu summit	1
241448	252824	visiting european council	1
253001	252825	killing carlos castano	1
127266	252826	pueblo bello village	1
142564	252827	largest paramilitary group	1
227749	252828	senior paramilitary	1
211976	252829	auc demobilised	1
148541	252830	movil 5	1
113738	252831	training includes instructions	1
264181	252832	combat tactics	1
58679	252833	french patrol stopped	1
158078	252821	ec official told	1
44120	252804	rostov area	1
147250	252788	venue location	1
148714	252803	ukraine border	1
65547	252775	reelection battle	1
6692	252776	shiite	1
261785	252777	schoolteachers	1
4897	252778	joined students	1
139445	252779	double murder	1
12210	252780	earlier tentatively agreed	1
253060	252782	small hospital	1
95814	252783	including 144	1
64753	252784	commission official	1
217070	252785	radical demonstrators	1
158638	252786	government spokesmen	1
125565	252787	government tomorrow	1
115337	69134	eve bombings	1