In [1]:
%matplotlib inline
#import required packages
import sys
import datetime
import csv
import math
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from mpltools import style
from mpltools import layout
from pandas.tools.plotting import autocorrelation_plot
import json
In [2]:
# Load the extracted-topics file exported from Pig. Column names are
# supplied explicitly through `names`.
news = pd.read_csv(
    'extracted_topics',
    sep=' ',
    names=[
        'CountryID', 'SequenceID', 'Timestamp', 'Title',
        'Story', 'Keywords', 'Country', 'Region',
    ],
)
In [3]:
# Parse the date: only the first 10 characters (YYYY-MM-DD) of each raw
# timestamp string are used, then the values are cast to day resolution.
news['Timestamp'] = pd.to_datetime(
    news['Timestamp'].str.slice(0, 10),
    format='%Y-%m-%d',
)
news['Timestamp'] = news['Timestamp'].values.astype('M8[D]')
news.head()
Out[3]:
In [4]:
# For a topic, count on which dates it occurs in the news keywords.
def getDateCount(topic, regex=True):
    """Count, per date, the rows of the global ``news`` frame that mention ``topic``.

    Parameters
    ----------
    topic : str
        Pattern searched for inside ``news['Keywords']``.
    regex : bool, optional
        Passed to ``Series.str.contains``. The default (True) keeps the
        original behaviour of treating ``topic`` as a regular expression;
        pass False to match topics containing characters such as ``+`` or
        ``(`` literally.

    Returns
    -------
    dict
        Maps each ``news['Timestamp']`` value of a matching row to the number
        of matching rows carrying that value. Empty when nothing matches.
    """
    # na=False: rows whose keywords are missing count as "no match". Without
    # it the mask contains NaN and cannot be used for boolean indexing.
    matches = news['Keywords'].str.contains(topic, regex=regex, na=False)
    datesCount = {}
    for date in news.loc[matches, 'Timestamp'].tolist():
        datesCount[date] = datesCount.get(date, 0) + 1
    return datesCount
In [8]:
# Global keyword registry filled by refine_keywords():
#   keyword -> {'count': times the keyword was seen, 'id': integer id}
keyword_dict = dict()
# Next id to hand out to a keyword that is not registered yet.
id_counter = 0
# One example of a raw keyword string in the "{(keyword,score),...}" layout,
# handy for trying refine_keywords() by hand.
sample_keywords = (
    "{(enjoyed broad public support,14.5),"
    "(host nation sweden acknowledged,13.5),"
    "(international donors urged somalia,12.833333333333334),"
    "(international contact group,8.166666666666668),"
    "(rebuild nation,5.0),(formal donors,5.0),(contact group,4.666666666666667),"
    "(large swathes,4.0),(planned talks,4.0),(us-backed warlords,4.0),"
    "(interim government,4.0),(european nations,4.0),(strong signal,4.0),"
    "(african states,4.0),(capital mogadishu,4.0),(arab league,4.0),"
    "(power-sharing deal,4.0),(somalia meeting,3.8333333333333335),"
    "(sweden,2.5),(support,2.5),(somalia,2.3333333333333335),(meeting,1.5),"
    "(seized,1.0),(rome,1.0),(year,1.0),(strike,1.0),(includes,1.0),"
    "(stockholm,1.0),(give,1.0),(wanted,1.0),(hold,1.0),(led,1.0),"
    "(south,1.0),(soder,1.0),(tuesday,1.0),(ready,1.0),(islamists,1.0),"
    "(information,1.0),(sides,1.0),(send,1.0),(distributed,1.0),(money,1.0),"
    "(khartoum,1.0),(italy,1.0),(table,1.0),(vital,1.0),(reconstruction,1.0),"
    "(week,1.0)}"
)
def refine_keywords(sample_keywords):
    """Turn one raw keyword string into a space-separated string of keyword ids.

    The input is expected to look like ``"{(keyword,score),(keyword,score),...}"``.
    A keyword is kept only if it has more than 5 characters, consists of 2 or
    3 space-separated words, contains at least one letter and has more letters
    than digits. Kept keywords have "/", "-" and " _ " removed before they are
    looked up.

    Side effects: every kept keyword is registered in the global
    ``keyword_dict`` (``{'count': ..., 'id': ...}``); new keywords receive the
    current value of the global ``id_counter``, which is then incremented.

    Parameters
    ----------
    sample_keywords : str
        Raw keyword string. ``None`` or a float (pandas uses NaN for a missing
        cell) is treated as "no keywords".

    Returns
    -------
    str
        Ids of the kept keywords, in input order, joined by single spaces.
        Empty string when nothing is kept.
    """
    global id_counter
    # All three thresholds are EXCLUSIVE bounds: word count must be strictly
    # between min and max (i.e. 2 or 3), and length strictly above 5.
    max_words_length = 4
    min_words_length = 1
    min_char_length = 5

    # Missing cells would otherwise fail on .replace() with AttributeError.
    if sample_keywords is None or isinstance(sample_keywords, float):
        return ""

    cleaned = sample_keywords.replace('{', '').replace('}', '')
    # Presumably drops thousands groups such as the "000 " in "10,000 people",
    # whose comma would collide with the keyword/score separator - TODO confirm.
    cleaned = cleaned.replace("000 ", "")

    final_keywords = []
    for keyword_score in cleaned.split("),("):
        keyword_score = keyword_score.replace('(', '').replace(')', '')
        if keyword_score == "":
            continue

        # Only the text before the first comma is used; the score is ignored,
        # so an entry without a score is accepted instead of raising IndexError.
        keyword = keyword_score.split(",")[0]

        if len(keyword) <= min_char_length:
            continue
        word_count = len(keyword.split(" "))
        if not (min_words_length < word_count < max_words_length):
            continue

        digits = sum(1 for ch in keyword if ch.isdigit())
        alpha = sum(1 for ch in keyword if ch.isalpha())
        if alpha == 0 or digits >= alpha:
            continue

        keyword = keyword.replace("/", "").replace("-", "").replace(" _ ", "")

        entry = keyword_dict.get(keyword)
        if entry is None:
            entry = {'count': 0, 'id': id_counter}
            keyword_dict[keyword] = entry
            id_counter += 1
        entry['count'] += 1
        final_keywords.append(entry['id'])

    return " ".join(str(x) for x in final_keywords)
#print refine_keywords(sample_keywords)
In [9]:
# Work on a copy: a plain `refined_news = news` is only an alias, so the raw
# keyword strings in `news` would be overwritten by their id strings and this
# cell could not be re-run on the original data.
# Note: refine_keywords() still updates the global keyword_dict on every run.
refined_news = news.copy()
refined_news['Keywords'] = refined_news['Keywords'].apply(refine_keywords)
In [10]:
# Number of keyword ids per story. `Keywords` holds space-separated ids, so
# count the tokens; len() on the raw string would count characters instead.
refined_news['Count'] = refined_news['Keywords'].apply(lambda ids: len(ids.split()))
refined_news.head()
Out[10]:
In [11]:
# Select the columns written to the R input file, keep rows with a positive
# SequenceID and order them by country, then by sequence.
r_input_refined = refined_news[['CountryID', 'SequenceID', 'Count', 'Keywords']]
r_input_refined = r_input_refined[r_input_refined['SequenceID'] > 0]
# DataFrame.sort() was deprecated and later removed; sort_values() is its
# replacement for sorting by column values.
r_input_refined = r_input_refined.sort_values(['CountryID', 'SequenceID'], ascending=[True, True])
r_input_refined.head()
Out[11]:
In [13]:
# Write the frame as a space-separated file without header or index column.
# NOTE(review): `Keywords` itself contains spaces, so the writer may quote
# that field - confirm the consumer of this file copes with that.
r_input_refined.to_csv(
    path_or_buf="r_input_refined",
    sep=" ",
    header=False,
    index=False,
)
In [15]:
# One row per registered keyword: column 0 is the keyword, column 1 its
# {'count', 'id'} record. list() materialises the items so the constructor
# also accepts them on Python 3, where dict.items() is a view, not a list.
keyword_list = pd.DataFrame(list(keyword_dict.items()))
keyword_list.head()
Out[15]:
In [16]:
# Unpack the per-keyword record (column 1) into flat columns and give the
# keyword text (column 0) a readable name.
keyword_list['count'] = keyword_list[1].apply(lambda record: record['count'])
keyword_list['id'] = keyword_list[1].apply(lambda record: record['id'])
keyword_list['keyword'] = keyword_list[0]
# DataFrame.sort() was deprecated and later removed; use sort_values().
keyword_list = keyword_list.sort_values(['id'], ascending=True)
keyword_list_final = keyword_list[['id', 'keyword', 'count']]
In [20]:
# Most frequent keywords first. DataFrame.sort() was deprecated and later
# removed; sort_values() is its replacement.
keyword_list_final = keyword_list_final.sort_values(['count'], ascending=False)
keyword_list_final
Out[20]:
In [21]:
# Save the keyword table (id, keyword, count) without the index column.
keyword_list_final.to_csv(
    path_or_buf='keyword_list_sorted.csv',
    index=False,
)
In [145]:
# Persist the keyword registry (keyword -> {'count', 'id'}) as JSON.
with open('keyword_dict.json', 'w') as json_file:
    json.dump(keyword_dict, fp=json_file)