In [369]:
%matplotlib inline

#import required packages
import sys
import datetime
import csv
import math
import pandas as pd 
import numpy as np 
from scipy import stats 
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from mpltools import style
from mpltools import layout
from pandas.tools.plotting import autocorrelation_plot
import json

In [396]:
news_sequence = pd.read_csv('../data/news_sequence_0025.csv', sep='	', names=['ID', 'Sequence', 'Confidence'], header=True)

In [397]:
news_sequence


Out[397]:
ID Sequence Confidence
0 2 <{10}> 0.035714
1 3 <{100}> 0.190476
2 4 <{100007}> 0.035714
3 5 <{100021}> 0.023810
4 6 <{100043}> 0.023810
5 7 <{100048}> 0.023810
6 8 <{100058}> 0.047619
7 9 <{10006}> 0.077381
8 10 <{10007}> 0.130952
9 11 <{10008}> 0.041667
10 12 <{100080}> 0.023810
11 13 <{1001}> 0.077381
12 14 <{100119}> 0.029762
13 15 <{10020}> 0.023810
14 16 <{10023}> 0.261905
15 17 <{10026}> 0.071429
16 18 <{100261}> 0.023810
17 19 <{100262}> 0.023810
18 20 <{100270}> 0.047619
19 21 <{100320}> 0.041667
20 22 <{10033}> 0.059524
21 23 <{100349}> 0.047619
22 24 <{10035}> 0.035714
23 25 <{100352}> 0.023810
24 26 <{10036}> 0.190476
25 27 <{100389}> 0.023810
26 28 <{10039}> 0.071429
27 29 <{1004}> 0.113095
28 30 <{10040}> 0.184524
29 31 <{100409}> 0.023810
... ... ... ...
25218 25220 <{585},{10153}> 0.023810
25219 25221 <{74},{10153}> 0.047619
25220 25222 <{98},{10153}> 0.029762
25221 25223 <{74},{74},{10153}> 0.023810
25222 25224 <{305},{305},{10153}> 0.023810
25223 25225 <{3047},{3047},{10153}> 0.029762
25224 25226 <{305},{3047},{10153}> 0.029762
25225 25227 <{2118},{10144}> 0.023810
25226 25228 <{98},{10142}> 0.023810
25227 25229 <{1012},{1012}> 0.029762
25228 25230 <{16242},{1012}> 0.023810
25229 25231 <{2873},{1012}> 0.023810
25230 25232 <{8493},{1012}> 0.023810
25231 25233 <{3119},{10090}> 0.023810
25232 25234 <{5943},{100610}> 0.023810
25233 25235 <{3119},{10040}> 0.023810
25234 25236 <{3119},{10036}> 0.023810
25235 25237 <{3205},{10036}> 0.023810
25236 25238 <{585},{10036}> 0.023810
25237 25239 <{74},{10036}> 0.041667
25238 25240 <{225},{10023}> 0.023810
25239 25241 <{2606},{10023}> 0.023810
25240 25242 <{3047},{10023}> 0.023810
25241 25243 <{3205},{10023}> 0.035714
25242 25244 <{3545},{10023}> 0.023810
25243 25245 <{4349},{10023}> 0.029762
25244 25246 <{225},{10007}> 0.029762
25245 25247 <{225},{100}> 0.023810
25246 25248 <{2606},{100}> 0.029762
25247 25249 <{74},{100}> 0.023810

25248 rows × 3 columns


In [398]:
news_sequence['Sequence'] = news_sequence['Sequence'].apply(lambda x : x.replace("{",""))
news_sequence['Sequence'] = news_sequence['Sequence'].apply(lambda x : x.replace("}",""))
news_sequence['Sequence'] = news_sequence['Sequence'].apply(lambda x : x.replace("<",""))
news_sequence['Sequence'] = news_sequence['Sequence'].apply(lambda x : x.replace(">",""))

In [399]:
news_id_list = news_sequence['ID'].tolist()
news_seq_list = news_sequence['Sequence'].tolist()
news_conf_list = news_sequence['Confidence'].tolist()
news_seq_nested = []
news_conf_nested = []

In [400]:
for seq in news_seq_list :
    seq = seq.split(",")
    news_seq_nested.append(seq)

In [401]:
keyword_map = {}
conf_map = {}

with open('keyword_list.csv', 'rb') as f:
    reader = csv.reader(f)
    reader.next() #skipheader
    
    for row in reader:
        keyword_id = row[0]
        keyword_conf=row[2]
        keyword=row[1]
        
        if keyword_id not in keyword_map :
            keyword_map[keyword_id] = keyword
            conf_map[keyword_id] = keyword_conf
            
def getName(keyword_id):
    return keyword_map[keyword_id]

def getConf(keyword_id):
    return conf_map[keyword_id]

In [402]:
keyword_tree = {}
root = '74' #"death toll"
root_count = 0
node_id = 0

keyword_tree = {}
keyword_tree['name'] = getName(root)
keyword_tree['count'] = root_conf
keyword_tree['children_list'] = []
keyword_tree['parent'] = 'null'
keyword_tree['children'] = []
keyword_tree['conf'] = 'null'

def checkChildren(child_list):
    keyword_list = []
    for key in child_list:    
        keyword_list.append(key)
    
    return keyword_list
        

for seq in news_seq_nested:
    pos = 0
    if seq[0] == root :
        keyword_tree['count'] += 1     
        pos += 1
        if (pos < len(seq)):
            new_list = [root]
            new_list.append(seq[pos])
            node_id = news_id_list[news_seq_nested.index(new_list)]  
            conf_id = news_conf_list[news_seq_nested.index(new_list)]  
            #print seq[pos], checkChildren(keyword_tree[0]['children'])
            if node_id not in checkChildren(keyword_tree['children_list']):  
                keyword_tree['children_list'].append(node_id)
                keyword_tree[node_id] = {}
                keyword_tree[node_id]['name'] = getName(seq[pos])
                keyword_tree[node_id]['count'] = 1
                keyword_tree[node_id]['parent'] = keyword_tree['name']
                keyword_tree['children'].append(keyword_tree[node_id])
                keyword_tree[node_id]['conf'] = conf_id
            else :
                #print keyword_tree[0][node_id]
                keyword_tree[node_id]['count'] += 1
                pos +=1
                #check the third
                if (pos < len(seq)):
                    new_list.append(seq[pos])
                    keyword_tree[node_id]['children_list'] = []
                    keyword_tree[node_id]['children'] = []
                    node_id_2 = news_id_list[news_seq_nested.index(new_list)]
                    conf_id_2 = news_conf_list[news_seq_nested.index(new_list)]
                    if seq[pos] not in checkChildren(keyword_tree[node_id]['children_list']):       
                        keyword_tree[node_id]['children_list'].append(node_id_2)
                        keyword_tree[node_id][node_id_2] = {}
                        keyword_tree[node_id][node_id_2]['name'] = getName(seq[pos])
                        keyword_tree[node_id][node_id_2]['count'] = 1
                        keyword_tree[node_id][node_id_2]['parent'] = keyword_tree[node_id]['name']
                        keyword_tree[node_id][node_id_2]['conf'] = conf_id_2
                        
                        
                        #keyword_tree[0]['children'].append(keyword_tree[0][node_id][node_id_2])
                        for idx in keyword_tree['children']:
                            for key in idx :
                                if key == 'children':
                                    idx['children'].append(keyword_tree[node_id][node_id_2])
                                    
                    else :
                        #check the third
                        keyword_tree[node_id][node_id_2]['count'] +=1

In [403]:
with open('../app/keyword_tree.json', 'w') as fp:
    json.dump(keyword_tree, fp)

In [326]:


In [326]:


In [ ]: