In [369]:
%matplotlib inline
#import required packages
import sys
import datetime
import csv
import math
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from mpltools import style
from mpltools import layout
from pandas.tools.plotting import autocorrelation_plot
import json
In [396]:
# Load the mined news sequences: a space-separated file whose columns are named
# ID, Sequence and Confidence here.
# header=0 marks the file's first line as a header row that the explicit
# `names` replace. `header` must be an int (or list of ints) or None; a bool is
# rejected by current pandas.
# NOTE(review): older pandas releases appear to have read header=True as row
# index 1, which would also discard the first data row - confirm the row count
# against the previous output.
news_sequence = pd.read_csv(
    '../data/news_sequence_0025.csv',
    sep=' ',
    names=['ID', 'Sequence', 'Confidence'],
    header=0,
)
In [397]:
# Preview the first rows only; rendering the whole frame bloats the notebook.
news_sequence.head()
Out[397]:
In [398]:
# Remove the bracket characters "{", "}", "<" and ">" from every Sequence
# entry, one character per pass, so the value can later be split on commas.
for bracket in ("{", "}", "<", ">"):
    news_sequence['Sequence'] = news_sequence['Sequence'].apply(
        lambda text, bracket=bracket: text.replace(bracket, "")
    )
In [399]:
# Pull each column out as a plain Python list; the three lists stay aligned by
# row position.
news_id_list, news_seq_list, news_conf_list = [
    news_sequence[column].tolist()
    for column in ('ID', 'Sequence', 'Confidence')
]
# Empty containers; news_seq_nested is filled in a later cell.
news_seq_nested, news_conf_nested = [], []
In [400]:
# Split every cleaned sequence string into its list of comma-separated items.
# The list is rebuilt rather than appended to, so this cell is idempotent:
# re-running it no longer duplicates every sequence (which would inflate the
# counts in the keyword tree built from this list).
news_seq_nested = [seq.split(",") for seq in news_seq_list]
In [401]:
# Build two lookups from keyword_list.csv, both keyed by the value in column 0:
#   keyword_map: keyword ID -> column 1 (the keyword)
#   conf_map:    keyword ID -> column 2 (keyword_conf)
# No type conversion is applied; keys and values are whatever csv yields.
keyword_map = {}
conf_map = {}
# Binary mode is what the Python 2 csv module expects.
with open('keyword_list.csv', 'rb') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row; tolerate an empty file
    for row in reader:
        if not row:
            continue  # csv.reader yields [] for blank lines
        keyword_id, keyword, keyword_conf = row[0], row[1], row[2]
        # First occurrence wins when an ID is repeated.
        if keyword_id not in keyword_map:
            keyword_map[keyword_id] = keyword
            conf_map[keyword_id] = keyword_conf
def getName(keyword_id):
    """Return the entry stored in ``keyword_map`` for ``keyword_id``.

    Raises ``KeyError`` when the ID is not present in the map.
    """
    name = keyword_map[keyword_id]
    return name
def getConf(keyword_id):
    """Return the entry stored in ``conf_map`` for ``keyword_id``.

    Raises ``KeyError`` when the ID is not present in the map.
    """
    conf = conf_map[keyword_id]
    return conf
In [402]:
# Root node of the keyword tree that is written out as JSON further down.
root = '74'  # "death toll"
root_count = 0
node_id = 0
keyword_tree = {
    'name': getName(root),
    # Starts at root_count (0) and is incremented once per sequence that begins
    # with `root`. The original read the undefined name `root_conf` here, which
    # raises NameError on a fresh kernel.
    'count': root_count,
    'children_list': [],  # IDs of the second-level nodes seen so far
    'parent': 'null',
    'children': [],       # the second-level node dicts themselves
    'conf': 'null',
}
def checkChildren(child_list):
    """Return a new list holding the items of ``child_list`` in iteration order."""
    return list(child_list)
# Grow the tree from every sequence that starts with `root`, up to three levels
# deep (root -> second item -> third item).
#
# A node is identified by the ID of the sequence made of the path to it, e.g.
# the second-level node for item X is the row whose sequence is [root, X].
# Each node dict is stored twice, as in the original layout: under its ID in
# the parent dict, and in the parent's 'children' list.
#
# Fixes relative to the previous version:
#   * a second-level node's 'children_list' / 'children' are created once
#     instead of being reset on every three-item sequence (which discarded the
#     grandchildren collected so far);
#   * the "already seen" test for a third-level node compares node IDs, which
#     is what 'children_list' actually stores (it used to compare the item);
#   * a third-level node is appended only to its own parent's 'children', not
#     to every second-level node that happens to have a 'children' key.
for seq in news_seq_nested:
    if seq[0] != root:
        continue
    keyword_tree['count'] += 1
    if len(seq) < 2:
        continue

    # --- second level -----------------------------------------------------
    new_list = [root, seq[1]]
    # list.index raises ValueError if this prefix is not itself a sequence.
    row = news_seq_nested.index(new_list)
    node_id = news_id_list[row]
    conf_id = news_conf_list[row]
    if node_id not in keyword_tree['children_list']:
        keyword_tree['children_list'].append(node_id)
        keyword_tree[node_id] = {
            'name': getName(seq[1]),
            'count': 1,
            'parent': keyword_tree['name'],
            'conf': conf_id,
        }
        keyword_tree['children'].append(keyword_tree[node_id])
    else:
        keyword_tree[node_id]['count'] += 1

    if len(seq) < 3:
        continue

    # --- third level ------------------------------------------------------
    parent_node = keyword_tree[node_id]
    # Create the child containers only the first time they are needed.
    parent_node.setdefault('children_list', [])
    parent_node.setdefault('children', [])

    new_list.append(seq[2])
    row_2 = news_seq_nested.index(new_list)
    node_id_2 = news_id_list[row_2]
    conf_id_2 = news_conf_list[row_2]
    if node_id_2 not in parent_node['children_list']:
        parent_node['children_list'].append(node_id_2)
        parent_node[node_id_2] = {
            'name': getName(seq[2]),
            'count': 1,
            'parent': parent_node['name'],
            'conf': conf_id_2,
        }
        parent_node['children'].append(parent_node[node_id_2])
    else:
        parent_node[node_id_2]['count'] += 1
In [403]:
# Write the finished tree to the app directory as JSON.
with open('../app/keyword_tree.json', 'w') as out_file:
    out_file.write(json.dumps(keyword_tree))
In [326]:
In [326]:
In [ ]: