In [139]:
#*****************PROJECT YARK*********************/
#* Ariel Boris Dexter bad225@nyu.edu */
#* Kania Azrina ka1531@nyu.edu */
#* Michael Rawson mr4209 */
#* Yixue Wang yw1819@nyu.edu */
#**************************************************/
%matplotlib inline
#import required packages
import sys
import datetime
import csv
import math
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from mpltools import style
from mpltools import layout
from pandas.tools.plotting import autocorrelation_plot
import json
In [140]:
# Load the news sequence table exported from R (space-separated file).
# NOTE(review): header=True is kept exactly as written; recent pandas releases
# reject a boolean `header` -- confirm which header row was intended.
news_sequence_columns = ['ID', 'Sequence', 'Confidence']
news_sequence = pd.read_csv(
    '../data/news_sequence_0025.csv',
    sep=' ',
    names=news_sequence_columns,
    header=True
)
In [141]:
# Clean the news: strip the bracket characters { } < > from every value of the
# 'Sequence' column, one character per pass, in the same order as before.
for bracket in ("{", "}", "<", ">"):
    news_sequence['Sequence'] = news_sequence['Sequence'].apply(
        lambda text: text.replace(bracket, "")
    )
In [142]:
# Define the variables used by the cells below.

# The three columns of the sequence table as plain Python lists.
news_id_list, news_seq_list, news_conf_list = [
    news_sequence[column].tolist()
    for column in ('ID', 'Sequence', 'Confidence')
]

# Empty containers; the ones that are used get filled in by later cells.
news_seq_nested, news_conf_nested = [], []
keyword_map, conf_map = {}, {}
keyword_tree = {}

# Tree parameters and counters.
root = '74'  # can be changed
min_conf = 0.1
root_count = 0
root_conf = 0
node_id = 0
In [143]:
# Make the list of news sequences: each entry of news_seq_list is split on ","
# into its own list of items.
# The list is rebuilt from scratch (instead of appended to) so that re-running
# this cell does not duplicate every sequence in news_seq_nested.
news_seq_nested = [seq.split(",") for seq in news_seq_list]
In [144]:
# Read the keyword list file and save it to the lookup dictionaries:
# keyword_map maps row[0] -> row[1] and conf_map maps row[0] -> row[2]
# (values are stored exactly as csv.reader yields them).
with open('../data/keyword_list.csv', 'rb') as keyword_file:
    keyword_rows = csv.reader(keyword_file)
    next(keyword_rows)  # skip header
    for row in keyword_rows:
        keyword_id, keyword, keyword_conf = row[0], row[1], row[2]
        if keyword_id in keyword_map:
            # only the first row seen for a given id is kept
            continue
        keyword_map[keyword_id] = keyword
        conf_map[keyword_id] = keyword_conf
def getName(keyword_id):
    """Return the name of a specific sequence id.

    The id is looked up in the module-level keyword_map; an id that is not
    present raises KeyError.
    """
    name = keyword_map[keyword_id]
    return name
In [145]:
# Initialise the root node of the tree in place.
# NOTE(review): 'count' starts from root_conf, not root_count -- both are 0
# where they are defined, but confirm which counter was intended.
keyword_tree.update({
    'name': getName(root),
    'count': root_conf,
    'children_list': [],
    'parent': 'null',
    'children': [],
    'conf': 'null',
})
def checkChildren(child_list):
    """Return a new list holding the items of child_list in iteration order.

    Given a dictionary, the result is the list of its keys.
    """
    return list(child_list)
#build the tree from sequence
# For each sequence whose first item equals `root`, the root's 'count' is
# incremented and the items at positions 1, 2 and 3 (when present) are turned
# into nested node dicts carrying 'name', 'count', 'parent' and 'conf'.
# NOTE(review): this text export has lost the cell's indentation, so the exact
# nesting of the if/else branches below cannot be confirmed from here; the
# comments describe the individual statements only.
for seq in news_seq_nested:
pos = 0
if seq[0] == root :
keyword_tree['count'] += 1
pos += 1
if (pos < len(seq)):
# new_list is the prefix [root, seq[1], ...] built up so far; it is looked up
# below as if it were itself one of the sequences in news_seq_nested.
new_list = [root]
new_list.append(seq[pos])
# list.index raises ValueError when the prefix is not an entry of
# news_seq_nested; the same lookup is done twice (once for id, once for conf).
node_id = news_id_list[news_seq_nested.index(new_list)]
conf_id = news_conf_list[news_seq_nested.index(new_list)]
if node_id not in checkChildren(keyword_tree['children_list']) :
# a node is only created when its confidence exceeds min_conf
if (conf_id > min_conf):
keyword_tree['children_list'].append(node_id)
keyword_tree[node_id] = {}
keyword_tree[node_id]['name'] = getName(seq[pos])
keyword_tree[node_id]['count'] = 1
keyword_tree[node_id]['parent'] = keyword_tree['name']
# the same dict object is reachable as keyword_tree[node_id] and through
# the keyword_tree['children'] list
keyword_tree['children'].append(keyword_tree[node_id])
keyword_tree[node_id]['conf'] = conf_id
else :
# NOTE(review): raises KeyError if keyword_tree[node_id] was never created --
# confirm which `if` this else pairs with.
keyword_tree[node_id]['count'] += 1
pos +=1
if (pos < len(seq)):
new_list.append(seq[pos])
# NOTE(review): these two lists are reset immediately before the membership
# test below, so (as far as this flattened export shows) checkChildren is
# given an empty list there -- verify this is intended.
keyword_tree[node_id]['children_list'] = []
keyword_tree[node_id]['children'] = []
node_id_2 = news_id_list[news_seq_nested.index(new_list)]
conf_id_2 = news_conf_list[news_seq_nested.index(new_list)]
# NOTE(review): the test uses the keyword id seq[pos], while the list is
# filled with node ids (node_id_2) below -- confirm.
if seq[pos] not in checkChildren(keyword_tree[node_id]['children_list']) :
if (conf_id_2 > min_conf):
keyword_tree[node_id]['children_list'].append(node_id_2)
keyword_tree[node_id][node_id_2] = {}
keyword_tree[node_id][node_id_2]['name'] = getName(seq[pos])
keyword_tree[node_id][node_id_2]['count'] = 1
keyword_tree[node_id][node_id_2]['parent'] = keyword_tree[node_id]['name']
keyword_tree[node_id][node_id_2]['conf'] = conf_id_2
# Appends the new node to the 'children' list of every entry of
# keyword_tree['children'] that has a 'children' key.
# NOTE(review): presumably only the parent node was meant -- confirm.
for idx in keyword_tree['children']:
for key in idx :
if key == 'children':
idx['children'].append(keyword_tree[node_id][node_id_2])
else :
keyword_tree[node_id][node_id_2]['count'] +=1
pos +=1
if (pos<len(seq)):
new_list.append(seq[pos])
keyword_tree[node_id][node_id_2]['children_list'] = []
keyword_tree[node_id][node_id_2]['children'] = []
node_id_3 = news_id_list[news_seq_nested.index(new_list)]
conf_id_3 = news_conf_list[news_seq_nested.index(new_list)]
if seq[pos] not in checkChildren(keyword_tree[node_id][node_id_2]['children_list']) :
if (conf_id_3 > min_conf):
# NOTE(review): node_id_3 is appended to keyword_tree[node_id]['children_list'],
# whereas the test above reads keyword_tree[node_id][node_id_2]['children_list']
# -- confirm which list was intended.
keyword_tree[node_id]['children_list'].append(node_id_3)
keyword_tree[node_id][node_id_2][node_id_3] = {}
keyword_tree[node_id][node_id_2][node_id_3]['name'] = getName(seq[pos])
keyword_tree[node_id][node_id_2][node_id_3]['count'] = 1
keyword_tree[node_id][node_id_2][node_id_3]['parent'] = keyword_tree[node_id][node_id_2]['name']
keyword_tree[node_id][node_id_2][node_id_3]['conf'] = conf_id_3
else :
# NOTE(review): raises KeyError unless this third-level node already exists.
keyword_tree[node_id][node_id_2][node_id_3]['count'] +=1
In [146]:
# Save the finished keyword tree as JSON under the app directory.
with open('../app/keyword_tree.json', 'w') as tree_file:
    json.dump(keyword_tree, tree_file)
In [122]:
In [ ]: