In [139]:
#*****************PROJECT YARK*********************/
#* Ariel Boris Dexter bad225@nyu.edu */
#* Kania Azrina ka1531@nyu.edu       */
#* Michael Rawson mr4209             */
#* Yixue Wang yw1819@nyu.edu         */
#**************************************************/

%matplotlib inline

#import required packages
import sys
import datetime
import csv
import math
import pandas as pd 
import numpy as np 
from scipy import stats 
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from mpltools import style
from mpltools import layout
from pandas.tools.plotting import autocorrelation_plot
import json

In [140]:
#load news sequence from R
#The file is tab-separated. header=0 treats the first line as a header row and
#replaces it with the explicit column names below; a boolean `header` is not a
#valid value for read_csv and is rejected by current pandas.
#NOTE(review): confirm the file has exactly one header line - TODO verify
news_sequence = pd.read_csv(
    '../data/news_sequence_0025.csv',
    sep='\t',
    names=['ID', 'Sequence', 'Confidence'],
    header=0,
)

In [141]:
#clean the news
#strip the brace and angle-bracket delimiters from every sequence string,
#one character per pass
for delimiter in ("{", "}", "<", ">"):
    news_sequence['Sequence'] = news_sequence['Sequence'].apply(
        lambda text, delimiter=delimiter: text.replace(delimiter, ""))

In [142]:
#define the variables

#columns of the sequence table as parallel lists (same row order)
news_id_list = news_sequence['ID'].tolist()
news_seq_list = news_sequence['Sequence'].tolist()
news_conf_list = news_sequence['Confidence'].tolist()

#containers filled by the following cells
news_seq_nested = []    #each sequence string split into a list of keyword ids
news_conf_nested = []   #not used by the visible cells
keyword_map = {}        #keyword id -> keyword text
conf_map = {}           #keyword id -> confidence as read from the keyword list
keyword_tree = {}       #nested dict that is written to JSON at the end

#tree parameters
root = '74' #can be changed
min_conf = 0.1          #a child is only attached when its confidence is above this
root_count = 0          #not used by the visible cells
root_conf = 0           #starting value of the root node's 'count'
node_id = 0

In [143]:
#make the list of news sequence
#Each sequence string is split on "," into a list of keyword ids. The list is
#rebuilt from scratch so that re-running this cell cannot append the same
#sequences a second time.
news_seq_nested = [seq.split(",") for seq in news_seq_list]

In [144]:
#read file from keyword list and save it to dictionary
#Columns are read by position: 0 = keyword id, 1 = keyword text, 2 = confidence.
#NOTE(review): 'rb' is the mode the Python 2 csv module expects; under Python 3
#the file would have to be opened in text mode with newline='' instead.
with open('../data/keyword_list.csv', 'rb') as f:
    reader = csv.reader(f)
    next(reader, None) #skipheader; the default avoids StopIteration on an empty file

    for row in reader:
        #csv.reader yields an empty list for a blank line
        if not row:
            continue

        keyword_id = row[0]
        keyword = row[1]
        keyword_conf = row[2]

        #the first occurrence of an id wins, later duplicates are ignored
        if keyword_id not in keyword_map:
            keyword_map[keyword_id] = keyword
            conf_map[keyword_id] = keyword_conf

#look up the keyword text stored for a keyword id
def getName(keyword_id):
    """Return the entry of ``keyword_map`` stored under ``keyword_id``.

    Raises KeyError when the id is not present in ``keyword_map``.
    """
    keyword_name = keyword_map[keyword_id]
    return keyword_name

In [145]:
#seed the root node: its name comes from the root keyword id, its count starts
#at root_conf, and it has no children yet
keyword_tree.update({
    'name': getName(root),
    'count': root_conf,
    'children_list': [],
    'parent': 'null',
    'children': [],
    'conf': 'null',
})

#changes children dictionary to list
def checkChildren(child_list):
    """Return a new list with every item produced by iterating ``child_list``.

    Iterating a dict yields its keys; iterating a list yields a shallow copy.
    """
    return list(child_list)

#build the tree from sequence
#
#Every sequence that starts with `root` is walked keyword by keyword. At each
#step the prefix seen so far is looked up in the sequence table to get the
#node id (ID column) and the confidence of that prefix:
#  * prefix already attached to the current parent -> bump its count, descend
#  * prefix is new and its confidence is above min_conf -> attach it and stop
#  * prefix is new but not confident enough -> stop, nothing below it is added
#A node is reachable from its parent both through parent[node_id] and through
#parent['children'] (the same dict object); parent['children_list'] holds the
#ids of the attached children. The walk is not limited to a fixed depth.

#first row of every distinct sequence, i.e. the position list.index() would
#return, computed once instead of rescanning the whole list for every lookup
first_row_of = {}
for row, known_seq in enumerate(news_seq_nested):
    first_row_of.setdefault(tuple(known_seq), row)

for seq in news_seq_nested:
    if seq[0] != root:
        continue
    keyword_tree['count'] += 1

    parent = keyword_tree
    for pos in range(1, len(seq)):
        prefix = seq[:pos + 1]
        row = first_row_of.get(tuple(prefix))
        if row is None:
            #same exception type list.index() raises for an unknown prefix
            raise ValueError("%r is not in list" % (prefix,))
        node_id = news_id_list[row]
        conf_id = news_conf_list[row]

        #create the child containers only when they are missing; resetting
        #them on every visit would discard the children attached so far
        children_list = parent.setdefault('children_list', [])
        children = parent.setdefault('children', [])

        #membership is tested with the node id because that is what
        #children_list stores
        if node_id in children_list:
            parent = parent[node_id]
            parent['count'] += 1
            continue

        if conf_id > min_conf:
            node = {
                'name': getName(seq[pos]),
                'count': 1,
                'parent': parent['name'],
                'conf': conf_id,
            }
            children_list.append(node_id)
            parent[node_id] = node
            #attach to this parent only, not to every node that has children
            children.append(node)

        #a sequence that creates (or fails to create) a node goes no deeper
        break

In [146]:
#save to json
#serialise the finished tree and write it to the file read by the app
with open('../app/keyword_tree.json', 'w') as fp:
    fp.write(json.dumps(keyword_tree))

In [122]:


In [ ]: