notebook.community

Edit and run



In [39]:

    
import pandas as pd 
from pandas import Series, DataFrame, Panel
import numpy as np
import pandas as pd
import io
import requests
import time
import seaborn as sns
import string
import pickle
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:

    
df = pd.read_csv('/Users/mueric35/Box Sync/nytimes_articles/article_time.csv')
df = df.iloc[:,1:3]
df['article'] = df.article.apply(str)
df['time'] = pd.to_datetime(df['time'], format = '%d-%m-%y')



In [3]:

    
df_article_sum = df.groupby('article').aggregate('count').reset_index()
df_article_sum.sort_values('time',ascending=False).head(10)



In [4]:

    
df_time_sum = df.groupby('time').aggregate('count').reset_index()



In [5]:

    
df_time_sum = df.groupby('time').aggregate('count').reset_index()
df_time_sum.columns = ['time','Count']



In [6]:

    
topic = ['17','12','0','7','4']



In [7]:

    
import matplotlib.pyplot as plt

j = 0
for i in topic:
    select_article = df[df.article == i]
    select_count = select_article.groupby(['time']).agg('count').reset_index()
    select_count = select_count.merge(df_time_sum)
    select_count['trans_count'] = select_count.article/select_count.Count
    select_series = select_count[['time','trans_count']]
    select_array = np.array(select_series.trans_count)
    pp = Series(select_array, index= select_series.time)
    plt.figure(figsize = (10,8))
    pp.plot(color = 'darkblue')
    plt.title('Topic:' + i)
    plt.show()
    j+=1

Calculate Topic Count Array



In [8]:

    
topics_sorted = df_article_sum.sort_values('time',ascending=False).article



In [9]:

    
topics_array = []



In [10]:

    
df_full = df_time_sum



In [11]:

    
for i in topics_sorted:
    select_article = df[df.article == i]
    select_count = select_article.groupby(['time']).agg('count').reset_index()
    select_count = select_count.merge(df_time_sum)
    select_count['trans_count'] = select_count.article/select_count.Count
    select_series = select_count[['time','trans_count']]
    select_array = np.array(select_series.trans_count)
    pp = pd.DataFrame(Series(select_array, index= select_series.time)).reset_index()
    pp.columns = ['time', i]
    df_full = df_full.merge(pp, on = 'time', how = 'left')

Select NA remove Threshold



In [12]:

    
index = df_full.isnull().mean()
index_t = ['time']
index_t.extend(list(index[index<0.10].index[2:]))



In [13]:

    
df_full = df_full[index_t].fillna(0)



In [15]:

    
##This function is modified to retrived most historical stock data from NASDAQ and NYSX
##This function cannot retrieve historical index data since Google Finance does not support downding this type of data
##This function return dataframe
month_exe = {1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'Jun',7:'Jul',8:'Aug',9:'Sep',10:'Oct',11:'Nov',12:'Dec'}
def google_stocks(symbol, startdate, enddate = None):
    
    startdate = month_exe[startdate[0]] + '%20' + str(startdate[1]) + '%2C%20' + str(startdate[2])

    enddate = month_exe[enddate[0]] + '%20' + str(enddate[1]) + '%2C%20' + str(enddate[2])
    
    stock_url = 'https://finance.google.com/finance/historical?q=' + symbol + \
                '&startdate=' + startdate + '&enddate=' + enddate + '&output=csv'
 
    raw_response = requests.get(stock_url).content
 
    stock_data = pd.read_csv(io.StringIO(raw_response.decode('utf-8')))
 
    return stock_data



In [16]:

    
startdate = (7,1,2016)
enddate   = (7,1,2017)



In [17]:

    
Symbol = ['NASDAQ:GOOG','NYSE:UVV','NYSE:BP','NYSE:CLD','NYSE:BHP','NYSE:RIO']



In [18]:

    
for i in Symbol:
    symbol = i
    df_name = symbol.split(':')[1] + '_hist'
    exec('{} = google_stocks(symbol, startdate, enddate)'.format(df_name))
    exec('{}.Date = pd.to_datetime({}.Date, format = "%d-%b-%y")'.format(df_name,df_name))
    exec('{} = {}.sort_values("Date")'.format(df_name,df_name))



In [19]:

    
# copy df full array
df_new = df_full.copy()



In [20]:

    
df_new.time = df_new.time + pd.Timedelta(days = 1)



In [21]:

    
# select bhp
bhp_close = BHP_hist[['Date','Close']]
bhp_close.columns = ['time','Close']



In [22]:

    
df_new_1 = bhp_close.merge(df_new, on = 'time', how = 'left')
df_new_1 = df_new_1.dropna(axis = 0)
col_name = list(df_new_1.columns)[2:]



In [25]:

    
l_3 = []
for i in col_name: 
    l_3.append((i,df_new_1['Close'].corr(df_new_1[i], method='spearman')))



In [26]:

    
l_3.sort(key=lambda tup: abs(tup[1]),reverse=True)



In [27]:

    
dict_topic = dict(l_3)

Test Correlation



In [28]:

    
from pydoc import help
from scipy.stats.stats import pearsonr



In [33]:

    
dict_p_value = []
for i,j in dict_topic.items():
    dict_p_value.append([i,pearsonr(df_new_1['Close'],df_new_1[i])[0],pearsonr(df_new_1['Close'],df_new_1[i])[1]])



In [36]:

    
dict_p_value.sort(key = lambda x: abs(x[1]),reverse=True)



In [37]:

    
dict_p_value









    Out[37]:





[['0', 0.39785343500739112, 5.9954139063067341e-11],
 ['16', 0.37532046037694888, 8.1215246922159303e-10],
 ['14', -0.29912358790890198, 1.3908572793017843e-06],
 ['2', 0.24867032452019439, 6.8121279339922864e-05],
 ['1', -0.21975492542923544, 0.00045291114844213425],
 ['17', -0.19336774088311487, 0.0020886707880643069],
 ['15', -0.14030947286410564, 0.026227158939319629],
 ['5', -0.12478109596647249, 0.0482952464052556],
 ['9', -0.10813415204562392, 0.087333882415321737],
 ['4', -0.078883969275970245, 0.21296248686245739],
 ['11', 0.069757523380673522, 0.2709000182110764],
 ['12', -0.064418245007361707, 0.3093728288103475],
 ['10', 0.061536008400850176, 0.33156425553213675],
 ['21', -0.019193438815614902, 0.76220134102732595],
 ['7', 0.01067227847529552, 0.86639328655937453]]



In [43]:

    
link = '/Users/mueric35/Box Sync/nytimes_articles/model/model3.pkl'
link_dict = '/Users/mueric35/Box Sync/nytimes_articles/dict.pkl'

with open(link, "rb") as f:
    LDA_model = pickle.load(f)
f.close()

with open(link_dict, "rb") as f:
    dictionary = pickle.load(f)
f.close()



In [55]:

    
select_topic = []
for i in [16,0,2,14]:
    topic = []
    for k in ([list(j)[0] for j in LDA_model.get_topic_terms(i)]):
            topic.append(dictionary[k])
    select_topic.append((i,topic))



In [57]:

    
for i in select_topic:
    print(i)









    



(16, ['trump', 'news', 'presid', 'mr_trump', 'white_hous', 'hous', 'report', 'white', 'media', 'press'])
(0, ['immigr', 'state', 'countri', 'unit', 'unit_state', 'border', 'trump', 'refuge', 'deport', 'american'])
(2, ['senat', 'session', 'fire', 'committe', 'comey', 'trump', 'investig', 'presid', 'mr_comey', 'mr_session'])
(14, ['polic', 'offic', 'depart', 'citi', 'polic_offic', 'arrest', 'kill', 'shoot', 'investig', 'report'])



In [ ]:

	article	time
9	17	6707
4	12	6234
0	0	4519
67	7	3451
34	4	3267
6	14	2581
3	11	2519
89	9	2216
7	15	2134
45	5	1957