In [39]:
import pandas as pd 
from pandas import Series, DataFrame, Panel
import numpy as np
import pandas as pd
import io
import requests
import time
import seaborn as sns
import string
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('/Users/mueric35/Box Sync/nytimes_articles/article_time.csv')
df = df.iloc[:,1:3]
df['article'] = df.article.apply(str)
df['time'] = pd.to_datetime(df['time'], format = '%d-%m-%y')

In [3]:
df_article_sum = df.groupby('article').aggregate('count').reset_index()
df_article_sum.sort_values('time',ascending=False).head(10)


Out[3]:
article time
9 17 6707
4 12 6234
0 0 4519
67 7 3451
34 4 3267
6 14 2581
3 11 2519
89 9 2216
7 15 2134
45 5 1957

In [4]:
df_time_sum = df.groupby('time').aggregate('count').reset_index()

In [5]:
df_time_sum = df.groupby('time').aggregate('count').reset_index()
df_time_sum.columns = ['time','Count']

In [6]:
topic = ['17','12','0','7','4']

In [7]:
import matplotlib.pyplot as plt

j = 0
for i in topic:
    select_article = df[df.article == i]
    select_count = select_article.groupby(['time']).agg('count').reset_index()
    select_count = select_count.merge(df_time_sum)
    select_count['trans_count'] = select_count.article/select_count.Count
    select_series = select_count[['time','trans_count']]
    select_array = np.array(select_series.trans_count)
    pp = Series(select_array, index= select_series.time)
    plt.figure(figsize = (10,8))
    pp.plot(color = 'darkblue')
    plt.title('Topic:' + i)
    plt.show()
    j+=1


Calculate Topic Count Array


In [8]:
topics_sorted = df_article_sum.sort_values('time',ascending=False).article

In [9]:
topics_array = []

In [10]:
df_full = df_time_sum

In [11]:
for i in topics_sorted:
    select_article = df[df.article == i]
    select_count = select_article.groupby(['time']).agg('count').reset_index()
    select_count = select_count.merge(df_time_sum)
    select_count['trans_count'] = select_count.article/select_count.Count
    select_series = select_count[['time','trans_count']]
    select_array = np.array(select_series.trans_count)
    pp = pd.DataFrame(Series(select_array, index= select_series.time)).reset_index()
    pp.columns = ['time', i]
    df_full = df_full.merge(pp, on = 'time', how = 'left')

Select NA remove Threshold


In [12]:
index = df_full.isnull().mean()
index_t = ['time']
index_t.extend(list(index[index<0.10].index[2:]))

In [13]:
df_full = df_full[index_t].fillna(0)

In [15]:
##This function is modified to retrived most historical stock data from NASDAQ and NYSX
##This function cannot retrieve historical index data since Google Finance does not support downding this type of data
##This function return dataframe
month_exe = {1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'Jun',7:'Jul',8:'Aug',9:'Sep',10:'Oct',11:'Nov',12:'Dec'}
def google_stocks(symbol, startdate, enddate = None):
    
    startdate = month_exe[startdate[0]] + '%20' + str(startdate[1]) + '%2C%20' + str(startdate[2])

    enddate = month_exe[enddate[0]] + '%20' + str(enddate[1]) + '%2C%20' + str(enddate[2])
    
    stock_url = 'https://finance.google.com/finance/historical?q=' + symbol + \
                '&startdate=' + startdate + '&enddate=' + enddate + '&output=csv'
 
    raw_response = requests.get(stock_url).content
 
    stock_data = pd.read_csv(io.StringIO(raw_response.decode('utf-8')))
 
    return stock_data

In [16]:
startdate = (7,1,2016)
enddate   = (7,1,2017)

In [17]:
Symbol = ['NASDAQ:GOOG','NYSE:UVV','NYSE:BP','NYSE:CLD','NYSE:BHP','NYSE:RIO']

In [18]:
for i in Symbol:
    symbol = i
    df_name = symbol.split(':')[1] + '_hist'
    exec('{} = google_stocks(symbol, startdate, enddate)'.format(df_name))
    exec('{}.Date = pd.to_datetime({}.Date, format = "%d-%b-%y")'.format(df_name,df_name))
    exec('{} = {}.sort_values("Date")'.format(df_name,df_name))

In [19]:
# copy df full array
df_new = df_full.copy()

In [20]:
df_new.time = df_new.time + pd.Timedelta(days = 1)

In [21]:
# select bhp
bhp_close = BHP_hist[['Date','Close']]
bhp_close.columns = ['time','Close']

In [22]:
df_new_1 = bhp_close.merge(df_new, on = 'time', how = 'left')
df_new_1 = df_new_1.dropna(axis = 0)
col_name = list(df_new_1.columns)[2:]

In [25]:
l_3 = []
for i in col_name: 
    l_3.append((i,df_new_1['Close'].corr(df_new_1[i], method='spearman')))

In [26]:
l_3.sort(key=lambda tup: abs(tup[1]),reverse=True)

In [27]:
dict_topic = dict(l_3)

Test Correlation


In [28]:
from pydoc import help
from scipy.stats.stats import pearsonr

In [33]:
dict_p_value = []
for i,j in dict_topic.items():
    dict_p_value.append([i,pearsonr(df_new_1['Close'],df_new_1[i])[0],pearsonr(df_new_1['Close'],df_new_1[i])[1]])

In [36]:
dict_p_value.sort(key = lambda x: abs(x[1]),reverse=True)

In [37]:
dict_p_value


Out[37]:
[['0', 0.39785343500739112, 5.9954139063067341e-11],
 ['16', 0.37532046037694888, 8.1215246922159303e-10],
 ['14', -0.29912358790890198, 1.3908572793017843e-06],
 ['2', 0.24867032452019439, 6.8121279339922864e-05],
 ['1', -0.21975492542923544, 0.00045291114844213425],
 ['17', -0.19336774088311487, 0.0020886707880643069],
 ['15', -0.14030947286410564, 0.026227158939319629],
 ['5', -0.12478109596647249, 0.0482952464052556],
 ['9', -0.10813415204562392, 0.087333882415321737],
 ['4', -0.078883969275970245, 0.21296248686245739],
 ['11', 0.069757523380673522, 0.2709000182110764],
 ['12', -0.064418245007361707, 0.3093728288103475],
 ['10', 0.061536008400850176, 0.33156425553213675],
 ['21', -0.019193438815614902, 0.76220134102732595],
 ['7', 0.01067227847529552, 0.86639328655937453]]

In [43]:
link = '/Users/mueric35/Box Sync/nytimes_articles/model/model3.pkl'
link_dict = '/Users/mueric35/Box Sync/nytimes_articles/dict.pkl'

with open(link, "rb") as f:
    LDA_model = pickle.load(f)
f.close()

with open(link_dict, "rb") as f:
    dictionary = pickle.load(f)
f.close()

In [55]:
select_topic = []
for i in [16,0,2,14]:
    topic = []
    for k in ([list(j)[0] for j in LDA_model.get_topic_terms(i)]):
            topic.append(dictionary[k])
    select_topic.append((i,topic))

In [57]:
for i in select_topic:
    print(i)


(16, ['trump', 'news', 'presid', 'mr_trump', 'white_hous', 'hous', 'report', 'white', 'media', 'press'])
(0, ['immigr', 'state', 'countri', 'unit', 'unit_state', 'border', 'trump', 'refuge', 'deport', 'american'])
(2, ['senat', 'session', 'fire', 'committe', 'comey', 'trump', 'investig', 'presid', 'mr_comey', 'mr_session'])
(14, ['polic', 'offic', 'depart', 'citi', 'polic_offic', 'arrest', 'kill', 'shoot', 'investig', 'report'])

In [ ]: