In [39]:
import pandas as pd
from pandas import Series, DataFrame, Panel
import numpy as np
import pandas as pd
import io
import requests
import time
import seaborn as sns
import string
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the article/date table: keep only the 2nd and 3rd CSV columns, store the
# article label as text and parse the day-month-year date strings.
articles_csv = '/Users/mueric35/Box Sync/nytimes_articles/article_time.csv'
df = pd.read_csv(articles_csv).iloc[:, 1:3]
df = df.assign(
    article=df['article'].map(str),
    time=pd.to_datetime(df['time'], format='%d-%m-%y'),
)
In [3]:
# Row count per article label; the count lands in the 'time' column.
df_article_sum = df.groupby('article').count().reset_index()
# Show the ten labels with the most rows.
df_article_sum.sort_values(by='time', ascending=False).iloc[:10]
Out[3]:
In [4]:
# Row count per date (the count column still carries the name 'article').
df_time_sum = df.groupby('time').count().reset_index()
In [5]:
# Row count per date, with the count column named 'Count'.
df_time_sum = (
    df.groupby('time')
    .count()
    .reset_index()
    .rename(columns={'article': 'Count'})
)
In [6]:
# Article labels (as strings, matching df.article) to plot individually.
topic = [
    '17',
    '12',
    '0',
    '7',
    '4',
]
In [7]:
# For each selected label, plot its share of each day's rows over time:
# (rows with that label on the day) / (all rows on the day).
# pyplot is already imported in the first cell, so it is not re-imported here.
for topic_id in topic:
    topic_rows = df[df.article == topic_id]
    daily = topic_rows.groupby('time').agg('count').reset_index()
    # Explicit key: only the date column is shared with the daily totals.
    daily = daily.merge(df_time_sum, on='time')
    share = Series((daily.article / daily.Count).values, index=daily.time)

    fig, ax = plt.subplots(figsize=(10, 8))
    share.plot(ax=ax, color='darkblue')
    ax.set_title('Topic:' + topic_id)
    ax.set_ylabel('Share of daily articles')
    plt.show()
In [8]:
# Article labels ordered from most to fewest rows.
topics_sorted = df_article_sum.sort_values(by='time', ascending=False)['article']
In [9]:
# Empty placeholder list.
topics_array = list()
In [10]:
# Start the wide per-label table from the daily totals. This binds df_full to
# the same object as df_time_sum; no copy is made here.
df_full = df_time_sum
In [11]:
# Build the wide table: one row per date, one column per article label holding
# that label's share of the day's rows (NaN on dates where the label is absent).
# Restart from the daily totals so that re-running this cell does not merge the
# label columns onto an already-merged frame.
df_full = df_time_sum
for topic_id in topics_sorted:
    daily = df[df.article == topic_id].groupby('time').agg('count').reset_index()
    # Explicit key: only the date column is shared with the daily totals.
    daily = daily.merge(df_time_sum, on='time')
    share = daily[['time']].copy()
    share[topic_id] = daily.article / daily.Count
    df_full = df_full.merge(share, on='time', how='left')
In [12]:
# Fraction of missing values in every column of the wide table.
index = df_full.isnull().mean()
# Keep 'time' plus the columns missing in fewer than 10% of rows. The [2:]
# slice drops the first two qualifying labels (expected to be 'time' and
# 'Count' -- TODO confirm).
index_t = ['time'] + list(index[index < 0.10].index[2:])
In [13]:
# Restrict to the selected columns and replace remaining gaps with zero.
df_full = df_full.loc[:, index_t].fillna(value=0)
In [15]:
## Retrieves historical stock data for NASDAQ- and NYSE-listed symbols.
## Historical index data cannot be retrieved, since Google Finance does not
## support downloading that type of data.
## Returns a DataFrame.
month_exe = {1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'Jun',7:'Jul',8:'Aug',9:'Sep',10:'Oct',11:'Nov',12:'Dec'}
def google_stocks(symbol, startdate, enddate = None):
    """Download the historical quotes of ``symbol`` as a DataFrame.

    Parameters
    ----------
    symbol : str
        Ticker as passed in the ``q=`` query parameter, e.g. ``'NYSE:BHP'``.
    startdate : tuple
        ``(month, day, year)`` where ``month`` is an integer key of
        ``month_exe`` (1-12).
    enddate : tuple, optional
        Same layout as ``startdate``. When omitted, today's date is used.

    Returns
    -------
    pandas.DataFrame
        The CSV payload of the response, parsed with ``pd.read_csv``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    import datetime  # local import: only needed for the default end date

    def as_query_date(date_tuple):
        # URL-encoded "Mon D, YYYY" ('%20' is a space, '%2C' a comma).
        return (month_exe[date_tuple[0]] + '%20' + str(date_tuple[1])
                + '%2C%20' + str(date_tuple[2]))

    if enddate is None:
        # The signature advertises an optional end date; default it to today
        # instead of failing on ``None[0]``.
        today = datetime.date.today()
        enddate = (today.month, today.day, today.year)

    stock_url = ('https://finance.google.com/finance/historical?q=' + symbol
                 + '&startdate=' + as_query_date(startdate)
                 + '&enddate=' + as_query_date(enddate)
                 + '&output=csv')
    # Bounded wait so a stalled connection cannot hang the notebook; surface
    # HTTP errors rather than parsing an error page as CSV.
    response = requests.get(stock_url, timeout=30)
    response.raise_for_status()
    stock_data = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
    return stock_data
In [16]:
# Download window as (month, day, year) tuples: 1 Jul 2016 through 1 Jul 2017.
startdate, enddate = (7, 1, 2016), (7, 1, 2017)
In [17]:
# Exchange-qualified tickers to download.
Symbol = [
    'NASDAQ:GOOG',
    'NYSE:UVV',
    'NYSE:BP',
    'NYSE:CLD',
    'NYSE:BHP',
    'NYSE:RIO',
]
In [18]:
# Download every ticker's history, parse its Date column and sort it
# chronologically. Each frame is stored in ``stock_hist`` and also published
# as a module-level variable named "<TICKER>_hist" (e.g. BHP_hist) so later
# cells can keep using those names. Plain code replaces the former
# string-built exec() calls.
stock_hist = {}
for symbol in Symbol:
    df_name = symbol.split(':')[1] + '_hist'
    history = google_stocks(symbol, startdate, enddate)
    history['Date'] = pd.to_datetime(history['Date'], format="%d-%b-%y")
    history = history.sort_values("Date")
    stock_hist[df_name] = history
    globals()[df_name] = history
In [19]:
# Work on an independent copy of the wide table so df_full stays untouched.
df_new = df_full.copy(deep=True)
In [20]:
# Shift every date forward by one day (presumably so each day's article mix
# lines up with the next day's close -- TODO confirm intent). Deriving df_new
# from df_full keeps this cell idempotent: re-running it no longer adds an
# extra day each time.
df_new = df_full.assign(time=df_full.time + pd.Timedelta(days = 1))
In [21]:
# BHP closing prices, with the date column renamed to 'time' so it can be
# joined with the article table.
bhp_close = BHP_hist[['Date', 'Close']].rename(columns={'Date': 'time'})
In [22]:
# Attach the article-share columns to each price row, then discard rows that
# have any missing value.
df_new_1 = (
    bhp_close
    .merge(df_new, on='time', how='left')
    .dropna(axis=0)
)
# Every column after the first two ('time' and 'Close').
col_name = df_new_1.columns[2:].tolist()
In [25]:
# (column, Spearman correlation with the closing price) for every column
# listed in col_name.
l_3 = [
    (column, df_new_1['Close'].corr(df_new_1[column], method='spearman'))
    for column in col_name
]
In [26]:
# Order in place by absolute correlation, largest first.
l_3.sort(reverse=True, key=lambda pair: abs(pair[1]))
In [27]:
# Map each column name to its Spearman correlation.
dict_topic = {column: rho for column, rho in l_3}
In [28]:
from pydoc import help
# Import from the public namespace; ``scipy.stats.stats`` is a deprecated
# private module path.
from scipy.stats import pearsonr
In [33]:
# For every column in dict_topic, record [column, Pearson r, p-value] against
# the closing price. pearsonr is evaluated once per column instead of twice.
# Note: despite its name, dict_p_value is a list of 3-element lists.
dict_p_value = []
for column in dict_topic:
    result = pearsonr(df_new_1['Close'], df_new_1[column])
    dict_p_value.append([column, result[0], result[1]])
In [36]:
# Order in place by absolute Pearson r, largest first.
dict_p_value.sort(reverse=True, key=lambda row: abs(row[1]))
In [37]:
# Display the list of [column, Pearson r, p-value] rows.
dict_p_value
Out[37]:
In [43]:
# Load the pickled LDA model and its dictionary from disk.
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# open pickle files that come from a trusted source.
link = '/Users/mueric35/Box Sync/nytimes_articles/model/model3.pkl'
link_dict = '/Users/mueric35/Box Sync/nytimes_articles/dict.pkl'
# The with-statement closes each file on exit, so no explicit close() is needed.
with open(link, "rb") as f:
    LDA_model = pickle.load(f)
with open(link_dict, "rb") as f:
    dictionary = pickle.load(f)
In [55]:
# For each chosen LDA topic id, collect the words behind the term ids returned
# by get_topic_terms(). The per-topic word list has its own name (``words``) so
# it no longer overwrites the module-level ``topic`` list that the plotting
# cell iterates over.
select_topic = []
for topic_id in [16, 0, 2, 14]:
    # Each entry returned by get_topic_terms() starts with the term id.
    term_ids = [entry[0] for entry in LDA_model.get_topic_terms(topic_id)]
    words = [dictionary[term_id] for term_id in term_ids]
    select_topic.append((topic_id, words))
In [57]:
# Print one (topic id, words) tuple per line.
for topic_id, words in select_topic:
    print((topic_id, words))
In [ ]: