In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
In [46]:
# Load the raw posts (first CSV column becomes the index) and parse the
# 'Timestamp' column into datetimes so the .dt accessor can be used later.
cc = (
    pd.read_csv('./posts_ccompare_raw.csv', index_col=0, encoding='utf-8')
    .assign(Timestamp=lambda posts: pd.to_datetime(posts['Timestamp']))
)
In [47]:
# Reaction-count features: pick the three action totals from the posts frame
# and give them short feature names (row index stays cc.index).
features_reactions = cc[
    ['Actions.Agree.Total', 'Actions.Disagree.Total', 'Actions.Comment.Total']
].rename(columns={
    'Actions.Agree.Total': 'n_up',
    'Actions.Disagree.Total': 'n_down',
    'Actions.Comment.Total': 'n_reply',
})
In [48]:
# Visual sanity check: pairwise relationships between the reaction-count features.
sns.pairplot(features_reactions)
Out[48]:
In [49]:
# Calendar features derived from each post's timestamp. The explicit `columns`
# list pins the column order (week, day-of-week, hour, day-of-month).
features_date = pd.DataFrame(
    {
        't_week': cc.Timestamp.dt.week,
        't_dow': cc.Timestamp.dt.dayofweek,
        't_hour': cc.Timestamp.dt.hour,
        't_day': cc.Timestamp.dt.day,
    },
    columns=['t_week', 't_dow', 't_hour', 't_day'],
)
In [50]:
# Visual sanity check: pairwise relationships between the calendar features.
sns.pairplot(features_date)
Out[50]:
In [51]:
import spacy # See "Installing spaCy"
# Load the pipeline registered under the name 'en'; `nlp` is applied to every
# post body further down.
# NOTE(review): presumably the English model must be installed/linked as 'en'
# beforehand -- confirm for the spaCy version in use.
nlp = spacy.load('en') # You are here.
In [81]:
# Run every post body through the spaCy pipeline and keep each document's
# vector as a row of features.
spacy_docs = pd.DataFrame(index=cc.index)
docs = cc.Body.apply(nlp)
vec = docs.apply(lambda x: x.vector)
# index=cc.index is required: without it the frame gets a fresh 0..n-1
# RangeIndex, and every later index-aligned operation (column assignment from
# `docs`, pd.concat(axis=1) with the other feature frames, label-based row
# selection) would silently misalign or produce NaN rows whenever cc's index
# is not exactly 0..n-1.
# The column count (300) is hard-coded and must match the vector length.
feature_word_vec = pd.DataFrame(
    vec.tolist(),
    index=cc.index,
    columns=['spacy_%s' % i for i in range(300)],
)
In [87]:
# Add each document's `sentiment` attribute as an extra feature column.
# The values are assigned positionally (.values) instead of by index alignment:
# `docs` is indexed like cc, so an aligned assignment would yield NaN for every
# label missing from feature_word_vec's index if the two indexes differ. Both
# objects have one row per post in the same order, so positional assignment is
# correct regardless of which index feature_word_vec carries.
# NOTE(review): spaCy's Doc.sentiment is presumably 0.0 unless a pipeline
# component sets it -- confirm this column carries any signal.
feature_word_vec['spacy_sent'] = docs.apply(lambda x: x.sentiment).values
In [129]:
# tfidf
In [132]:
'''
Author: Giovanni Kastanja
Python: 3.6.0
Date: 24/6/2017
'''
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from scipy.sparse import csr_matrix
text = cc['Body']
# Build a stop-word set (English words that occur too many times to be useful).
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)
# Densify the tf-idf matrix into a frame with one row per post. index=cc.index
# keeps the rows labelled like the other feature frames; without it the frame
# gets a 0..n-1 RangeIndex and pd.concat(axis=1) / label-based selection would
# misalign whenever cc's index is not exactly 0..n-1.
features_tfidf = pd.DataFrame(vectorizer.fit_transform(text).toarray(), index=cc.index)
In [ ]:
In [ ]:
In [ ]:
In [7]:
# Load the indicator table and index it by the parsed 'Date' column: the
# parsed values become the index (named 'date') and the raw 'Date' text column
# is dropped, leaving only indicator columns.
targets = (
    pd.read_csv('./btc-ind.csv')
    .assign(date=lambda raw: pd.to_datetime(raw['Date']))
    .set_index('date')
    .drop('Date', axis=1)
)
targets.tail()
Out[7]:
In [8]:
# One-column frame (indexed like cc) holding the day each post is attributed
# to; it is the key used to join posts to the daily indicators.
# NOTE(review): round(freq="d") rounds to the NEAREST midnight, so posts made
# after noon are attributed to the following day -- confirm this is intended
# rather than a floor to the post's own calendar day.
join_by_date = cc.Timestamp.dt.round(freq="d").to_frame('date')
In [88]:
# Attach the daily indicator row to every post via its attributed day and drop
# posts with missing values after the join.
joined = join_by_date.join(targets, on='date').dropna()
# The attributed day is kept separately as the group label of each post.
groups = joined['date']
Y_all = joined.drop('date', axis=1)
cols = Y_all.columns
index = Y_all.index
#Y_all = pd.DataFrame(normalize(Y_all, axis=1, norm='l2'), columns=cols, index=index)
# Standardise each indicator column: centre on the column mean, then divide by
# the standard deviation of the centred column.
centered = Y_all - Y_all.mean()
Y_all = centered / centered.std()
#Y_all.plot()
In [175]:
# Assemble the feature matrix by placing the feature frames side by side
# (axis=1 aligns rows on the index). The tf-idf block is currently left out;
# the commented line below is the variant that includes it.
#features = pd.concat([features_date, features_tfidf, features_reactions, feature_word_vec], axis=1)
features = pd.concat([features_date, features_reactions, feature_word_vec], axis=1)
In [176]:
# Keep only the feature rows that have a target row (same labels, same order
# as Y_all). `.loc` replaces the deprecated `.ix` indexer (removed in pandas
# 1.0); the selection is purely label-based, which is what is wanted here.
X_all = features.loc[Y_all.index]
X_all.shape
Out[176]:
In [ ]:
In [135]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import normalize
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso
In [138]:
# Candidate models for the per-indicator comparison.
# random_state is fixed so the random forest (bootstrap + feature sampling)
# gives the same scores on every Restart & Run All.
rf = RandomForestRegressor(n_estimators=10, max_depth=3, criterion='mse', random_state=0)
# Defined for convenience but not part of the comparison list below.
xgb = XGBRegressor(n_estimators=10)
regressors = [rf, Lasso()]
In [ ]:
In [139]:
from sklearn.model_selection import GroupKFold

# Cross-validate every (indicator, regressor) pair and collect the fold scores.
# cross_val_score only honours `groups=` when `cv` is a group-aware splitter;
# with a plain integer (cv=4) the groups are ignored and posts from the same
# day can land in both the training and the validation fold. GroupKFold keeps
# each day entirely inside one fold.
group_cv = GroupKFold(n_splits=4)

target_scores = {}
for indicator in targets.columns:
    Y = Y_all[indicator]
    for reg in regressors:
        # Column label: indicator name plus the first 15 chars of the model repr.
        tag = indicator+':'+str(reg)[:15]
        scores = cross_val_score(reg, X_all, Y, cv=group_cv, groups=groups, scoring='neg_mean_squared_error')
        # Formatted so the output is identical under Python 2 and Python 3.
        print('%s %s' % (np.mean(scores), tag))
        target_scores[tag] = scores
# One column per (indicator, regressor) tag, one row per fold.
cv_score = pd.DataFrame(target_scores)
In [140]:
# Mean score per (indicator, regressor) column across folds, best first.
# Scores are negative MSE, so larger (closer to zero) is better.
ms = cv_score.mean(axis=0)
ms.sort_values(ascending=False)
Out[140]:
In [146]:
# Take the indicator name from a "<indicator>:<model>" tag (text before ':').
indicator = 'BTC_cbrt_dv_T1:Lasso(alpha=1.0'.split(":")[0]
Y = Y_all[indicator]

# Fit on all posts and predict the same posts back: `error` is the in-sample
# absolute residual of every post.
reg = XGBRegressor(n_estimators=100)
reg.fit(X_all, Y)
Y_t = reg.predict(X_all)
error = (Y - Y_t).abs()
In [147]:
# Distribution of the in-sample absolute errors.
error.hist()
Out[147]:
In [148]:
# Keep only the posts the in-sample model fits best: those whose absolute
# error is below the 30th percentile. The same boolean mask is applied to the
# features, the target and the group labels so they stay row-aligned.
itruth = error.lt(error.quantile(0.3))
X = X_all.loc[itruth]
Y = Y_all.loc[itruth, indicator]
G = groups.loc[itruth]
In [149]:
from sklearn.model_selection import GroupKFold

# Cross-validate a deeper model on the low-error subset only.
# `groups=` is ignored by cross_val_score when cv is a plain integer, so a
# GroupKFold splitter is passed to actually keep each day inside one fold.
reg = XGBRegressor(n_estimators=100, max_depth=8)
scores = cross_val_score(reg, X, Y, cv=GroupKFold(n_splits=4), groups=G, scoring='neg_mean_squared_error')
# Single-argument print: same output under Python 2 and Python 3.
print(sorted(scores))
In [150]:
# Histogram of the day labels of all posts, with the day labels of the
# retained low-error subset drawn on the same axes for comparison.
ax = groups.hist(figsize=(12,5))
G.hist(ax=ax)
Out[150]:
In [ ]:
In [163]:
# Refit on the retained subset and predict the same rows back (in-sample).
reg = XGBRegressor(n_estimators=100, max_depth=8)
reg.fit(X,Y)
Y_ = reg.predict(X)
# Pair every prediction with the day label of its post.
truth_df = pd.DataFrame({'date': G, 'Y': Y_})
In [164]:
Out[164]:
In [157]:
def get_stats(group):
    """Summarise a group of values.

    Calls the ``min``, ``max``, ``count`` and ``mean`` methods of ``group``
    (in that order) and returns the results in a dict keyed by those names.
    """
    stats = {}
    for stat_name in ('min', 'max', 'count', 'mean'):
        stats[stat_name] = getattr(group, stat_name)()
    return stats
In [160]:
# Plot the BTC_cbrt_dv_T1 indicator and draw `truth` on the same axes.
# NOTE(review): `truth` is not assigned in any cell visible in this notebook
# (only `truth_df` is) -- it appears to come from a cell whose content was
# removed, so this cell fails on a fresh Restart & Run All. Restore the
# definition before relying on this plot.
ax = targets.BTC_cbrt_dv_T1.plot()
truth.plot(ax=ax)
Out[160]:
In [162]:
# Display `truth`.
# NOTE(review): no visible cell defines `truth`; this relies on leftover kernel state.
truth
Out[162]:
In [188]:
def drop_bs(indicator, q=0.3):
    """Keep the posts a model fits best and aggregate their predictions per day.

    Steps:
      1. Fit an XGBRegressor on all posts (``X_all`` -> ``Y_all[indicator]``)
         and compute each post's in-sample absolute error (a histogram of the
         errors is drawn as a side effect).
      2. Keep only posts whose error is below the ``q`` quantile.
      3. Print grouped 4-fold cross-validation scores (negative MSE) of a
         smaller model on that subset.
      4. Refit a deeper model on the subset, predict the subset back, and
         aggregate the predictions per day label.

    Parameters
    ----------
    indicator : column name in ``Y_all`` to use as target.
    q : quantile of the absolute error below which a post is kept (default 0.3).

    Returns
    -------
    pandas.DataFrame indexed by day label with columns ``truthscore`` (mean
    prediction), ``impact_count`` (number of retained posts), ``truth_max``
    (max prediction) and ``date`` (copy of the index).

    Raises
    ------
    ValueError (from GroupKFold) if the retained subset spans fewer than 4
    distinct days.

    Relies on the notebook-level names ``Y_all``, ``X_all`` and ``groups``.
    """
    # Local import so the function works wherever the cell is run.
    from sklearn.model_selection import GroupKFold

    Y = Y_all[indicator]
    reg = XGBRegressor(n_estimators=100)
    reg.fit(X_all, Y)
    Y_t = reg.predict(X_all)
    error = abs(Y - Y_t)
    error.hist()

    # Same boolean mask on features, target and group labels keeps them aligned.
    itruth = error < error.quantile(q)
    X = X_all[itruth]
    Y = Y_all[indicator][itruth]
    G = groups[itruth]

    reg = XGBRegressor(n_estimators=30, max_depth=5)
    # `groups=` is ignored when cv is a plain integer; GroupKFold is needed for
    # the day labels to actually keep each day inside a single fold.
    scores = cross_val_score(reg, X, Y, cv=GroupKFold(n_splits=4), groups=G, scoring='neg_mean_squared_error')
    # Formatted so the output is identical under Python 2 and Python 3.
    print(sorted(scores))
    print('MEAN CV SCORE:  %s' % np.mean(scores))

    reg = XGBRegressor(n_estimators=100, max_depth=8)
    reg.fit(X, Y)
    Y_ = reg.predict(X)

    # The predictions must carry the posts' index: groupby(G) aligns on index
    # labels, so a default 0..n-1 index would pair predictions with the wrong
    # (or no) day labels.
    agg = pd.Series(Y_, index=Y.index).groupby(G)
    truthscore = agg.mean()
    impact_count = agg.count()
    truth_max = agg.max()
    return pd.DataFrame(dict(truthscore=truthscore, impact_count=impact_count, truth_max=truth_max, date=truthscore.index))
In [189]:
# Per-day aggregation for the BTC_cbrt_dv_T1 indicator, keeping posts whose
# in-sample error is below the 0.4 quantile.
dv = drop_bs('BTC_cbrt_dv_T1', 0.4)
In [195]:
import json
def to_json(df, path):
    """Write the rows of ``df`` to ``path`` as a JSON array of objects.

    Each row becomes one dict keyed by column name; its ``'date'`` entry is
    converted with ``str()`` before dumping. All records are built before the
    file is opened for writing.
    """
    def _as_record(row):
        # One row -> plain dict with a string date.
        record = row.to_dict()
        record['date'] = str(record['date'])
        return record

    records = [_as_record(row) for _, row in df.iterrows()]
    with open(path, 'w') as f:
        json.dump(records, f)
# Export the per-day aggregation; the path is relative to the notebook's
# working directory (a sibling 'bitcoin-daily-bars' directory must exist).
to_json(dv, '../bitcoin-daily-bars/out-truth-volume.json')
In [ ]:
In [205]:
# Raw post rows for the retained low-error subset: select the rows covered by
# the mask by label, then keep those where the mask is True.
# `.loc` replaces the deprecated `.ix` indexer (removed in pandas 1.0).
impactfull = cc.loc[itruth.index][itruth]
In [207]:
# Peek at the first retained posts.
impactfull.head()
Out[207]:
In [232]:
# Share of each author's posts that ended up in the retained subset.
f = 'Cryptopian.Name'
a = impactfull.groupby(f).size()   # retained posts per author
b = cc.groupby(f).size()           # all posts per author
c = pd.DataFrame(dict(a=a,b=b))
# Only authors with more than one retained post. The ratio column is added
# with .assign on the filtered frame (a new object) rather than by setting a
# column on a slice, which avoids pandas' SettingWithCopyWarning.
c = c[c.a>1].assign(impact=lambda kept: kept.a/kept.b)
c.sort_values('impact', ascending=False)
Out[232]:
In [225]:
In [ ]:
In [240]:
# Plot the per-day mean prediction returned by drop_bs.
dv.truthscore.plot()
Out[240]:
In [ ]:
target_sc