In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline


/Users/pokutnik/anaconda3/envs/ekans/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [46]:
cc = pd.read_csv('./posts_ccompare_raw.csv', index_col=0, encoding='utf-8')
cc['Timestamp'] = pd.to_datetime(cc['Timestamp'])

Reaction features


In [47]:
features_reactions = pd.DataFrame(index=cc.index)
features_reactions['n_up'] = cc['Actions.Agree.Total']
features_reactions['n_down'] = cc['Actions.Disagree.Total']
features_reactions['n_reply'] = cc['Actions.Comment.Total']

In [48]:
sns.pairplot(features_reactions)


Out[48]:
<seaborn.axisgrid.PairGrid at 0x11d9e8410>

Post date features


In [49]:
features_date = pd.DataFrame(index=cc.index)
features_date['t_week'] = cc.Timestamp.dt.week
features_date['t_dow'] = cc.Timestamp.dt.dayofweek
features_date['t_hour'] = cc.Timestamp.dt.hour
features_date['t_day'] = cc.Timestamp.dt.day

In [50]:
sns.pairplot(features_date)


Out[50]:
<seaborn.axisgrid.PairGrid at 0x120a42350>

Spacy NLP ...


In [51]:
import spacy                           # See "Installing spaCy"
nlp = spacy.load('en')                 # You are here.

In [81]:
spacy_docs = pd.DataFrame(index=cc.index)
docs = cc.Body.apply(nlp)
vec = docs.apply(lambda x: x.vector)
feature_word_vec = pd.DataFrame(vec.tolist(), columns=['spacy_%s'%i for i in range(300)])

In [87]:
feature_word_vec['spacy_sent'] = docs.apply(lambda x: x.sentiment)

In [129]:
# tfidf

In [132]:
'''
Author: Giovanni Kastanja
Python: 3.6.0
Date: 24/6/2017
'''
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from scipy.sparse import csr_matrix
text = cc['Body']
# create a stopset (words that occur to many times)
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)
features_tfidf = pd.DataFrame(vectorizer.fit_transform(text).toarray())

In [ ]:


In [ ]:


In [ ]:

Target


In [7]:
targets = pd.read_csv('./btc-ind.csv')
targets['date'] = pd.to_datetime(targets['Date'])
targets = targets.set_index('date')
del targets['Date']
targets.tail()


Out[7]:
BTC_pd_T0 BTC_rpd_T0 BTC_v_T0 BTC_rpd_T1 BTC_v_T1 BTC_pd_T2 BTC_v_T2 BTC_s_T2 BTC_dh_m3 BTC_dl_m3 ... BTC_log_s_T2 BTC_cbrt_rpd_T0 BTC_cbrt_pd_T1 BTC_cbrt_dv_T1 BTC_cbrt_hl_T1 BTC_cbrt_pd_T2 BTC_cbrt_dh_m3 BTC_cbrt_dl_m3 BTC_cbrt_do_m3 BTC_cbrt_dp_m3
date
2017-06-18 -115.54 -0.043516 92004.22 0.037392 81829.22 99.28 111303.55 3.000340e+08 41.650000 62.363333 ... 19.519406 -0.351736 -3.370213 -21.669296 5.054862 4.630422 3.466344 3.965608 4.437038 -2.179750
2017-06-19 77.26 0.030423 81829.22 0.054364 111303.55 138.06 116033.61 3.132347e+08 -43.750000 31.386667 ... 19.562463 0.312175 5.989054 30.889768 6.555346 5.168398 -3.523649 3.154388 -4.005548 -3.349197
2017-06-20 137.56 0.052568 111303.55 0.040515 116033.61 106.02 78297.60 2.103188e+08 102.390000 73.530000 ... 19.164135 0.374604 3.932190 16.786322 5.753231 4.732921 4.678276 4.189429 -2.718934 2.434569
2017-06-21 -76.76 -0.027868 116033.61 -0.015978 78297.60 -44.01 56938.10 1.553034e+08 63.530000 38.646667 ... 18.860891 -0.303182 -3.159516 -33.541720 5.104085 -3.530616 3.990184 3.380939 4.149354 2.317179
2017-06-22 45.22 0.016888 78297.60 -0.041268 56938.10 -110.50 65779.46 1.740187e+08 -30.083333 2.670000 ... 18.974673 0.256563 3.199414 -27.745786 5.148732 -4.798669 -3.110107 1.387300 -3.438659 -2.539463

5 rows × 23 columns


In [8]:
join_by_date = pd.DataFrame(index=cc.index)
join_by_date['date'] = cc.Timestamp.dt.round(freq="d")

In [88]:
Y_all = join_by_date.join(targets, on='date').dropna()
groups = Y_all['date']
del Y_all['date']
cols = Y_all.columns
index = Y_all.index
#Y_all = pd.DataFrame(normalize(Y_all, axis=1, norm='l2'), columns=cols, index=index)
Y_all = Y_all - Y_all.mean()
Y_all = Y_all/Y_all.std()
#Y_all.plot()

Combine features


In [175]:
#features = pd.concat([features_date, features_tfidf, features_reactions, feature_word_vec], axis=1)
features = pd.concat([features_date, features_reactions, feature_word_vec], axis=1)

In [176]:
X_all = features.ix[Y_all.index]
X_all.shape


Out[176]:
(2909, 304)

In [ ]:


In [135]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import normalize
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso

In [138]:
rf = RandomForestRegressor(n_estimators=10, max_depth=3, criterion='mse')
xgb = XGBRegressor(n_estimators=10)
regressors = [rf, Lasso()]

In [ ]:


In [139]:
target_scores = {}
for indicator in targets.columns:
    Y =Y_all[indicator]    
    for reg in regressors:
        tag = indicator+':'+str(reg)[:15]
        scores = cross_val_score(reg, X_all, Y, cv=4, groups=groups, scoring='neg_mean_squared_error')
        print np.mean(scores), tag
        target_scores[tag] = scores

cv_score = pd.DataFrame(target_scores)


-1.01510250221 BTC_pd_T0:RandomForestReg
-1.00489834393 BTC_pd_T0:Lasso(alpha=1.0
-1.01362175733 BTC_rpd_T0:RandomForestReg
-1.00700271374 BTC_rpd_T0:Lasso(alpha=1.0
-1.12117041142 BTC_v_T0:RandomForestReg
-1.12333273979 BTC_v_T0:Lasso(alpha=1.0
-1.0279897913 BTC_rpd_T1:RandomForestReg
-1.02159575107 BTC_rpd_T1:Lasso(alpha=1.0
-1.14329555947 BTC_v_T1:RandomForestReg
-1.14257585964 BTC_v_T1:Lasso(alpha=1.0
-1.03427313345 BTC_pd_T2:RandomForestReg
-1.02477879379 BTC_pd_T2:Lasso(alpha=1.0
-1.13260911269 BTC_v_T2:RandomForestReg
-1.12544047748 BTC_v_T2:Lasso(alpha=1.0
-1.51004848987 BTC_s_T2:RandomForestReg
-1.50287169074 BTC_s_T2:Lasso(alpha=1.0
-1.01129501027 BTC_dh_m3:RandomForestReg
-1.00167162664 BTC_dh_m3:Lasso(alpha=1.0
-1.00526912285 BTC_dl_m3:RandomForestReg
-1.00073235397 BTC_dl_m3:Lasso(alpha=1.0
-1.00164740181 BTC_do_m3:RandomForestReg
-1.00238122689 BTC_do_m3:Lasso(alpha=1.0
-1.00571974508 BTC_dp_m3:RandomForestReg
-1.00041327712 BTC_dp_m3:Lasso(alpha=1.0
-1.17915445144 BTC_log_v_T2:RandomForestReg
-1.16959479508 BTC_log_v_T2:Lasso(alpha=1.0
-1.5242905509 BTC_log_s_T2:RandomForestReg
-1.5204661907 BTC_log_s_T2:Lasso(alpha=1.0
-1.01681433113 BTC_cbrt_rpd_T0:RandomForestReg
-1.00502746636 BTC_cbrt_rpd_T0:Lasso(alpha=1.0
-1.01580076386 BTC_cbrt_pd_T1:RandomForestReg
-1.00757264031 BTC_cbrt_pd_T1:Lasso(alpha=1.0
-1.00875193424 BTC_cbrt_dv_T1:RandomForestReg
-1.00127016481 BTC_cbrt_dv_T1:Lasso(alpha=1.0
-1.51169196602 BTC_cbrt_hl_T1:RandomForestReg
-1.51760727246 BTC_cbrt_hl_T1:Lasso(alpha=1.0
-1.0315597114 BTC_cbrt_pd_T2:RandomForestReg
-1.01853785493 BTC_cbrt_pd_T2:Lasso(alpha=1.0
-0.99680604795 BTC_cbrt_dh_m3:RandomForestReg
-1.00100385513 BTC_cbrt_dh_m3:Lasso(alpha=1.0
-1.01059521017 BTC_cbrt_dl_m3:RandomForestReg
-1.00360457359 BTC_cbrt_dl_m3:Lasso(alpha=1.0
-1.00854660849 BTC_cbrt_do_m3:RandomForestReg
-1.00367703909 BTC_cbrt_do_m3:Lasso(alpha=1.0
-1.01929146151 BTC_cbrt_dp_m3:RandomForestReg
-1.00361460398 BTC_cbrt_dp_m3:Lasso(alpha=1.0

In [140]:
ms = cv_score.mean(axis=0)
ms.sort_values(ascending=False)


Out[140]:
BTC_cbrt_dh_m3:RandomForestReg    -0.996806
BTC_dp_m3:Lasso(alpha=1.0         -1.000413
BTC_dl_m3:Lasso(alpha=1.0         -1.000732
BTC_cbrt_dh_m3:Lasso(alpha=1.0    -1.001004
BTC_cbrt_dv_T1:Lasso(alpha=1.0    -1.001270
BTC_do_m3:RandomForestReg         -1.001647
BTC_dh_m3:Lasso(alpha=1.0         -1.001672
BTC_do_m3:Lasso(alpha=1.0         -1.002381
BTC_cbrt_dl_m3:Lasso(alpha=1.0    -1.003605
BTC_cbrt_dp_m3:Lasso(alpha=1.0    -1.003615
BTC_cbrt_do_m3:Lasso(alpha=1.0    -1.003677
BTC_pd_T0:Lasso(alpha=1.0         -1.004898
BTC_cbrt_rpd_T0:Lasso(alpha=1.0   -1.005027
BTC_dl_m3:RandomForestReg         -1.005269
BTC_dp_m3:RandomForestReg         -1.005720
BTC_rpd_T0:Lasso(alpha=1.0        -1.007003
BTC_cbrt_pd_T1:Lasso(alpha=1.0    -1.007573
BTC_cbrt_do_m3:RandomForestReg    -1.008547
BTC_cbrt_dv_T1:RandomForestReg    -1.008752
BTC_cbrt_dl_m3:RandomForestReg    -1.010595
BTC_dh_m3:RandomForestReg         -1.011295
BTC_rpd_T0:RandomForestReg        -1.013622
BTC_pd_T0:RandomForestReg         -1.015103
BTC_cbrt_pd_T1:RandomForestReg    -1.015801
BTC_cbrt_rpd_T0:RandomForestReg   -1.016814
BTC_cbrt_pd_T2:Lasso(alpha=1.0    -1.018538
BTC_cbrt_dp_m3:RandomForestReg    -1.019291
BTC_rpd_T1:Lasso(alpha=1.0        -1.021596
BTC_pd_T2:Lasso(alpha=1.0         -1.024779
BTC_rpd_T1:RandomForestReg        -1.027990
BTC_cbrt_pd_T2:RandomForestReg    -1.031560
BTC_pd_T2:RandomForestReg         -1.034273
BTC_v_T0:RandomForestReg          -1.121170
BTC_v_T0:Lasso(alpha=1.0          -1.123333
BTC_v_T2:Lasso(alpha=1.0          -1.125440
BTC_v_T2:RandomForestReg          -1.132609
BTC_v_T1:Lasso(alpha=1.0          -1.142576
BTC_v_T1:RandomForestReg          -1.143296
BTC_log_v_T2:Lasso(alpha=1.0      -1.169595
BTC_log_v_T2:RandomForestReg      -1.179154
BTC_s_T2:Lasso(alpha=1.0          -1.502872
BTC_s_T2:RandomForestReg          -1.510048
BTC_cbrt_hl_T1:RandomForestReg    -1.511692
BTC_cbrt_hl_T1:Lasso(alpha=1.0    -1.517607
BTC_log_s_T2:Lasso(alpha=1.0      -1.520466
BTC_log_s_T2:RandomForestReg      -1.524291
dtype: float64

In [146]:
indicator = 'BTC_cbrt_dv_T1:Lasso(alpha=1.0'
indicator = indicator.split(":")[0]
Y = Y_all[indicator]
reg = XGBRegressor(n_estimators=100)
reg.fit(X_all, Y)
Y_t = reg.predict(X_all)
error = abs(Y - Y_t)

In [147]:
error.hist()


Out[147]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f2af8d0>

In [148]:
# DROP THE BULL$HIT
itruth = error < error.quantile(0.3)
X = X_all[itruth]
Y = Y_all[indicator][itruth]
G = groups[itruth]

In [149]:
reg = XGBRegressor(n_estimators=100, max_depth=8)
scores = cross_val_score(reg, X, Y, cv=4, groups=G, scoring='neg_mean_squared_error')
print sorted(scores)


[-0.23556982483024885, -0.21678759213304766, -0.18345071771185228, -0.16639456674108286]

In [150]:
ax = groups.hist(figsize=(12,5))
G.hist(ax=ax)


Out[150]:
<matplotlib.axes._subplots.AxesSubplot at 0x15829a550>

In [ ]:


In [163]:
reg = XGBRegressor(n_estimators=100, max_depth=8)
reg.fit(X,Y)
Y_ = reg.predict(X)
truth_df = pd.DataFrame({'date': G, 'Y': Y_})

In [164]:



Out[164]:
date
2017-05-11   -0.207513
2017-05-12   -0.006514
2017-05-13   -0.295262
2017-05-15   -0.207278
2017-05-17   -0.089345
2017-05-19   -0.348372
2017-05-20    0.100844
2017-05-28   -0.578722
2017-05-30   -0.103358
2017-05-31   -0.336672
2017-06-01   -0.411269
2017-06-02   -0.333014
2017-06-03   -0.400911
2017-06-07   -0.377851
2017-06-08   -0.376306
2017-06-10   -0.473987
2017-06-16   -0.434028
2017-06-17   -0.439826
2017-06-18   -0.583273
2017-06-20   -0.664884
2017-06-22   -0.302429
dtype: float32

In [157]:
def get_stats(group):
    return {'min': group.min(), 'max': group.max(), 'count': group.count(), 'mean': group.mean()}

In [160]:
ax = targets.BTC_cbrt_dv_T1.plot()
truth.plot(ax=ax)


Out[160]:
<matplotlib.axes._subplots.AxesSubplot at 0x152f8add0>

In [162]:
truth


Out[162]:
Y
date
2015-09-11 -0.259642
2015-09-28 -0.383556
2015-10-06 -0.554653
2015-10-07 -0.604275
2015-10-09 -0.145151
2015-11-05 -0.914925
2015-11-07 -0.438083
2015-11-17 -0.642703
2015-11-19 1.160174
2015-11-30 -0.579157
2015-12-01 -0.614836
2015-12-07 -0.644124
2015-12-09 -1.126319
2015-12-20 0.481569
2015-12-22 -0.221396
2015-12-27 0.479235
2015-12-28 0.577206
2015-12-29 -0.232814
2015-12-30 -0.247279
2016-01-01 -0.499129
2016-01-13 -0.847905
2016-01-17 0.654959
2016-01-18 -0.292878
2016-02-01 -0.681545
2016-02-11 -0.442532
2016-02-17 -0.505129
2016-02-24 -0.522381
2016-02-26 -0.428262
2016-03-06 -0.100507
2016-03-07 -0.400285
... ...
2017-04-26 0.656895
2017-04-28 -0.823393
2017-04-29 -0.028362
2017-05-01 -0.747825
2017-05-02 0.864450
2017-05-04 -0.562096
2017-05-06 0.969665
2017-05-07 1.318211
2017-05-09 -0.804967
2017-05-11 0.411013
2017-05-12 -0.765666
2017-05-13 -0.720597
2017-05-15 -0.331383
2017-05-17 -0.641264
2017-05-19 -0.420860
2017-05-20 -0.485505
2017-05-28 -0.765134
2017-05-30 -0.259362
2017-05-31 0.735030
2017-06-01 -0.850930
2017-06-02 -0.497385
2017-06-03 -0.455789
2017-06-07 -0.695467
2017-06-08 -0.698542
2017-06-10 -0.250419
2017-06-16 -0.638425
2017-06-17 -0.297636
2017-06-18 -0.451001
2017-06-20 0.741618
2017-06-22 -0.648972

252 rows × 1 columns


In [188]:
def drop_bs(indicator, q=0.3):
    Y = Y_all[indicator]
    reg = XGBRegressor(n_estimators=100)
    reg.fit(X_all, Y)
    Y_t = reg.predict(X_all)
    error = abs(Y - Y_t)
    error.hist()
    itruth = error < error.quantile(q)
    X = X_all[itruth]
    Y = Y_all[indicator][itruth]
    G = groups[itruth]
    reg = XGBRegressor(n_estimators=30, max_depth=5)
    scores = cross_val_score(reg, X, Y, cv=4, groups=G, scoring='neg_mean_squared_error')
    print sorted(scores)
    print "MEAN CV SCORE: ", np.mean(scores)
    
    reg = XGBRegressor(n_estimators=100, max_depth=8)
    reg.fit(X,Y)
    Y_ = reg.predict(X)
    agg = pd.Series(Y_).groupby(G)
    truthscore = agg.mean()
    impact_count = agg.count()
    truth_max = agg.max()
    return pd.DataFrame(dict(truthscore=truthscore, impact_count=impact_count, truth_max=truth_max, date=truthscore.index))

In [189]:
dv = drop_bs('BTC_cbrt_dv_T1', 0.4)


[-0.41367598429198782, -0.3770965620906167, -0.35208172797206533, -0.33427650602529679]
MEAN CV SCORE:  -0.369282695095

In [195]:
import json
def to_json(df, path):
    a = []
    for i,d in list(df.iterrows()):
        d = d.to_dict()
        d['date'] = str(d['date'])
        a.append(d) 
    with open(path, 'w') as f:
        json.dump(a, f)
        
to_json(dv, '../bitcoin-daily-bars/out-truth-volume.json')

In [ ]:


In [205]:
impactfull = cc.ix[itruth.index][itruth]

In [207]:
impactfull.head()


Out[207]:
Actions.Agree.Involved Actions.Agree.Total Actions.Comment.Involved Actions.Comment.NextLoad Actions.Comment.Total Actions.Disagree.Involved Actions.Disagree.Total Actions.Report.Involved Actions.Report.Total Body Cryptopian.Avatar Cryptopian.Id Cryptopian.Name Edited Id Notification Sticky ThreadId Timestamp
31 False 0 False 0 2 False 0 False 0 Any chances of the prices going down?? https://images.cryptocompare.com/87220/1e77675... 87220 shivam.truth False 68894 False False 1182 2017-06-22 10:51:44.727
38 False 1 False 3 9 False 0 False 0 how you experts know that btc will reach 3.000... https://images.cryptocompare.com/88732/9cb05f0... 88732 wdkml82 False 68062 False False 1182 2017-06-21 19:31:14.737
39 False 2 False 0 6 False 0 False 0 the end of month BTC price will be $3000-$3500 https://images.cryptocompare.com/132924/f69f8a... 132924 smarthamster_72 False 68034 False False 1182 2017-06-21 18:52:20.900
40 False 0 False 0 3 False 0 False 0 why do people think BTC price can reach $3000 ... https://images.cryptocompare.com/71410/2e5b5be... 71410 sahiaman False 67995 False False 1182 2017-06-21 18:09:30.150
41 False 0 False 0 2 False 0 False 0 Lucky one :(<br/>http://www.cnbc.com/2017/06/2... https://images.cryptocompare.com/71246/2c9bb51... 71246 ratan.shiv False 67977 False False 1182 2017-06-21 17:56:44.017

In [232]:
f = 'Cryptopian.Name'
a = impactfull.groupby(f).size()
b = cc.groupby(f).size()
c = pd.DataFrame(dict(a=a,b=b))
c = c[c.a>1]
c['impact'] = c.a/c.b
c.sort_values('impact', ascending=False)


Out[232]:
a b impact
exportech 2.0 2 1.000000
PtahX 3.0 3 1.000000
35514CC0 3.0 3 1.000000
francisco_averos 3.0 3 1.000000
lion80002 2.0 2 1.000000
leedelouche 2.0 2 1.000000
notNowPlease 3.0 3 1.000000
iyer.sumit 2.0 2 1.000000
noimeereyes 2.0 2 1.000000
bitearner 2.0 2 1.000000
bitdeal 2.0 2 1.000000
bernycheam 2.0 2 1.000000
mybtc92 2.0 2 1.000000
amar32 2.0 2 1.000000
emmanuel.joyy 4.0 4 1.000000
seyamabdulla 3.0 3 1.000000
choudharysameem 6.0 8 0.750000
Gbzl 3.0 4 0.750000
Ethetheth 2.0 3 0.666667
lorenzo.tommasini 2.0 3 0.666667
coin.worxs.4us 2.0 3 0.666667
cryptoman 2.0 3 0.666667
joerx1212 2.0 3 0.666667
tyoumich 2.0 3 0.666667
justaddbitcoin 2.0 3 0.666667
ritthisakdi 2.0 3 0.666667
inggrid.yonata 2.0 3 0.666667
Hefner88 2.0 3 0.666667
theking.eo 4.0 7 0.571429
crrockin 10.0 18 0.555556
... ... ... ...
petmine595 4.0 12 0.333333
jsngpaw 92.0 284 0.323944
berndi75 9.0 28 0.321429
zephyrcat229 7.0 23 0.304348
Mng87 38.0 125 0.304000
RzeroD 6.0 20 0.300000
ZeroCool86 36.0 120 0.300000
webdangler 2.0 7 0.285714
pauldobrero 2.0 7 0.285714
tonyukuk 4.0 14 0.285714
Shila 4.0 14 0.285714
cleanedup 7.0 25 0.280000
cenkg 22.0 80 0.275000
CEX.IO 3.0 11 0.272727
ocmone 6.0 22 0.272727
coinman22 3.0 11 0.272727
csseyah 5.0 19 0.263158
Kanopt 5.0 19 0.263158
najzzz4u 16.0 62 0.258065
jelko 9.0 35 0.257143
konemota 3.0 12 0.250000
cryptocurrency64 2.0 8 0.250000
nowacki86 2.0 8 0.250000
100ydDASH 4.0 16 0.250000
jehzlau 28.0 123 0.227642
creative_bih 22.0 105 0.209524
newworldchaos 8.0 40 0.200000
Dmitryy 2.0 12 0.166667
Bakcraker 2.0 12 0.166667
Mous 2.0 14 0.142857

98 rows × 3 columns


In [225]:


In [ ]:


In [240]:
dv.truthscore.plot()


Out[240]:
<matplotlib.axes._subplots.AxesSubplot at 0x1594690d0>

In [ ]:
target_sc