In this notebook, I add features to the dataframe that help explain the variation in the price of BTC. A multiple regression will then control for each of these features' effects on the price of BTC.
You will see below that much of the data I initially obtained via Quandl has been replaced with data from a website called blockchain.info.
In [38]:
from sklearn.model_selection import train_test_split
%run helper_functions.py
%run filters.py
%run plotly_functions.py
%run master_func.py
%run btc_info_df.py
plt.style.use('fivethirtyeight')
%autosave 120
import quandl
from datetime import date
from tabulate import tabulate
from collections import Counter
from IPython.display import Image
import math
import string
%matplotlib inline
plt.rcParams["figure.figsize"] = (15,10)
plt.rcParams["xtick.labelsize"] = 16
plt.rcParams["ytick.labelsize"] = 16
plt.rcParams["axes.labelsize"] = 20
plt.rcParams['legend.fontsize'] = 20
plt.style.use('fivethirtyeight')
pd.set_option('display.max_colwidth', -1)
import plotly.plotly as py
import plotly.graph_objs as go
import spacy
nlp = spacy.load("en")
from nltk.corpus import stopwords  # explicit import, in case the %run scripts do not already provide it
nltk_stopwords = stopwords.words("english")+["rt", "via","-»","--»","--","---","-->","<--","->","<-","«--","«","«-","»","«»", " →", "→"]
punc = '#!"%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
In this notebook, I will pull additional features from blockchain.info.
I came across blockchain.info, which hosts a plethora of data regarding Bitcoin, so I will be getting all of these additional features from there.
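Throughout this notebook I rely on small helpers loaded via the %run scripts above. For instance, pickle_object and unpickle_object are used to save and restore dataframes; their definitions live in helper_functions.py, but I assume they are thin wrappers around the standard pickle module, roughly like this sketch:

import pickle

def pickle_object(obj, name):
    """Assumed behaviour: serialize obj to <name>.pkl in the working directory."""
    with open(name + ".pkl", "wb") as f:
        pickle.dump(obj, f)

def unpickle_object(filename):
    """Assumed behaviour: load and return the object stored at filename."""
    with open(filename, "rb") as f:
        return pickle.load(f)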
In [2]:
modelling_df = unpickle_object("modelling_df_V1.pkl")
modelling_df.head()
Out[2]:
The above dataframe represents where we started. Moving forward, I will be replacing volume and weighted price.
I will also be dropping Open, High, Low, and Close, as well as compound_sent, which is a univariate sentiment metric; I am concerned with multivariate sentiment metrics.
In [3]:
modelling_df.drop(["Close","High", "Low", "Open", "compound_sent", "Volume (Currency)", "Weighted Price"], axis=1, inplace=True)
In [4]:
modelling_df.head()
Out[4]:
In [5]:
tot_num_trans = pd.read_csv("/Users/ibrahimgabr/Downloads/project-5/Data/blockchain.info/total_num_trans_per_day_btc.csv", header=None)
subset_tot_num_trans = clean_blockchain_csv(tot_num_trans, ["date", "tot_num_trans"])
df1 = pd.merge(modelling_df, subset_tot_num_trans, on='date', how="outer")
df1.head()
Out[5]:
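clean_blockchain_csv is defined in one of the %run scripts. Based on how it is used, I assume it simply names the two headerless columns and parses the first into dates, along these lines (a sketch, not the actual helper):

def clean_blockchain_csv(df, columns):
    """Assumed behaviour: label the two headerless columns and parse the dates."""
    df = df.copy()
    df.columns = columns
    df[columns[0]] = pd.to_datetime(df[columns[0]])
    return df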
In [6]:
num_unique_addr = pd.read_csv("/Users/ibrahimgabr/Downloads/project-5/Data/blockchain.info/unique_address_btc.csv", header=None)
subset_num_unique_addr = clean_blockchain_csv(num_unique_addr, ['date', "unique_addr"])
df2 = pd.merge(df1, subset_num_unique_addr, on='date', how="outer")
df2.head()
Out[6]:
In [7]:
mkt_cap = pd.read_csv("/Users/ibrahimgabr/Downloads/project-5/Data/blockchain.info/market_cap_btc.csv", header=None)
subset_mkt_cap = clean_blockchain_csv(mkt_cap, ['date', "mkt_cap"])
df3 = pd.merge(df2, subset_mkt_cap, on='date', how="outer")
df3.head()
Out[7]:
In [8]:
hash_rate = pd.read_csv("/Users/ibrahimgabr/Downloads/project-5/Data/blockchain.info/hash_rate_btc.csv", header=None)
subset_hash_rate = clean_blockchain_csv(hash_rate, ['date', "hash_rate"])
df4 = pd.merge(df3, subset_hash_rate, on='date', how="outer")
df4.head()
Out[8]:
In [9]:
mempool_trans = pd.read_csv("/Users/ibrahimgabr/Downloads/project-5/Data/blockchain.info/mempool_trans_count_btc.csv", header=None)
subset_mempool_trans = clean_blockchain_csv(mempool_trans, ['date', "mempool_trans"])
subset_mempool_trans['date'] = subset_mempool_trans['date'].apply(lambda x: x.date())  # truncate timestamps to calendar dates
subset_mempool_trans = subset_mempool_trans.groupby("date").sum().reset_index()  # mempool counts are sub-daily, so sum them per day
del subset_mempool_trans['date']
df5 = pd.concat([df4, subset_mempool_trans], axis=1)  # rows happen to align positionally; a merge on 'date' failed, likely due to mismatched date dtypes
df5.head()
Out[9]:
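The positional concat above only works because both frames cover the same days in the same order. My guess is that the merge failed because of a dtype mismatch: after the .apply(lambda x: x.date()) step, the grouped 'date' column holds datetime.date objects, while df4's 'date' column holds pandas Timestamps. A safer alternative, keeping the 'date' column instead of deleting it, would be to normalize the dtypes and merge on the key:

# Sketch: convert the grouped dates back to Timestamps so the keys match, then merge
subset_mempool_trans['date'] = pd.to_datetime(subset_mempool_trans['date'])
df5 = pd.merge(df4, subset_mempool_trans, on='date', how='outer')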
In [10]:
est_USD_tans_val = pd.read_csv("/Users/ibrahimgabr/Downloads/project-5/Data/blockchain.info/estimated-transaction-volume-usd.csv", header=None)
subset_est_USD_tans_val = clean_blockchain_csv(est_USD_tans_val, ['date', "USD_trans_val"])
df6 = pd.merge(df5, subset_est_USD_tans_val, on='date', how="outer")
df6.head()
Out[10]:
In [60]:
mkt_price = pd.read_csv("/Users/ibrahimgabr/Downloads/project-5/Data/blockchain.info/market_price_btc.csv", header=None)
subset_mkt_price = clean_blockchain_csv(mkt_price, ['date', "mkt_price"])
df7 = pd.merge(df6, subset_mkt_price, on="date", how="outer")
df7.head()
Out[60]:
In [62]:
dates_lst = df7['date']  # save the dates so they can be re-attached later
df7.head()
Out[62]:
In [14]:
df7.drop(["date"],axis=1, inplace=True)
features = "+".join(df7.columns[:-1])
y, X = dmatrices('mkt_price ~ ' + features, df7, return_type='dataframe')
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns
vif.round(1) #looks like we are doing great!
Out[14]:
In [15]:
df7.corr()
Out[15]:
We can see from above that mkt_price and mkt_cap are perfectly collinear. We will drop mkt_cap. We will also drop neu_sent as it has an extremely large VIF.
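For reference, the VIF of feature i is defined as 1 / (1 - R²_i), where R²_i is the R-squared from regressing feature i on all the other features: a VIF of 1 means no collinearity at all, values above roughly 10 are commonly flagged as problematic, and perfect collinearity (as with mkt_price and mkt_cap) pushes the VIF toward infinity. The same number can be reproduced by hand with statsmodels; this sketch assumes the design matrix X built in the cell above:

# Manual VIF for one feature: regress column i of X on the remaining columns
i = 1  # position 0 is the patsy intercept, so start at the first real feature
others = [c for c in range(X.shape[1]) if c != i]
r_squared = sm.OLS(X.values[:, i], X.values[:, others]).fit().rsquared
print(1.0 / (1.0 - r_squared))  # should match vif["VIF Factor"][i]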
In [16]:
df7.drop(["neu_sent", "mkt_cap"],axis=1, inplace=True)
features = "+".join(df7.columns[:-1])
y, X = dmatrices('mkt_price ~ ' + features, df7, return_type='dataframe')
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns
vif.round(1)  # after the drops, the VIFs look much healthier
Out[16]:
In [17]:
df7.corr()
Out[17]:
In [18]:
plot_corr_matrix(df7)
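plot_corr_matrix comes from plotly_functions.py. I assume it renders df7.corr() as a heatmap via the Plotly objects imported earlier; a minimal sketch of that idea:

def plot_corr_matrix(df):
    """Assumed behaviour: draw the correlation matrix as a Plotly heatmap."""
    corr = df.corr()
    trace = go.Heatmap(z=corr.values, x=list(corr.columns), y=list(corr.columns))
    return py.iplot([trace])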
In [19]:
df7.head()
Out[19]:
In [21]:
df7.shape
Out[21]:
In [33]:
# pickle_object(df7, "blockchain_info_df")
In [34]:
data = unpickle_object("blockchain_info_df.pkl")
In [63]:
data['date'] = dates_lst
In [64]:
percentage_missing(data)
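percentage_missing is another %run helper. I take it to report the share of missing values per column, roughly:

def percentage_missing(df):
    """Assumed behaviour: print the percentage of NaNs in each column."""
    for col in df.columns:
        pct = df[col].isnull().mean() * 100
        print("{}: {:.2f}%".format(col, pct))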
In [66]:
data.set_index('date', inplace=True)
data.head()
Out[66]: