In this notebook, I build the overall dataframe needed to start the modelling process.
NOTE: The BTC price data was later replaced with data from blockchain.info rather than the Quandl API.
Furthermore, later notebooks add extra series from blockchain.info that may not appear in the code below.
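For reference, the sketch below shows one way daily BTC prices could be pulled from blockchain.info. The charts endpoint, its parameters, and the JSON layout ({"values": [{"x": timestamp, "y": price}]}) are assumptions based on the public API documentation, not the exact code used in later notebooks.

# Hedged sketch: fetch daily BTC market price (USD) from blockchain.info's
# public charts API; endpoint and response shape are assumed from the public
# docs, not taken from the code actually used later.
import requests
import pandas as pd

resp = requests.get("https://api.blockchain.info/charts/market-price",
                    params={"timespan": "120days", "format": "json"})
resp.raise_for_status()
btc = pd.DataFrame(resp.json()["values"])          # columns: x (unix ts), y (price)
btc["date"] = pd.to_datetime(btc["x"], unit="s")   # convert timestamps to dates
btc = btc.rename(columns={"y": "btc_price"})[["date", "btc_price"]]
btc.head()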
In [95]:
%run helper_functions.py
%run filters.py
%run plotly_functions.py
import quandl
from datetime import date
from tabulate import tabulate
from collections import Counter
from IPython.display import Image
import math
import string
%matplotlib inline
plt.rcParams["figure.figsize"] = (15,10)
plt.rcParams["xtick.labelsize"] = 16
plt.rcParams["ytick.labelsize"] = 16
plt.rcParams["axes.labelsize"] = 20
plt.rcParams['legend.fontsize'] = 20
plt.style.use('fivethirtyeight')
pd.set_option('display.max_colwidth', -1)
import plotly.plotly as py
import plotly.graph_objs as go
import spacy
nlp = spacy.load("en")
from nltk.corpus import stopwords  # needed for stopwords.words below
nltk_stopwords = stopwords.words("english") + ["rt", "via", "-»", "--»", "--", "---", "-->", "<--", "->", "<-", "«--", "«", "«-", "»", "«»", " →", "→"]
punc = '#!"%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
In [2]:
tweet_df = unpickle_object("clean_df_NB3_Complete.pkl")
tweet_df.sort_values(by=["date", "hour_of_day"], ascending=False, inplace=True)
tweet_df.reset_index(inplace=True)
del tweet_df['index']
tweet_df['date'] = pd.to_datetime(tweet_df['date'])
tweet_df.head()
Out[2]:
In [3]:
tweet_df.shape
Out[3]:
In [4]:
bitcoin_data = quandl.get("BCHARTS/BITSTAMPUSD", authtoken="Xyrsw1vVBaJx1p1z9dQ5", start_date="2016-10-26", end_date="2017-02-22")
bitcoin_data.drop("Volume (BTC)", axis=1, inplace=True)
bitcoin_data.reset_index(inplace=True)
bitcoin_data.rename(columns={"Date":"date"}, inplace=True)
bitcoin_data['date'] = pd.to_datetime(bitcoin_data['date'])
bitcoin_data.sort_values(by="date", ascending=False, inplace=True)
bitcoin_data.reset_index(inplace=True)
del bitcoin_data['index']
bitcoin_data.head()
Out[4]:
In [5]:
bitcoin_data.shape
Out[5]:
In [6]:
gold_data = quandl.get("LBMA/GOLD", authtoken="Xyrsw1vVBaJx1p1z9dQ5", start_date="2016-10-26", end_date="2017-02-22")
gold_data.drop(["USD (PM)", "GBP (AM)", "GBP (PM)", "EURO (AM)", "EURO (PM)"],axis=1, inplace=True)
gold_data.reset_index(inplace=True)
gold_data.rename(columns={"USD (AM)":"gold_price", "Date":"date"}, inplace=True)
gold_data['date'] = pd.to_datetime(gold_data['date'])
gold_data.sort_values(by="date", ascending=False, inplace=True)
gold_data.reset_index(inplace=True)
del gold_data['index']
gold_data.head()
Out[6]:
In [7]:
gold_data.shape
Out[7]:
In [8]:
eth_data = pd.read_csv("/Users/ibrahimgabr/Downloads/project-5/Data/ethereum_price.csv")
eth_data.rename(columns={"Date":"date"}, inplace=True)
eth_data['date'] = pd.to_datetime(eth_data['date'])
eth_data.head()
eth_data['eth_price'] = (eth_data['Open'] + eth_data['Close'])/2  # midpoint of daily open and close prices
mask = (eth_data['date'] > "2016-10-25") & (eth_data['date'] <= "2017-02-22")
eth_data = eth_data[mask]
eth_data.reset_index(inplace=True)
del eth_data['index']
eth_data.drop(['Open', 'High', 'Low', 'Close', 'Volume', 'Market Cap'], axis=1, inplace=True)
eth_data.head()
Out[8]:
In [9]:
eth_data.head()
Out[9]:
In [11]:
eth_data.shape
Out[11]:
In [12]:
tweet_and_gold = pd.merge(tweet_df, gold_data, on='date', how="outer")
tweet_and_gold.head()
Out[12]:
In [14]:
final_dummy_df = pd.merge(tweet_and_gold, eth_data, on='date', how='outer')
In [15]:
final_dummy_df.head()
Out[15]:
In [22]:
lemmatized_tweets = []
for i in range(final_dummy_df.shape[0]):
    tweet_text = final_dummy_df.iloc[i, 1]  # raw tweet text
    tokenized = nlp(tweet_text)
    whole_tweet = []
    for token in tokenized:
        # skip whitespace, punctuation, stopwords and digits; keep the lemma of everything else
        if token.is_space:
            continue
        elif token.is_punct:
            continue
        elif token.text in nltk_stopwords:
            continue
        elif token.text in punc:
            continue
        elif token.is_stop:
            continue
        elif token.is_digit:
            continue
        else:
            whole_tweet.append(token.lemma_)
    tweet = " ".join(whole_tweet)
    lemmatized_tweets.append(tweet)
In [23]:
for i in lemmatized_tweets[:5]:
    print(i)
    print()
In [24]:
final_dummy_df['lemmatized_tweets'] = lemmatized_tweets
In [25]:
final_dummy_df.head()
Out[25]:
In [26]:
pickle_object(final_dummy_df, "final_dummy_df_V1")
In [2]:
final_df = unpickle_object("final_dummy_df_V1.pkl")
In [3]:
final_df.head()
Out[3]:
In [4]:
final_df['hour_of_day']
Out[4]:
The code block below calculates the sentiment of every tweet in our dataframe.
I use the VADER sentiment library! Documentation here: https://github.com/cjhutto/vaderSentiment
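As a quick standalone illustration (the sample text is made up for this example), polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' keys, where compound is a normalised score in [-1, 1]:

# Minimal VADER example; the tweet text here is invented for illustration.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores("Bitcoin is soaring today!")
# -> dict with keys 'neg', 'neu', 'pos', 'compound'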
In [8]:
sentiment = SentimentIntensityAnalyzer()
In [12]:
positive_sentiment = []
negative_sentiment = []
neutral_sentiment = []
compound_sentiment = []
for i in range(final_df.shape[0]):
    sent_dict = sentiment.polarity_scores(final_df.iloc[i, 5])
    positive_sentiment.append(sent_dict['pos'])
    negative_sentiment.append(sent_dict['neg'])
    neutral_sentiment.append(sent_dict['neu'])
    compound_sentiment.append(sent_dict['compound'])
In [15]:
final_df['pos_sent'] = positive_sentiment
final_df['neg_sent'] = negative_sentiment
final_df['neu_sent'] = neutral_sentiment
final_df['compound_sent'] = compound_sentiment
In [16]:
final_df.head()
Out[16]:
In [18]:
final_df.loc[:, ["pos_sent", "neg_sent", "neu_sent", "compound_sent"]].corr()
Out[18]:
In [104]:
percentage_missing(final_df)
Let's now merge our bitcoin data!
In [138]:
bitcoin_data = quandl.get("BCHARTS/BITSTAMPUSD", authtoken="Xyrsw1vVBaJx1p1z9dQ5", start_date="2016-10-26", end_date="2017-02-22")
bitcoin_data.drop("Volume (BTC)", axis=1, inplace=True)
bitcoin_data.reset_index(inplace=True)
bitcoin_data.rename(columns={"Date":"date"}, inplace=True)
bitcoin_data['date'] = pd.to_datetime(bitcoin_data['date'])
bitcoin_data.sort_values(by="date", ascending=False, inplace=True)
bitcoin_data.reset_index(inplace=True)
del bitcoin_data['index']
bitcoin_data.head()
Out[138]:
In [140]:
complete_df = pd.merge(final_df, bitcoin_data, on='date', how='outer')
In [142]:
complete_df.head()
Out[142]:
In [5]:
features = ["gold_price", "eth_price", "pos_sent", "neg_sent", "neu_sent", "compound_sent", "Open", "High", "Low", "Close", "Volume (Currency)", "Weighted Price"]
In [13]:
modelling_df = complete_df.groupby("date").mean()[features]
modelling_df.reset_index(inplace=True)
modelling_df.sort_values(by='date', inplace=True)
In [14]:
modelling_df.head()
Out[14]:
In [149]:
# pickle_object(complete_df, "final_dummy_df_V2")
In [161]:
# pickle_object(modelling_df, "modelling_df_V1")