In this notebook, I build the overall dataframe needed to start the modelling process.
NOTE: The BTC price data was later replaced with data from blockchain.info rather than the Quandl API.
Furthermore, later notebooks add extra series from blockchain.info that may not appear in the code below.
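For reference, the sketch below shows one way daily BTC prices could be pulled from blockchain.info. The charts endpoint, its parameters, and the JSON layout ({"values": [{"x": timestamp, "y": price}]}) are assumptions based on the public API documentation, not the exact code used in later notebooks.

# Hedged sketch: fetch daily BTC market price (USD) from blockchain.info's
# public charts API; endpoint and response shape are assumed from the public
# docs, not taken from the code actually used later.
import requests
import pandas as pd

resp = requests.get("https://api.blockchain.info/charts/market-price",
                    params={"timespan": "120days", "format": "json"})
resp.raise_for_status()
btc = pd.DataFrame(resp.json()["values"])          # columns: x (unix ts), y (price)
btc["date"] = pd.to_datetime(btc["x"], unit="s")   # convert timestamps to dates
btc = btc.rename(columns={"y": "btc_price"})[["date", "btc_price"]]
btc.head()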
In [95]:
%run helper_functions.py
%run filters.py
%run plotly_functions.py
import quandl
from datetime import date
from tabulate import tabulate
from collections import Counter
from IPython.display import Image
import math
import string
%matplotlib inline
plt.rcParams["figure.figsize"] = (15,10)
plt.rcParams["xtick.labelsize"] = 16
plt.rcParams["ytick.labelsize"] = 16
plt.rcParams["axes.labelsize"] = 20
plt.rcParams['legend.fontsize'] = 20
plt.style.use('fivethirtyeight')
pd.set_option('display.max_colwidth', -1)
import plotly.plotly as py
import plotly.graph_objs as go
import spacy
nlp = spacy.load("en")
from nltk.corpus import stopwords  # needed for stopwords.words below
nltk_stopwords = stopwords.words("english") + ["rt", "via", "-»", "--»", "--", "---", "-->", "<--", "->", "<-", "«--", "«", "«-", "»", "«»", " →", "→"]
punc = '#!"%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
In [2]:
tweet_df = unpickle_object("clean_df_NB3_Complete.pkl")
tweet_df.sort_values(by=["date", "hour_of_day"], ascending=False, inplace=True)
tweet_df.reset_index(inplace=True)
del tweet_df['index']
tweet_df['date'] = pd.to_datetime(tweet_df['date'])
tweet_df.head()
Out[2]:
In [3]:
tweet_df.shape
Out[3]:
In [4]:
bitcoin_data = quandl.get("BCHARTS/BITSTAMPUSD", authtoken="Xyrsw1vVBaJx1p1z9dQ5", start_date="2016-10-26", end_date="2017-02-22")
bitcoin_data.drop("Volume (BTC)", axis=1, inplace=True)
bitcoin_data.reset_index(inplace=True)
bitcoin_data.rename(columns={"Date":"date"}, inplace=True)
bitcoin_data['date'] = pd.to_datetime(bitcoin_data['date'])
bitcoin_data.sort_values(by="date", ascending=False, inplace=True)
bitcoin_data.reset_index(inplace=True)
del bitcoin_data['index']
bitcoin_data.head()
Out[4]:
In [5]:
bitcoin_data.shape
Out[5]:
In [6]:
gold_data = quandl.get("LBMA/GOLD", authtoken="Xyrsw1vVBaJx1p1z9dQ5", start_date="2016-10-26", end_date="2017-02-22")
gold_data.drop(["USD (PM)", "GBP (AM)", "GBP (PM)", "EURO (AM)", "EURO (PM)"],axis=1, inplace=True)
gold_data.reset_index(inplace=True)
gold_data.rename(columns={"USD (AM)":"gold_price", "Date":"date"}, inplace=True)
gold_data['date'] = pd.to_datetime(gold_data['date'])
gold_data.sort_values(by="date", ascending=False, inplace=True)
gold_data.reset_index(inplace=True)
del gold_data['index']
gold_data.head()
Out[6]:
In [7]:
gold_data.shape
Out[7]:
In [8]:
eth_data = pd.read_csv("/Users/ibrahimgabr/Downloads/project-5/Data/ethereum_price.csv")
eth_data.rename(columns={"Date":"date"}, inplace=True)
eth_data['date'] = pd.to_datetime(eth_data['date'])
eth_data.head()
eth_data['eth_price'] = (eth_data['Open'] + eth_data['Close'])/2  # midpoint of daily open and close prices
mask = (eth_data['date'] > "2016-10-25") & (eth_data['date'] <= "2017-02-22")
eth_data = eth_data[mask]
eth_data.reset_index(inplace=True)
del eth_data['index']
eth_data.drop(['Open', 'High', 'Low', 'Close', 'Volume', 'Market Cap'], axis=1, inplace=True)
eth_data.head()
Out[8]:
In [9]:
eth_data.head()
Out[9]:
In [11]:
eth_data.shape
Out[11]:
In [12]:
tweet_and_gold = pd.merge(tweet_df, gold_data, on='date', how="outer")
tweet_and_gold.head()
Out[12]:
In [14]:
final_dummy_df = pd.merge(tweet_and_gold, eth_data, on='date', how='outer')
In [15]:
final_dummy_df.head()
Out[15]:
In [22]:
lemmatized_tweets = []
for i in range(final_dummy_df.shape[0]):
    tweet_text = final_dummy_df.iloc[i, 1]  # raw tweet text
    tokenized = nlp(tweet_text)
    whole_tweet = []
    for token in tokenized:
        # skip whitespace, punctuation, stopwords and digits; keep the lemma of everything else
        if token.is_space:
            continue
        elif token.is_punct:
            continue
        elif token.text in nltk_stopwords:
            continue
        elif token.text in punc:
            continue
        elif token.is_stop:
            continue
        elif token.is_digit:
            continue
        else:
            whole_tweet.append(token.lemma_)
    tweet = " ".join(whole_tweet)
    lemmatized_tweets.append(tweet)
In [23]:
for i in lemmatized_tweets[:5]:
    print(i)
    print()
In [24]:
final_dummy_df['lemmatized_tweets'] = lemmatized_tweets
In [25]:
final_dummy_df.head()
Out[25]:
In [26]:
pickle_object(final_dummy_df, "final_dummy_df_V1")
In [2]:
final_df = unpickle_object("final_dummy_df_V1.pkl")
In [3]:
final_df.head()
Out[3]:
In [4]:
final_df['hour_of_day']
Out[4]:
The code block below calculates the sentiment of every tweet in our dataframe.
I use the VADER sentiment library! Documentation here: https://github.com/cjhutto/vaderSentiment
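As a quick standalone illustration (the sample text is made up for this example), polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' keys, where compound is a normalised score in [-1, 1]:

# Minimal VADER example; the tweet text here is invented for illustration.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores("Bitcoin is soaring today!")
# -> dict with keys 'neg', 'neu', 'pos', 'compound'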
In [8]:
sentiment = SentimentIntensityAnalyzer()
In [12]:
positive_sentiment = []
negative_sentiment = []
neutral_sentiment = []
compound_sentiment = []
for i in range(final_df.shape[0]):
    sent_dict = sentiment.polarity_scores(final_df.iloc[i, 5])
    positive_sentiment.append(sent_dict['pos'])
    negative_sentiment.append(sent_dict['neg'])
    neutral_sentiment.append(sent_dict['neu'])
    compound_sentiment.append(sent_dict['compound'])
In [15]:
final_df['pos_sent'] = positive_sentiment
final_df['neg_sent'] = negative_sentiment
final_df['neu_sent'] = neutral_sentiment
final_df['compound_sent'] = compound_sentiment
In [16]:
final_df.head()
Out[16]:
In [18]:
final_df.loc[:, ["pos_sent", "neg_sent", "neu_sent", "compound_sent"]].corr()
Out[18]:
In [104]:
percentage_missing(final_df)
Let's now merge our bitcoin data!
In [138]:
bitcoin_data = quandl.get("BCHARTS/BITSTAMPUSD", authtoken="Xyrsw1vVBaJx1p1z9dQ5", start_date="2016-10-26", end_date="2017-02-22")
bitcoin_data.drop("Volume (BTC)", axis=1, inplace=True)
bitcoin_data.reset_index(inplace=True)
bitcoin_data.rename(columns={"Date":"date"}, inplace=True)
bitcoin_data['date'] = pd.to_datetime(bitcoin_data['date'])
bitcoin_data.sort_values(by="date", ascending=False, inplace=True)
bitcoin_data.reset_index(inplace=True)
del bitcoin_data['index']
bitcoin_data.head()
Out[138]:
In [140]:
complete_df = pd.merge(final_df, bitcoin_data, on='date', how='outer')
In [142]:
complete_df.head()
Out[142]:
In [5]:
features = ["gold_price", "eth_price", "pos_sent", "neg_sent", "neu_sent", "compound_sent", "Open", "High", "Low", "Close", "Volume (Currency)", "Weighted Price"]
In [13]:
modelling_df = complete_df.groupby("date").mean()[features]
modelling_df.reset_index(inplace=True)
modelling_df.sort_values(by='date', inplace=True)
In [14]:
modelling_df.head()
Out[14]:
In [149]:
# pickle_object(complete_df, "final_dummy_df_V2")
In [161]:
# pickle_object(modelling_df, "modelling_df_V1")