Data Exploration

Team: IBM (Itty Bitty Money)

  • Patrick Handley
  • Bhargavi Madhunala
  • Matt Maffa
  • Antonino Tan-Marcello

Date: 11-2017

This notebook is used to explore & visualize our dataset.


In [1]:
import numpy as np # Linear Alg
import pandas as pd # CSV file I/O & data processing

# Visualization
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import warnings 
from matplotlib import style
from matplotlib.finance import candlestick_ohlc

warnings.filterwarnings("ignore")
# style.use('ggplot')

%matplotlib inline
plt.rcParams['figure.figsize'] = (12.0, 8.0)

from subprocess import check_output


/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The finance module has been deprecated in mpl 2.0 and will be removed in mpl 2.2. Please use the module mpl_finance instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

Dataset

We are using the Cryptocurrency Historical Prices dataset from Kaggle.


In [2]:
input_dir = '../input'
print('File List: \n')
print(check_output(["ls", input_dir]).decode("utf8"))


File List: 

bitcoin_cash_price.csv
bitcoin_dataset.csv
bitcoin_price.csv
bitconnect_price.csv
dash_price.csv
ethereum_classic_price.csv
ethereum_dataset.csv
ethereum_price.csv
iota_price.csv
litecoin_price.csv
monero_price.csv
nem_price.csv
neo_price.csv
numeraire_price.csv
omisego_price.csv
qtum_price.csv
ripple_price.csv
stratis_price.csv
waves_price.csv


In [3]:
currencies = {}

currencies['bitcoin'] = pd.read_csv('{}/bitcoin_price.csv'.format(input_dir), parse_dates=['Date'], index_col=0)
currencies['bitconnect'] = pd.read_csv('{}/bitconnect_price.csv'.format(input_dir), parse_dates=['Date'], index_col=0)
currencies['dash'] = pd.read_csv('{}/dash_price.csv'.format(input_dir), parse_dates=['Date'], index_col=0)
currencies['ethereum'] = pd.read_csv('{}/ethereum_price.csv'.format(input_dir), parse_dates=['Date'], index_col=0)
currencies['iota'] = pd.read_csv('{}/iota_price.csv'.format(input_dir), parse_dates=['Date'], index_col=0)
currencies['litecoin'] = pd.read_csv('{}/litecoin_price.csv'.format(input_dir), parse_dates=['Date'], index_col=0)
currencies['monero'] = pd.read_csv('{}/monero_price.csv'.format(input_dir), parse_dates=['Date'], index_col=0)
currencies['nem'] = pd.read_csv('{}/nem_price.csv'.format(input_dir), parse_dates=['Date'], index_col=0)
currencies['neo'] = pd.read_csv('{}/neo_price.csv'.format(input_dir), parse_dates=['Date'], index_col=0)
currencies['numeraire'] = pd.read_csv('{}/numeraire_price.csv'.format(input_dir), parse_dates=['Date'], index_col=0)
currencies['omisego'] = pd.read_csv('{}/omisego_price.csv'.format(input_dir), parse_dates=['Date'], index_col=0)
currencies['qtum'] = pd.read_csv('{}/qtum_price.csv'.format(input_dir), parse_dates=['Date'], index_col=0)
currencies['ripple'] = pd.read_csv('{}/ripple_price.csv'.format(input_dir), parse_dates=['Date'], index_col=0)
currencies['stratis'] = pd.read_csv('{}/stratis_price.csv'.format(input_dir), parse_dates=['Date'], index_col=0)
currencies['waves'] = pd.read_csv('{}/waves_price.csv'.format(input_dir), parse_dates=['Date'], index_col=0)
# currencies['bitcoin_cash'] = pd.read_csv('{}/bitcoin_cash_price.csv'.format(input_dir))
# currencies['ethereum_data'] = pd.read_csv('{}/ethereum_dataset.csv'.format(input_dir))
# currencies['bitcoin_data'] = pd.read_csv('{}/bitcoin_dataset.csv'.format(input_dir))
# currencies['ethereum_classic'] = pd.read_csv('{}/ethereum_classic_price.csv'.format(input_dir))

print(len(currencies))


15

In [4]:
currencies['bitcoin'].head()


Out[4]:
Open High Low Close Volume Market Cap
Date
2017-09-05 4228.29 4427.84 3998.11 4376.53 2,697,970,000 69,954,400,000
2017-09-04 4591.63 4591.63 4108.40 4236.31 2,987,330,000 75,955,500,000
2017-09-03 4585.27 4714.08 4417.59 4582.96 1,933,190,000 75,841,700,000
2017-09-02 4901.42 4975.04 4469.24 4578.77 2,722,140,000 81,060,600,000
2017-09-01 4701.76 4892.01 4678.53 4892.01 2,599,080,000 77,748,400,000

In [ ]:
print('{} \n'.format(currencies['bitcoin_cash'].head()))
print('{} \n'.format(currencies['ethereum_data'].head()))
print('{} \n'.format(currencies['bitcoin_data'].head()))
print('{} \n'.format(currencies['ethereum_classic'].head()))

In [5]:
for c in currencies:
    print('====================={}============================'.format(c))
    print('Date of newest data: {}'.format(currencies[c].index[0]))
    print('Date of oldest data: {}\n'.format(currencies[c].index[-1]))


=====================bitcoin============================
Date of newest data: 2017-09-05 00:00:00
Date of oldest data: 2013-04-28 00:00:00

=====================bitconnect============================
Date of newest data: 2017-09-05 00:00:00
Date of oldest data: 2017-01-20 00:00:00

=====================dash============================
Date of newest data: 2017-09-05 00:00:00
Date of oldest data: 2014-02-14 00:00:00

=====================ethereum============================
Date of newest data: 2017-09-05 00:00:00
Date of oldest data: 2015-08-07 00:00:00

=====================iota============================
Date of newest data: 2017-09-05 00:00:00
Date of oldest data: 2017-06-13 00:00:00

=====================litecoin============================
Date of newest data: 2017-09-05 00:00:00
Date of oldest data: 2013-04-28 00:00:00

=====================monero============================
Date of newest data: 2017-09-05 00:00:00
Date of oldest data: 2014-05-21 00:00:00

=====================nem============================
Date of newest data: 2017-09-05 00:00:00
Date of oldest data: 2015-04-01 00:00:00

=====================neo============================
Date of newest data: 2017-09-05 00:00:00
Date of oldest data: 2016-09-09 00:00:00

=====================numeraire============================
Date of newest data: 2017-09-05 00:00:00
Date of oldest data: 2017-08-07 00:00:00

=====================omisego============================
Date of newest data: 2017-09-05 00:00:00
Date of oldest data: 2017-08-07 00:00:00

=====================qtum============================
Date of newest data: 2017-09-05 00:00:00
Date of oldest data: 2017-08-07 00:00:00

=====================ripple============================
Date of newest data: 2017-09-05 00:00:00
Date of oldest data: 2013-08-04 00:00:00

=====================stratis============================
Date of newest data: 2017-09-05 00:00:00
Date of oldest data: 2016-08-12 00:00:00

=====================waves============================
Date of newest data: 2017-09-05 00:00:00
Date of oldest data: 2016-06-02 00:00:00


In [6]:
# User select currency of interest for visualization
coin_type = 'bitcoin'
coin_feat = ['Open', 'Close']

currencies[coin_type].head()


Out[6]:
Open High Low Close Volume Market Cap
Date
2017-09-05 4228.29 4427.84 3998.11 4376.53 2,697,970,000 69,954,400,000
2017-09-04 4591.63 4591.63 4108.40 4236.31 2,987,330,000 75,955,500,000
2017-09-03 4585.27 4714.08 4417.59 4582.96 1,933,190,000 75,841,700,000
2017-09-02 4901.42 4975.04 4469.24 4578.77 2,722,140,000 81,060,600,000
2017-09-01 4701.76 4892.01 4678.53 4892.01 2,599,080,000 77,748,400,000

In [7]:
## Need to implement type validation checking mech

plt.plot(currencies[coin_type][coin_feat])
plt.legend(bbox_to_anchor=(1.01, 1))
plt.xlabel('Time(Yr-M)')
plt.ylabel('Value(USD)')
plt.title('{} Price - {}'.format(coin_feat, coin_type))
plt.show()



In [8]:
# Candlestick Graph Visualization

ohlc = currencies[coin_type][coin_feat].resample('10D').ohlc()
ohlc.reset_index(inplace=True)
ohlc['Date'] = ohlc['Date'].map(mdates.date2num)

fig, ax = plt.subplots()

candlestick_ohlc(ax, ohlc.values, width=2, colorup='g')
ax.xaxis_date()

plt.title('Candlestick Chart - {}'.format(coin_type))
plt.xlabel('Time(Yr-M)')
plt.ylabel('Value(USD)')
plt.legend()
plt.show()



In [9]:
ohlc = ['Open', 'High', 'Low', 'Close']

for feat in ohlc:
    plt.plot(currencies[coin_type][feat], label=feat)
# plt.plot(currencies[coin_type]['Close'], label=coin_type)
plt.legend(bbox_to_anchor=(1.01, 1))
plt.xlabel('Time(Yr-M)')
plt.ylabel('Value(USD)')
plt.show()



In [10]:
files_to_use = [
'bitcoin_price.csv',
'bitconnect_price.csv',
'dash_price.csv',
'ethereum_price.csv',
'iota_price.csv',
'litecoin_price.csv',
'monero_price.csv',
'nem_price.csv',
'neo_price.csv',
'numeraire_price.csv',
'omisego_price.csv',
'qtum_price.csv',
'ripple_price.csv',
'stratis_price.csv',
'waves_price.csv']


cols_to_use = []
for ind, file_name in enumerate(files_to_use):
    currency_name = file_name.split("_")[0]
    if ind == 0:
        df = pd.read_csv("../input/"+file_name, usecols=["Date", "Close"], parse_dates=["Date"])
        df.columns = ["Date", currency_name]
    else:
        temp_df = pd.read_csv("../input/"+file_name, usecols=["Date", "Close"], parse_dates=["Date"])
        temp_df.columns = ["Date", currency_name]
        df = pd.merge(df, temp_df, on="Date")
    cols_to_use.append(currency_name)
df.head()
        
temp_df = df[cols_to_use]
corrmat = temp_df.corr(method='spearman')
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corrmat, vmax=1., square=True)
plt.title("Cryptocurrency correlation map", fontsize=15)
plt.show()



In [11]:
# Unnormalized data
for c in currencies:
    plt.plot(currencies[c]['Close'], label=c)
plt.legend(bbox_to_anchor=(1.01, 1))
plt.xlabel('Time(Yr-M)')
plt.ylabel('Value(USD)')
plt.show()



In [12]:
for c in currencies:
    plt.plot(currencies[c]['Close'].iloc[:365], label=c)
plt.legend(bbox_to_anchor=(1.01, 1))
plt.xlabel('Time(Yr-M)')
plt.ylabel('Value(USD)')
plt.show()



In [ ]:


In [ ]: