In [1]:
import numpy as np
import pandas as pd
import models.imports.features

import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's default theme so the matplotlib figures in this notebook use seaborn styling.
sns.set()

In [2]:
## Load the training features DataFrame.

# Path is relative to the notebook's working directory.
trainsf_ = r'../data/text/bitcoin/train_set.csv'
# import_file is a project helper; presumably it returns a pandas DataFrame
# (the next cell calls .info() on the result) — TODO confirm.
features_df = models.imports.features.import_file(trainsf_)

In [3]:
# Print a structural summary of the loaded frame: index range, columns, non-null counts and dtypes.
features_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1754 entries, 0 to 1753
Data columns (total 6 columns):
date     1754 non-null object
chibs    1754 non-null float64
hm       1754 non-null float64
is       1754 non-null float64
lr       1754 non-null float64
price    1754 non-null float64
dtypes: float64(5), object(1)
memory usage: 95.9+ KB

In [4]:
## Split the loaded frame into x (model inputs) and y (target).

# create_dataset is a project helper in models.imports.features; presumably x holds the
# estimate columns and y the price column — TODO confirm against the helper.
features_set_x, features_set_y = models.imports.features.create_dataset(features_df)

In [5]:
## x: input features (the chibs, hm, is and lr estimate columns)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for each input column.
features_set_x.describe()


Out[6]:
chibs hm is lr
count 1754.000000 1754.000000 1754.000000 1754.000000
mean -56396.219603 -68234.384726 -77587.501197 -56291.299614
std 51940.257460 60791.875443 69651.745734 50620.714335
min -550446.937040 -608469.455649 -690089.181171 -509062.460009
25% -74211.183197 -90299.695851 -103232.827310 -74332.227083
50% -50000.412654 -60584.970105 -68693.517044 -50135.630238
75% -24743.757304 -31645.638258 -35440.577409 -25263.345517
max -664.394640 -727.634661 -606.808984 -532.879345

In [7]:
# Overlay every column of x as a line on one axes. The trailing semicolon keeps the
# AxesSubplot repr out of the cell output so only the figure is displayed.
features_set_x.plot(alpha=.5, title='estimates');


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d240661c88>

In [8]:
# Overlaid 100-bin histograms of every column of x. The trailing semicolon keeps the
# AxesSubplot repr out of the cell output so only the figure is displayed.
features_set_x.plot.hist(bins=100, alpha=.5, title='estimate histograms');


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d2407ae278>

In [9]:
## y: target (the price column)

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the target.
features_set_y.describe()


Out[10]:
price
count 1754.000000
mean 967.288005
std 2299.649478
min 4.270000
25% 109.132500
50% 375.050000
75% 654.950000
max 19343.040000

In [11]:
## train output: raw, log, scaled and log + scaled views of the target

In [12]:
def plot_target_transforms(target):
    """Plot four candidate representations of ``target`` side by side.

    Panels, left to right:
      * 'norm'         - the values as given
      * 'log'          - natural log of the values
      * 'scaled'       - values divided by their maximum
      * 'log + scaled' - log of the values divided by the maximum of that log

    Parameters
    ----------
    target : pandas.DataFrame or array-like
        Values to plot; must be strictly positive for the log panels.

    Returns
    -------
    The 'log + scaled' data, so the caller can keep it for later use.

    Notes
    -----
    'log + scaled' only lands in (0, 1] when every value exceeds 1, because
    the log must be positive for division by its maximum to preserve sign.
    """
    log_target = np.log(target)  # computed once, shared by two panels
    log_scaled = log_target / np.max(log_target)

    panels = [
        ('norm', target),
        ('log', log_target),
        ('scaled', target / np.max(target)),
        ('log + scaled', log_scaled),
    ]

    # One explicit figure with a 1 x 4 grid of axes, rather than repeated
    # plt.subplot() calls that depend on pyplot's implicit current axes.
    fig, axes = plt.subplots(1, len(panels))
    for ax, (panel_title, values) in zip(axes, panels):
        ax.set_title(panel_title)
        ax.plot(values)

    fig.tight_layout()
    plt.show()
    return log_scaled


# transformed_y stays a notebook-level name, as in the original cell.
transformed_y = plot_target_transforms(features_set_y)



In [13]:
# 100-bin histogram of the target. The trailing semicolon keeps the AxesSubplot
# repr out of the cell output so only the figure is displayed.
features_set_y.plot.hist(bins=100, title='price histogram');


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d240afb2b0>