notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd
import models.imports.features

import seaborn as sns
import matplotlib.pyplot as plt

## Apply seaborn's default theme so every matplotlib figure below shares one style.
sns.set()



In [2]:

    
## Load the training feature table through the project's import helper.
## The path is relative, so it resolves against the notebook's working directory.

trainsf_ = r'../data/text/bitcoin/train_set.csv'
features_df = models.imports.features.import_file(trainsf_)



In [3]:

    
## Column dtypes and non-null counts: a quick check for missing values after loading.
features_df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 1754 entries, 0 to 1753
Data columns (total 6 columns):
date     1754 non-null object
chibs    1754 non-null float64
hm       1754 non-null float64
is       1754 non-null float64
lr       1754 non-null float64
price    1754 non-null float64
dtypes: float64(5), object(1)
memory usage: 95.9+ KB



In [4]:

    
## Split the loaded frame into model inputs (x) and the target (y).
## NOTE(review): create_dataset is a project helper; judging by the describe()
## outputs, x carries chibs/hm/is/lr and y carries price -- confirm how `date` is handled.

features_set_x, features_set_y = models.imports.features.create_dataset(features_df)



In [5]:

    
## x: inspect the input features (summary statistics and plots)



In [6]:

    
## Summary statistics (count, mean, std, quartiles, min/max) for each input column.
features_set_x.describe()









    Out[6]:







  
    
      
      chibs
      hm
      is
      lr
    
  
  
    
      count
      1754.000000
      1754.000000
      1754.000000
      1754.000000
    
    
      mean
      -56396.219603
      -68234.384726
      -77587.501197
      -56291.299614
    
    
      std
      51940.257460
      60791.875443
      69651.745734
      50620.714335
    
    
      min
      -550446.937040
      -608469.455649
      -690089.181171
      -509062.460009
    
    
      25%
      -74211.183197
      -90299.695851
      -103232.827310
      -74332.227083
    
    
      50%
      -50000.412654
      -60584.970105
      -68693.517044
      -50135.630238
    
    
      75%
      -24743.757304
      -31645.638258
      -35440.577409
      -25263.345517
    
    
      max
      -664.394640
      -727.634661
      -606.808984
      -532.879345



In [7]:

    
## Line plot of every input column over the row index.
## The trailing semicolon keeps the Axes repr out of the cell output.
features_set_x.plot(alpha=.5, title='estimates');









    Out[7]:





<matplotlib.axes._subplots.AxesSubplot at 0x1d240661c88>



In [8]:

    
## Overlaid histograms of the input columns, semi-transparent so overlaps stay visible.
## The trailing semicolon keeps the Axes repr out of the cell output.
features_set_x.plot.hist(bins=100, alpha=.5, title='estimate histograms');









    Out[8]:





<matplotlib.axes._subplots.AxesSubplot at 0x1d2407ae278>



In [9]:

    
## y: inspect the target (price)



In [10]:

    
## Summary statistics for the target; compare mean vs. median to gauge skew.
features_set_y.describe()









    Out[10]:







  
    
      
      price
    
  
  
    
      count
      1754.000000
    
    
      mean
      967.288005
    
    
      std
      2299.649478
    
    
      min
      4.270000
    
    
      25%
      109.132500
    
    
      50%
      375.050000
    
    
      75%
      654.950000
    
    
      max
      19343.040000



In [11]:

    
## train output: compare raw, log, scaled, and log + scaled views of the target



In [12]:

    
## Compare candidate target transforms side by side:
##   norm          - the target as loaded
##   log           - natural log of the target
##   scaled        - target divided by its maximum
##   log + scaled  - log of the target divided by the maximum of that log
log_y = np.log(features_set_y)

# np.log returns a new object, so features_set_y itself is never modified.
transformed_y = log_y / np.max(log_y)

target_views = [
    ('norm', features_set_y),
    ('log', log_y),
    ('scaled', features_set_y / np.max(features_set_y)),
    ('log + scaled', transformed_y),
]

# Explicit figure/axes instead of the pyplot state machine; the wide figure
# keeps four panels in one row legible.
fig, axes = plt.subplots(1, len(target_views), figsize=(16, 4))
for ax, (panel_title, panel_data) in zip(axes, target_views):
    ax.set_title(panel_title)
    ax.plot(panel_data)

fig.tight_layout()
plt.show()



In [13]:

    
## Distribution of the target values.
## The trailing semicolon keeps the Axes repr out of the cell output.
features_set_y.plot.hist(bins=100, title='price histogram');









    Out[13]:





<matplotlib.axes._subplots.AxesSubplot at 0x1d240afb2b0>

	chibs	hm	is	lr
count	1754.000000	1754.000000	1754.000000	1754.000000
mean	-56396.219603	-68234.384726	-77587.501197	-56291.299614
std	51940.257460	60791.875443	69651.745734	50620.714335
min	-550446.937040	-608469.455649	-690089.181171	-509062.460009
25%	-74211.183197	-90299.695851	-103232.827310	-74332.227083
50%	-50000.412654	-60584.970105	-68693.517044	-50135.630238
75%	-24743.757304	-31645.638258	-35440.577409	-25263.345517
max	-664.394640	-727.634661	-606.808984	-532.879345