In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
from numpy.linalg import matrix_rank
%matplotlib inline



In [2]:
train = pd.read_csv('./../data/training.csv')
label = pd.read_csv('./../data/labels.csv', header=None)
train.drop('Unnamed: 0', axis=1, inplace=True)  # drop the index column saved with the CSV
# label encode the categorical `type` column
le = LabelEncoder()
train['type_enc'] = le.fit_transform(train['type'])
# keep only the target column and flatten it to a 1-D array
label.columns = ['0', 'p_label2']
label.drop('0', axis=1, inplace=True)
y_label = np.ravel(label)

In [3]:
# drop `type` (now encoded as type_enc) and the raw per-block and fee columns;
# rolling-average versions of several of these are kept as features
train.drop(['type', 
            'mv', 
            'blockTime', 
            'difficulty', 
            'gasLimit_b', 
            'gasUsed_b',
            'reward',
            'size',
            'totalFee',
            'gasShare',
            'gweiPaid',
            'gweiPaid_b',
            'gweiShare',
            'txcnt_second'], axis=1, inplace=True)

Subset of features


In [4]:
train.columns


Out[4]:
Index([u'gasLimit_t', u'gasUsed_t', u'newContract', u'amount_gwei', u'free_t',
       u'day', u'hour', u'dayofweek', u'avg_blocktime_6', u'avg_gasUsed_b_6',
       u'avg_tx_count_6', u'avg_uncle_count_6', u'avg_difficulty_6',
       u'avg_txcnt_second_6', u'avg_gasUsed_t_6', u'avg_price_6',
       u'avg_blocktime_60', u'avg_gasUsed_b_60', u'avg_tx_count_60',
       u'avg_uncle_count_60', u'avg_difficulty_60', u'avg_txcnt_second_60',
       u'avg_gasUsed_t_60', u'avg_price_60', u'type_enc'],
      dtype='object')

In [5]:
sub_cols = [
    'avg_blocktime_6',
    'avg_blocktime_60',
    'gasUsed_t',
    'avg_gasUsed_b_6',
    'avg_gasUsed_t_6',
    'avg_tx_count_6',
    'avg_uncle_count_6',
    'avg_difficulty_6',
    'avg_txcnt_second_6',
    'avg_blocktime_60',   # note: duplicate of the entry above; one source of the rank deficiency seen below
    'avg_tx_count_60',
    'avg_uncle_count_60',
    'avg_difficulty_60',
    'avg_txcnt_second_60',
    'avg_gasUsed_t_60'
]

In [6]:
sub_train = train[sub_cols]

Train/test split


In [7]:
X = sub_train.values
y = y_label
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [8]:
matrix_rank(X), len(sub_cols)


Out[8]:
(3, 15)
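
A rank of 3 against 15 selected columns means all but three of these features are linear combinations of the others; the duplicated 'avg_blocktime_60' entry accounts for one dependency, but most of the collinearity lies elsewhere. A minimal sketch for locating the dependent columns, assuming `sub_train` from In [6] (the 0.95 cutoff is illustrative):

In [ ]:
# positional indexing sidesteps the duplicated 'avg_blocktime_60' label
corr = sub_train.corr().abs().values
cols = list(sub_train.columns)
for i in range(len(cols)):
    for j in range(i + 1, len(cols)):
        if corr[i, j] > 0.95:
            print('{} ~ {} (|r| = {:.3f})'.format(cols[i], cols[j], corr[i, j]))

# singular values near zero mark the linearly dependent directions directly
print(np.linalg.svd(sub_train.values.astype(float), compute_uv=False))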

Linear regression


In [15]:
def linear_regression(X_train, X_test, y_train, y_test):
    """Fit OLS, report held-out and cross-validated scores, and plot predictions."""
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    # note: this scores the *global* X and y, so the CV folds overlap the held-out test set
    scores = cross_val_score(lr, X, y, scoring='r2', cv=3)
    print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
    print('R2_score: {}'.format(r2_score(y_test, y_pred)))
    print('avg_CV_score: {}'.format(np.mean(scores)))
    plt.scatter(y_test, y_pred)
    return lr

In [16]:
linear_regression(X_train, X_test, y_train, y_test)


MSE: 868.175005805
R2_score: 0.0670993034619
avg_CV_score: -8.58833729575
Out[16]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
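
An R² of 0.067 on the held-out set, together with a negative mean CV score, says the fit barely improves on (and under cross-validation loses to) predicting the mean of y. A quick sanity baseline, a sketch with scikit-learn's DummyRegressor:

In [ ]:
from sklearn.dummy import DummyRegressor

# a mean-predicting baseline; its test-set R2 is ~0 by construction
baseline = DummyRegressor(strategy='mean')
baseline.fit(X_train, y_train)
print('baseline MSE: {}'.format(mean_squared_error(y_test, baseline.predict(X_test))))
print('baseline R2:  {}'.format(r2_score(y_test, baseline.predict(X_test))))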

In [11]:
# get summary statistics from statsmodels
# note: sm.OLS does not add an intercept; wrap X_train in sm.add_constant(X_train) to include one
model = sm.OLS(y_train, X_train)
result = model.fit()
result.summary()


Out[11]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.073
Model:                            OLS   Adj. R-squared:                  0.073
Method:                 Least Squares   F-statistic:                 1.174e+04
Date:                Wed, 04 Oct 2017   Prob (F-statistic):               0.00
Time:                        00:59:25   Log-Likelihood:            -5.0691e+06
No. Observations:             1047469   AIC:                         1.014e+07
Df Residuals:                 1047461   BIC:                         1.014e+07
Df Model:                           7
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.1751      0.004     47.259      0.000       0.168       0.182
x2            -0.4256      0.004    -96.551      0.000      -0.434      -0.417
x3         -6.341e-07   1.99e-07     -3.187      0.001   -1.02e-06   -2.44e-07
x4          3.911e-07   3.43e-08     11.397      0.000    3.24e-07    4.58e-07
x5          3.037e-06   6.78e-07      4.476      0.000    1.71e-06    4.37e-06
x6            -0.0121      0.001     -8.523      0.000      -0.015      -0.009
x7         -2.294e-05   1.98e-06    -11.591      0.000   -2.68e-05   -1.91e-05
x8          1.104e-14   4.28e-17    257.806      0.000     1.1e-14    1.11e-14
x9            -0.1209      0.005    -23.824      0.000      -0.131      -0.111
x10           -0.4433      0.005    -96.840      0.000      -0.452      -0.434
x11            0.1150      0.002     71.998      0.000       0.112       0.118
x12            0.0002   1.95e-06     88.583      0.000       0.000       0.000
x13        -4.314e-15   8.56e-17    -50.398      0.000   -4.48e-15   -4.15e-15
x14            0.1179      0.001     80.176      0.000       0.115       0.121
x15         7.573e-05   1.59e-06     47.626      0.000    7.26e-05    7.88e-05
==============================================================================
Omnibus:                  2871297.408   Durbin-Watson:                   2.003
Prob(Omnibus):                  0.000   Jarque-Bera (JB):     142742364310.187
Skew:                          34.570   Prob(JB):                         0.00
Kurtosis:                    1810.149   Cond. No.                     1.32e+22
==============================================================================
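
The enormous condition number (1.32e+22) repeats what the rank check already showed: the design matrix is nearly singular, so the individual coefficients and t-statistics above are not trustworthy. A common per-feature diagnostic is the variance inflation factor; a sketch with statsmodels, assuming `X` and `sub_cols` from above:

In [ ]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF per column; values far above ~10 flag collinear features
# (drop the duplicated column first, or its VIF comes back infinite)
for i, col in enumerate(sub_cols):
    print('{}: {:.1f}'.format(col, variance_inflation_factor(X, i)))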

Cross validation


In [20]:
# Create a scaler object
sc = StandardScaler()

# Fit the scaler to the full feature matrix and transform
# (fitting on all rows leaks fold statistics into CV; see the pipeline sketch below)
X_std = sc.fit_transform(X)
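
Standardizing the full matrix before cross-validating lets each fold see the other folds' means and variances. Wrapping the scaler and the model in a pipeline keeps the scaling inside each fold; a sketch with make_pipeline:

In [ ]:
from sklearn.pipeline import make_pipeline

# the scaler is re-fit on the training rows of every fold, avoiding leakage
pipe = make_pipeline(StandardScaler(), LinearRegression())
print(np.mean(cross_val_score(pipe, X, y, scoring='r2', cv=3)))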

In [18]:
# Candidate values for a decision tree's max_depth hyperparameter
#max_depth_candidates = dict(max_depth=np.arange(1, 7, 1))

# Grid-search a decision tree regressor over the max_depth candidates
# (requires `from sklearn import tree`)
#reg = GridSearchCV(estimator=tree.DecisionTreeRegressor(), param_grid=max_depth_candidates)

In [19]:
print('Mean CV r2_score: {}'.format(np.mean(cross_val_score(LinearRegression(), X_std, y, scoring='r2', cv=3))))


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-19-d8120561a386> in <module>()
----> 1 print('Mean CV r2_score: {}'.format(np.mean(cross_val_score(LinearRegression(), X_std, y, scoring='r2', cv=3))))

//anaconda/lib/python2.7/site-packages/sklearn/model_selection/_validation.pyc in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
    319                                 n_jobs=n_jobs, verbose=verbose,
    320                                 fit_params=fit_params,
--> 321                                 pre_dispatch=pre_dispatch)
    322     return cv_results['test_score']
    323 

//anaconda/lib/python2.7/site-packages/sklearn/model_selection/_validation.pyc in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
    179 
    180     """
--> 181     X, y, groups = indexable(X, y, groups)
    182 
    183     cv = check_cv(cv, y, classifier=is_classifier(estimator))

//anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in indexable(*iterables)
    196         else:
    197             result.append(np.array(X))
--> 198     check_consistent_length(*result)
    199     return result
    200 

//anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_consistent_length(*arrays)
    171     if len(uniques) > 1:
    172         raise ValueError("Found input variables with inconsistent numbers of"
--> 173                          " samples: %r" % [int(l) for l in lengths])
    174 
    175 

ValueError: Found input variables with inconsistent numbers of samples: [1047469, 1396626]
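
The length mismatch (1,047,469 vs. 1,396,626) looks like stale notebook state from running cells out of order: the execution counters show the scaler cell (In [20]) was re-run after this one, so at the time of the call X_std apparently still came from an earlier, train-split-sized matrix, while y holds all 1,396,626 labels. Re-running the scaler cell first clears the error; equivalently, recompute X_std in the same cell:

In [ ]:
# recompute the scaled matrix immediately before scoring so it matches y's length
X_std = StandardScaler().fit_transform(X)
print('Mean CV r2_score: {}'.format(np.mean(cross_val_score(LinearRegression(), X_std, y, scoring='r2', cv=3))))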
