In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from numpy.linalg import matrix_rank
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer the
# `model_selection` home and fall back only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
%matplotlib inline
In [2]:
# Read the feature table and the label file (the latter is read without a header row).
train = pd.read_csv('./../data/training.csv')
label = pd.read_csv('./../data/labels.csv', header=None)

# Discard the 'Unnamed: 0' column (presumably a saved index -- verify against the CSV).
train = train.drop('Unnamed: 0', axis=1)

# Integer-encode the 'type' column into a new 'type_enc' column.
le = LabelEncoder()
train['type_enc'] = le.fit_transform(train['type'])

# Name the two label columns, keep only 'p_label2', and flatten it to a 1-D array.
label.columns = ['0', 'p_label2']
label = label.drop('0', axis=1)
y_label = label.values.ravel()
In [3]:
# Remove the listed columns (including the original 'type' column, which now
# has an encoded counterpart) from the working frame.
train = train.drop(
    [
        'type',
        'mv',
        'blockTime',
        'difficulty',
        'gasLimit_b',
        'gasUsed_b',
        'reward',
        'size',
        'totalFee',
        'gasShare',
        'gweiPaid',
        'gweiPaid_b',
        'gweiShare',
        'txcnt_second',
    ],
    axis=1
)
In [4]:
# Display the current column labels of `train`.
train.columns
Out[4]:
In [5]:
# Feature columns used for the regression. Each name must appear exactly once:
# a repeated name selects the same column twice, which makes the design matrix
# rank-deficient (perfectly collinear columns).
sub_cols = [
    'avg_blocktime_6',
    'avg_blocktime_60',
    'gasUsed_t',
    'avg_gasUsed_b_6',
    'avg_gasUsed_t_6',
    'avg_tx_count_6',
    'avg_uncle_count_6',
    'avg_difficulty_6',
    'avg_txcnt_second_6',
    'avg_tx_count_60',
    'avg_uncle_count_60',
    'avg_difficulty_60',
    'avg_txcnt_second_60',
    'avg_gasUsed_t_60',
]
In [6]:
# Keep every row, restricted to the columns named in `sub_cols`.
sub_train = train.loc[:, sub_cols]
In [7]:
# Feature matrix and target vector as plain arrays.
X = sub_train.values
y = y_label

# Fix the shuffle seed so the split -- and every metric computed from it --
# is identical after Restart Kernel -> Run All.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE
)
In [8]:
# Show the rank of the feature matrix next to the number of selected columns;
# a rank below the column count indicates linearly dependent features.
(matrix_rank(X), len(sub_cols))
Out[8]:
In [15]:
def linear_regression(X_train, X_test, y_train, y_test, X_all=None, y_all=None, cv=3):
    """Fit a linear regression and report hold-out and cross-validated scores.

    The model is fitted on the training split and evaluated on the test split
    (MSE and R^2 are printed). A cross-validated R^2 is also printed, and a
    scatter plot of actual vs. predicted test targets is drawn.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices of the training and test splits.
    y_train, y_test : array-like
        Targets of the training and test splits.
    X_all, y_all : array-like, optional
        Full data set used for cross-validation. When both are omitted, the
        training and test splits are stacked (train rows first) so the
        function no longer depends on notebook-level globals. Pass them
        explicitly to control the row order seen by the CV splitter.
    cv : int or CV splitter, default 3
        Forwarded to ``cross_val_score``.

    Returns
    -------
    LinearRegression
        The estimator fitted on ``X_train`` / ``y_train``.

    Raises
    ------
    ValueError
        If only one of ``X_all`` / ``y_all`` is supplied.
    """
    if (X_all is None) != (y_all is None):
        raise ValueError('X_all and y_all must be provided together')
    if X_all is None:
        # Rebuild the full sample from the two splits rather than reading
        # whatever `X` / `y` happen to exist in the notebook namespace.
        X_all = np.concatenate([X_train, X_test], axis=0)
        y_all = np.concatenate([y_train, y_test], axis=0)

    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    # cross_val_score clones the estimator, so `lr` stays fitted on X_train.
    scores = cross_val_score(lr, X_all, y_all, scoring='r2', cv=cv)

    print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
    print('R2_score: {}'.format(r2_score(y_test, y_pred)))
    print('avg_CV_score: {}'.format(np.mean(scores)))

    # Actual vs. predicted targets on the hold-out split.
    fig, ax = plt.subplots()
    ax.scatter(y_test, y_pred)
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    ax.set_title('Linear regression: actual vs. predicted')
    return lr
In [16]:
# Fit and evaluate the baseline model; the fitted estimator is the cell output.
linear_regression(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
Out[16]:
In [11]:
# Get summary statistics from statsmodels.
# sm.OLS does not include an intercept by default, so a constant column is
# added explicitly; without it the fit is forced through the origin and the
# summary would describe a different model than LinearRegression() (which
# fits an intercept by default).
X_train_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_const)
result = model.fit()
result.summary()
Out[11]:
In [20]:
# Standardize the feature matrix: learn the per-column statistics from X,
# then apply them to X itself.
sc = StandardScaler()
X_std = sc.fit(X).transform(X)
In [18]:
# Candidate max_depth values (1 through 6) for a decision tree regressor (disabled experiment)
#max_depth_candidates = dict(max_depth=np.arange(1, 7, 1))
# Create a gridsearch object with the decision tree regressor and the max_depth value candidates
#reg = GridSearchCV(estimator=tree.DecisionTreeRegressor(), param_grid=max_depth_candidates)
In [19]:
# 3-fold cross-validated R^2 of a plain linear regression on the standardized features.
std_cv_scores = cross_val_score(LinearRegression(), X_std, y, scoring='r2', cv=3)
mean_std_cv_score = np.mean(std_cv_scores)
print('Mean CV r2_score: {}'.format(mean_std_cv_score))
In [ ]: