In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn import tree
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import QuantileTransformer, Normalizer, LabelEncoder
import statsmodels.api as sm
%matplotlib inline
In [2]:
train = pd.read_csv('./../data/training.csv')
In [3]:
label = pd.read_csv('./../data/labels.csv', header=None)
In [4]:
train.head()
Out[4]:
In [5]:
train.drop('Unnamed: 0', axis=1, inplace=True)
In [6]:
train.columns
Out[6]:
In [7]:
train['type'].head()
Out[7]:
In [8]:
# label encode type
le = LabelEncoder()
train['type_enc'] = le.fit_transform(train['type'])
In [9]:
train['type_enc'].value_counts()
Out[9]:
In [10]:
label.columns = ['0', 'p_label2']
In [11]:
label.drop('0', axis=1, inplace=True)
In [12]:
train.dropna(inplace=True)
In [13]:
train.shape, label.shape
Out[13]:
In [14]:
y_label = np.ravel(label)
In [15]:
train.columns
Out[15]:
Note: there is an API that estimates the amount of gas a transaction will use, so gasUsed_t is available as a feature at prediction time.
In [16]:
train.drop(['type',
            'mv',
            'blockTime',
            'difficulty',
            'gasLimit_b',
            'gasUsed_b',
            'reward',
            'size',
            'totalFee',
            'gasShare',
            'gweiPaid',
            'gweiPaid_b',
            'gweiShare',
            'txcnt_second'], axis=1, inplace=True)
In [17]:
train.columns
Out[17]:
In [18]:
train.drop(['free_t',
            'newContract',
            'amount_gwei',
            'type_enc',
            'dayofweek',
            'day'], axis=1, inplace=True)
In [19]:
X = train.values
y = y_label
X_train, X_test, y_train, y_test = train_test_split(X, y)
In [20]:
X_train.shape, y_train.shape
Out[20]:
In [21]:
def rf_regressor(X_train, X_test, y_train, y_test):
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    scores = cross_val_score(rf, X_train, y_train, scoring='r2', cv=5)
    print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
    print('R2_score: {}'.format(r2_score(y_test, y_pred)))
    print('avg_CV_score: {}'.format(np.mean(scores)))
    # write predicted values to csv
    p = pd.DataFrame({'y_pred': y_pred})
    p.to_csv('./../data/label_pred.csv')
    return rf
In [ ]:
model = rf_regressor(X_train, X_test, y_train, y_test)
In [ ]:
def plot_feature_importance(rf, feature_df):
    cols = list(feature_df.columns)
    feat_scores = pd.DataFrame({'Fraction of Samples Affected': rf.feature_importances_},
                               index=cols)
    feat_scores = feat_scores.sort_values(by='Fraction of Samples Affected')
    feat_scores.plot(kind='barh', color='r', figsize=(6, 6))
    #plt.xlabel('Importance', fontsize=18)
    plt.title('Feature Importance', fontsize=18)
    plt.tight_layout()
    plt.savefig('./../images/feat_import_pruned.png', dpi=300)
In [ ]:
plot_feature_importance(model, train)
In [ ]:
y_pred = pd.read_csv('./../data/label_pred.csv')
In [ ]:
y_pred.drop('Unnamed: 0', axis=1, inplace=True)
In [ ]:
y_pred.head()
In [ ]:
y_test.shape
In [ ]:
y_pred = y_pred.values.ravel()
In [ ]:
y_test.shape, y_pred.shape
In [ ]:
result = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
In [ ]:
result.head()
In [ ]:
plt.scatter(result['y_test'], result['y_pred'])
plt.xlim(0,100)
plt.ylim(0,100)
In [ ]:
sns.set(style="darkgrid", color_codes=True)
sns.jointplot(x="y_test", y="y_pred", data=result)
plt.xlim(0, 100)
plt.tight_layout()
plt.savefig('./../images/jointplot.png', dpi=300)
It looks like the model is overfitting
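One way to check this is to compare the forest's in-sample and out-of-sample R^2; a large gap points to overfitting. A minimal sketch, assuming the fitted `model` and the train/test split from above are still in scope:
In [ ]:
# Compare in-sample vs. out-of-sample R^2 for the fitted random forest;
# a large gap between the two suggests overfitting.
train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)
print('train R2: {:.3f}'.format(train_r2))
print('test R2: {:.3f}'.format(test_r2))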
In [ ]:
sns.residplot(result.y_test, result.y_pred, lowess=True, color="g")
In [66]:
def linear_regression(X_train, X_test, y_train, y_test):
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    scores = cross_val_score(lr, X_train, y_train, scoring='r2', cv=5)
    print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
    print('R2_score: {}'.format(r2_score(y_test, y_pred)))
    print('avg_CV_score: {}'.format(np.mean(scores)))
    return lr
In [67]:
linear_regression(X_train, X_test, y_train, y_test)
Out[67]:
In [68]:
# get summary statistics from statsmodels
model = sm.OLS(y_train, X_train)
result = model.fit()
result.summary()
Out[68]:
All features appear to be statistically significant
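The per-feature p-values can also be pulled directly from the fitted results object instead of reading them off the summary table. A small sketch, assuming the column order of `train` still matches `X_train`:
In [ ]:
# Per-feature p-values from the statsmodels OLS fit above;
# values below 0.05 are conventionally read as significant.
pvals = pd.Series(result.pvalues, index=train.columns)
pvals.sort_values()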
In [70]:
from numpy.linalg import matrix_rank
In [84]:
X_train.shape
Out[84]:
In [71]:
matrix_rank(X_train)
Out[71]:
It appears that our feature matrix is rank deficient, indicating substantial collinearity among the features
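One way to see where the collinearity comes from is to scan for highly correlated feature pairs (or compute variance inflation factors). A minimal sketch on the pruned `train` frame; the 0.9 cutoff is an arbitrary threshold chosen for illustration:
In [ ]:
# Flag highly correlated feature pairs in the pruned training frame;
# the 0.9 cutoff is an arbitrary threshold chosen for illustration.
corr = train.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
pairs = upper.stack().sort_values(ascending=False)
pairs[pairs > 0.9]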
In [ ]: