In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from numpy.linalg import matrix_rank
from statsmodels.stats.outliers_influence import variance_inflation_factor
%matplotlib inline
In [2]:
# Load features and labels; labels.csv has no header, so column 0 is the old
# index and column 1 the target value.
train = pd.read_csv('./../data/training.csv')
label = pd.read_csv('./../data/labels.csv', header=None)
# Drop the exported index column left over from a previous to_csv()
train.drop('Unnamed: 0', axis=1, inplace=True)
# label encode type
le = LabelEncoder()
train['type_enc'] = le.fit_transform(train['type'])
# Keep only the target column and flatten it to a 1-D array for sklearn
label.columns = ['0', 'p_label2']
label.drop('0', axis=1, inplace=True)
y_label = np.ravel(label)
In [3]:
train.columns
Out[3]:
In [4]:
# Feature subset used for modelling: per-transaction gas fields plus rolling
# block-level aggregates (6- and 60-block windows) and the encoded tx type.
sub_cols = ['gasLimit_t',
'gasUsed_t',
'newContract',
'avg_blocktime_6',
'avg_uncle_count_6',
'avg_txcnt_second_6',
'avg_gasUsed_t_6',
'avg_price_6',
'avg_uncle_count_60',
'avg_price_60',
'mv',
'type_enc']
In [5]:
# Build the design matrix from the selected features and split off a hold-out set.
sub_train = train[sub_cols]
X = sub_train.values
y = y_label
# Fixed random_state so the split — and every score computed downstream —
# is reproducible across kernel restarts; default test_size (25%) unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
In [6]:
matrix_rank(X), len(sub_cols)
Out[6]:
If the VIF is equal to 1 there is no multicollinearity among the predictors; a VIF greater than 1 indicates the predictors may be moderately correlated. As a rule of thumb, a VIF between 5 and 10 indicates high correlation that may be problematic, and if the VIF goes above 10 you can assume the regression coefficients are poorly estimated due to multicollinearity. The cell below computes the VIF for each of the selected features.
In [7]:
# Variance inflation factor for each candidate feature, computed on the full X matrix
for i, col in enumerate(sub_train.columns):
    print('VIF col {}: {}'.format(col,variance_inflation_factor(X,i)))
In [8]:
rf1 = RandomForestRegressor()
In [9]:
rf1.get_params().keys()
Out[9]:
In [10]:
# Create a scaler object
#sc = StandardScaler()
# Fit the scaler to the feature data and transform
#X_std = sc.fit_transform(X)
In [11]:
# Hyper-parameter grid for the random forest: tree depths 1-6 and
# 5/10/15/20 estimators (24 combinations in total).
param_candidates = dict(max_depth=np.arange(1, 7, 1),
n_estimators=np.arange(5, 25, 5))
In [12]:
param_candidates
Out[12]:
In [13]:
# Grid-search a random forest regressor over the max_depth / n_estimators candidates
rf = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_candidates)
In [14]:
# Fit the cross validated grid search on the data
rf.fit(X_train, y_train)
Out[14]:
In [17]:
# Show the best param values
rf.best_estimator_
Out[17]:
In [56]:
# NOTE(review): this runs a fresh 5-fold CV on the TEST partition only — it
# refits the grid search from scratch rather than reusing the fit above.
# Consider cross-validating on the training split and scoring the test set once.
print('Mean CV r2_score: {}'.format(np.mean(cross_val_score(
rf, X_test, y_test, scoring='r2', cv=5))))
In [43]:
# Assemble a train-set results frame: features + true label + model prediction,
# tagged with stage='Train' so it can be concatenated with the test frame below.
results1 = pd.DataFrame(X_train)
results1.columns = sub_cols
results1['label'] = y_train
results1['pred'] = rf.predict(X_train)
results1['stage'] = 'Train'
In [44]:
results1.head()
Out[44]:
In [45]:
# Same as results1 but for the hold-out set, tagged with stage='Test'
results2 = pd.DataFrame(X_test)
results2.columns = sub_cols
results2['label'] = y_test
results2['pred'] = rf.predict(X_test)
results2['stage'] = 'Test'
In [46]:
results2.head()
Out[46]:
In [51]:
results = pd.concat([results1,results2], axis=0)
In [52]:
results.head()
Out[52]:
In [54]:
# Log-transform for plotting; log10 yields -inf / NaN for values <= 0
# (assumes predictions and labels are strictly positive — TODO confirm).
results['logPred']=results.pred.apply(np.log10)
results['logLabel']=results.label.apply(np.log10)
In [72]:
sns.lmplot(x="label", y="pred", hue="stage",truncate=True, size=10, data=results)
Out[72]:
In [76]:
sns.jointplot(x="label", y="pred", size=10, data=results)
Out[76]:
In [55]:
sns.lmplot(x="logLabel", y="logPred", hue="stage",truncate=True, size=10, data=results)
Out[55]:
In [77]:
sns.jointplot(x="logLabel", y="logPred", size=10, data=results)
Out[77]:
In [ ]:
sns.jointplot(x="logLabel", y="logPred",kind="kde", size=10, data=results)
In [57]:
# Signed residuals; relativeError divides by the true label, so it becomes
# inf / NaN for labels equal to zero (assumes labels are nonzero — TODO confirm).
results['absoluteError'] = results.pred-results.label
results['relativeError'] = results.absoluteError/results.label
In [67]:
results.relativeError.hist(bins=np.arange(-1,4,.1))
Out[67]:
In [71]:
results.absoluteError.hist(bins=np.arange(-50,50,1))
Out[71]:
In [73]:
sub_train.columns
Out[73]:
In [65]:
y_pred = rf.predict(X_test)
In [68]:
y_pred.shape, y_test.shape
Out[68]:
In [80]:
results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
In [ ]:
results['y_test_log'] =
In [86]:
plt.scatter(results['y_test'].apply(np.log10), results['y_pred'].apply(np.log10))
#plt.xlim(0,100)
#plt.ylim(0,100)
Out[86]:
In [ ]:
# Joint distribution of predicted vs. true values, saved for the report.
# Fixed: the original sns.jointplot(...) call was missing its closing parenthesis.
sns.set(style="darkgrid", color_codes=True)
sns.jointplot(x="y_test", y="y_pred", data=results, marginal_kws=dict(bins=25))
plt.xlim(0, 100)
plt.tight_layout()
plt.savefig('./../images/jointplot.png', dpi=300)
In [91]:
def rf_regressor(X_train, X_test, y_train, y_test):
    """Fit a default RandomForestRegressor and report its performance.

    Prints test-set MSE and R2 plus the mean 5-fold CV R2 on the training
    split, writes the test predictions to ./../data/label_pred.csv, and
    returns the fitted model.
    """
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    # CV on the training split only; note cross_val_score refits clones of rf
    scores = cross_val_score(rf, X_train, y_train, scoring='r2', cv=5)
    print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
    print('R2_score: {}'.format(r2_score(y_test, y_pred)))
    print('avg_CV_score: {}'.format(np.mean(scores)))
    # write predicted values to csv
    p = pd.DataFrame({'y_pred': y_pred})
    p.to_csv('./../data/label_pred.csv')
    return rf
In [92]:
model = rf_regressor(X_train, X_test, y_train, y_test)
In [ ]:
def plot_feature_importance(rf, feature_df):
    """Plot a horizontal bar chart of the fitted forest's feature importances.

    Bars are labelled with feature_df's column names, sorted ascending, and
    the figure is saved to ./../images/feat_import_pruned.png.
    """
    cols = []
    for col in feature_df.columns:
        cols.append(col)
    # NOTE(review): the label says 'Fraction of Samples Affected', but sklearn's
    # feature_importances_ is mean impurity decrease — confirm the intended label.
    feat_scores = pd.DataFrame({'Fraction of Samples Affected' : rf.feature_importances_},
                               index=cols)
    feat_scores = feat_scores.sort_values(by='Fraction of Samples Affected')
    feat_scores.plot(kind='barh', color='r', figsize=(6,6))
    #plt.xlabel('Importance', fontsize=18)
    plt.title('Feature Importance', fontsize=18)
    plt.tight_layout()
    plt.savefig('./../images/feat_import_pruned.png', dpi=300)
In [93]:
plot_feature_importance(model, sub_train)
In [ ]: