In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import QuantileTransformer, Normalizer
import statsmodels.api as sm
%matplotlib inline
In [2]:
num_prev_blocks = 50
In [3]:
df = pd.read_csv('./../data/data_filtered.csv')
In [4]:
df_avg = pd.read_csv('./../data/block_avg_{}.csv'.format(num_prev_blocks))
In [5]:
np.unique(df['block_id'].values).shape
Out[5]:
In [6]:
df.columns
Out[6]:
In [7]:
df_avg.columns
Out[7]:
In [8]:
df.drop('Unnamed: 0', axis=1, inplace=True)
In [9]:
df_avg.drop('Unnamed: 0', axis=1, inplace=True)
In [10]:
df_avg.shape
Out[10]:
In [11]:
df.shape[1] + df_avg.shape[1]
Out[11]:
In [12]:
df_avg.head()
Out[12]:
In [13]:
merged = pd.merge(df, df_avg, left_on='block_id', right_on='blockids')
In [14]:
merged.columns
Out[14]:
In [15]:
merged.shape
Out[15]:
In [16]:
# find null values
for col in merged.columns:
    print(col, merged[col].isnull().sum())
In [17]:
merged.drop('txIndex', axis=1, inplace=True)
In [18]:
merged.dropna(inplace=True)
In [19]:
# confirm no null values remain after dropna
for col in merged.columns:
    print(col, merged[col].isnull().sum())
In [20]:
merged['price_gwei'].hist(bins=2000)
plt.xlim(0,100)
Out[20]:
In [21]:
np.log(merged['price_gwei'].values)
Out[21]:
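The raw gas prices are heavily right-skewed, so the log transform gives a more readable view. A minimal sketch of the log-price histogram (an illustrative addition, not part of the original analysis):
In [ ]:
# log-transformed prices are far less skewed than the raw gwei values
plt.hist(np.log(merged['price_gwei'].values), bins=100)
plt.xlabel('log(price_gwei)')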
In [22]:
plt.scatter(np.log(merged['amount_eth'].values), np.log(merged['price_gwei'].values))
Out[22]:
In [23]:
merged['avg_blocktime'].hist()
Out[23]:
In [24]:
plt.scatter(merged['avg_blocktime'], merged['price_gwei'])
Out[24]:
In [25]:
features = [
'newContract',
'day',
'hour',
'dayofweek',
'amount_eth',
'type_enc',
'avg_price',
'avg_blocktime',
'avg_gasUsed_b',
'avg_tx_count',
'avg_uncle_count',
'avg_difficulty',
'avg_txcnt_second',
'avg_gasUsed_t'
]
X = merged[features].values
y = merged['price_gwei'].values
X_train, X_test, y_train, y_test = train_test_split(X, y)
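train_test_split shuffles and holds out 25% of the rows by default, so scores vary between runs. A minimal sketch of a reproducible split (the seed value is arbitrary):
In [ ]:
# fixing random_state makes the split, and all downstream scores, reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)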
In [26]:
#sklearn quantile transformer
quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=0)
X_train_trans = quantile_transformer.fit_transform(X_train)
X_test_trans = quantile_transformer.transform(X_test)
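With output_distribution='normal', the transformer maps each feature onto a roughly Gaussian shape via its empirical quantiles. A quick sanity check on one continuous column (an illustrative sketch, not part of the original analysis):
In [ ]:
# a continuous feature should look roughly bell-shaped after the transform
i = features.index('avg_price')
plt.hist(X_train_trans[:, i], bins=50)
plt.title('avg_price after quantile transform')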
In [27]:
# normalize each sample to unit L2 norm (Normalizer is stateless, so a
# single instance can transform both splits)
normalizer = Normalizer().fit(X_train)
X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)
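Unlike the quantile transformer, Normalizer rescales each row (sample) to unit L2 norm rather than operating column-wise, which is what makes the cosine-distance model below meaningful. A one-line check (a sketch):
In [ ]:
# every sample should have unit L2 norm after Normalizer
np.allclose(np.linalg.norm(X_train_norm, axis=1), 1.0)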
In [28]:
def linear_regression(X_train, X_test, y_train, y_test):
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    scores = cross_val_score(lr, X_train, y_train, scoring='r2', cv=5)
    print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
    print('R2_score: {}'.format(r2_score(y_test, y_pred)))
    print('avg_CV_score: {}'.format(np.mean(scores)))
    return lr
In [29]:
linear_regression(X_train, X_test, y_train, y_test)
Out[29]:
In [30]:
# get summary statistics from statsmodels (add a constant, since sm.OLS
# does not fit an intercept by default, unlike sklearn's LinearRegression)
model = sm.OLS(y_train, sm.add_constant(X_train))
result = model.fit()
result.summary()
Out[30]:
In [31]:
# map statsmodels coefficient names (x1, x2, ...) back to feature names
for num, col in enumerate(merged[features].columns):
    print(num + 1, col)
In [32]:
mse = [9906, 5689.59, 3902.42, 4946.9, 6474.36, 9032.47, 16197.49]
num_prev = [10, 25, 50, 100, 2000, 3000, 4000]
results = pd.DataFrame({'num_prev_blocks': num_prev, 'mse': mse})
sns.pointplot(x="num_prev_blocks", y="mse", data=results, color='r')
plt.title('Error with respect to number of previous blocks used')
plt.savefig('./../images/mse_prev_blocks.png')
The MSE is lowest when averaging over the previous 50 blocks.
In [33]:
def knn_regressor(X_train, X_test, y_train, y_test):
    # cosine distance requires the brute-force search; tree-based
    # neighbor algorithms do not support it
    model = KNeighborsRegressor(n_neighbors=5, metric='cosine',
                                algorithm='brute', weights='uniform')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('CV score: {} | MSE: {} | R^2: {}'.format(np.mean(cross_val_score(model, X_train, y_train)),
                                                    mean_squared_error(y_test, y_pred),
                                                    r2_score(y_test, y_pred)))
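Since cosine distance compares feature-vector directions, the row-normalized split is the natural input here. A hedged usage sketch (the function was not called in the original, and the choice of the normalized split is an assumption):
In [ ]:
knn_regressor(X_train_norm, X_test_norm, y_train, y_test)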
In [34]:
def rf_regressor(X_train, X_test, y_train, y_test):
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    scores = cross_val_score(rf, X_train, y_train, scoring='r2', cv=5)
    print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
    print('R2_score: {}'.format(r2_score(y_test, y_pred)))
    print('avg_CV_score: {}'.format(np.mean(scores)))
    return rf
In [35]:
model = rf_regressor(X_train, X_test, y_train, y_test)
In [36]:
def plot_feature_importance(rf, feature_df):
    cols = list(feature_df.columns)
    feat_scores = pd.DataFrame({'Fraction of Samples Affected': rf.feature_importances_},
                               index=cols)
    feat_scores = feat_scores.sort_values(by='Fraction of Samples Affected')
    feat_scores.plot(kind='barh', color='r', figsize=(6, 6))
    plt.xlabel('Importance', fontsize=18)
    plt.title('Feature Importance', fontsize=18)
    plt.savefig('./../images/feat_import_50.png')
In [37]:
plot_feature_importance(model, merged[features])
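Impurity-based importances from a random forest can be biased toward high-cardinality numeric features. As a cross-check, permutation importance on the held-out set is a common alternative (a sketch, assuming scikit-learn >= 0.22, which is newer than the imports above):
In [ ]:
# permutation importance: drop in score when a feature's values are shuffled
from sklearn.inspection import permutation_importance
perm = permutation_importance(model, X_test, y_test, n_repeats=5, random_state=0)
for i in perm.importances_mean.argsort()[::-1]:
    print(features[i], round(perm.importances_mean[i], 4))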