In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
train_set_path = r'C:\Users\dmpas\thesis\data\network\bitcoin\estimates\train_set.csv'
test_set_path = r'C:\Users\dmpas\thesis\data\network\bitcoin\estimates\test_set.csv'

In [3]:
def ma(price):
    if price < -100:
        return 5
    if price < -50:
        return 4
    if price < 0:
        return 3
    if price < 50:
        return 2
    return 1

In [4]:
train_df = pd.read_csv(train_set_path, index_col=0)
train_df.columns = ['Author', 'bc', 'cc', 'chibs', 'dc', 'ec', 'hm', 'is', 'lr', 'price']
train_df['price_category'] = train_df.price.map(ma)

print(len(train_df))


356557

In [5]:
train_df.head()


Out[5]:
Author bc cc chibs dc ec hm is lr price price_category
0 177412 0.634528 1.239146 -18.257680 1.438233 0.372907 -1325.966306 -22076.378149 -27003.578634 6.9 2
1 53511 0.262976 1.225784 -101.154828 1.062802 0.305459 -498.691400 -7557.808217 -7461.446531 6.9 2
2 156757 0.091405 1.200112 -3.394149 0.757074 0.209725 -237.573484 -3396.268925 -4125.309848 6.9 2
3 100398 0.098763 1.190790 -26.896536 0.670117 0.167796 -252.269969 -3612.483019 -3672.512010 6.9 2
4 166687 0.065896 1.192045 -8.781838 0.503796 0.157702 -117.802576 -1858.142623 -1775.790037 6.9 2

In [6]:
test_df = pd.read_csv(test_set_path, index_col=0)
test_df.columns = ['Author', 'bc', 'cc', 'chibs', 'dc', 'ec', 'hm', 'is', 'lr', 'price']
test_df['price_category'] = train_df.price.map(ma)

print(len(test_df))


88521

In [7]:
test_df.head()


Out[7]:
Author bc cc chibs dc ec hm is lr price price_category
0 99508 0.022297 1.169596 -30.870187 0.502415 0.150627 -222.525649 -2995.403068 -2606.443962 6.9 2
1 177268 0.016294 1.159342 -9.926871 0.429262 0.128385 -73.944996 -1053.014603 -954.976211 6.9 2
2 50584 0.017190 1.161668 16.082362 0.324362 0.094380 -84.019078 -1102.406890 -1055.546625 6.9 2
3 153645 0.008935 1.117261 -12.029111 0.314010 0.091849 -86.938191 -981.997684 -975.776543 6.9 2
4 250893 0.004391 1.115628 -20.254224 0.287785 0.085615 -34.020792 -442.435106 -418.764624 6.9 2

In [8]:
from sklearn import preprocessing

In [9]:
train_set_X = preprocessing.scale(train_df.loc[:, ['bc', 'cc', 'chibs', 'dc', 'ec', 'hm', 'is', 'lr']])
train_set_Y = train_df.loc[:, ['price_category']].values.ravel()

In [10]:
test_set_X = preprocessing.scale(test_df.loc[:, ['bc', 'cc', 'chibs', 'dc', 'ec', 'hm', 'is', 'lr']])
test_set_Y = test_df.loc[:, ['price_category']].values.ravel()

In [11]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

In [12]:
n_estimators = 8

In [13]:
svc1 = OneVsRestClassifier(BaggingClassifier(SVC(kernel='linear'), max_samples=1.0/50, n_estimators=n_estimators))
svc1.fit(train_set_X, train_set_Y)

y_pred_1 = svc1.predict(test_set_X)
print(confusion_matrix(test_set_Y, y_pred_1))


[[ 7885 18471   371  4269   144]
 [ 4817 12209   291  2016   118]
 [ 2485  7860    77  1930    34]
 [    0     0     0     0     0]
 [ 6564 14908   305  3663   104]]

In [ ]:


In [14]:
svc2 = OneVsRestClassifier(BaggingClassifier(SVC(kernel='poly', degree=4), max_samples=1.0/50, n_estimators=n_estimators))
svc2.fit(train_set_X, train_set_Y)
y_pred_2 = svc2.predict(test_set_X)

print(confusion_matrix(test_set_Y, y_pred_2))


[[18453  3784  2822  2297  3784]
 [11236  2629  1793  1355  2438]
 [ 7944  1293   879  1028  1242]
 [    0     0     0     0     0]
 [16216  2626  1907  1833  2962]]

In [ ]:


In [15]:
svc4 = OneVsRestClassifier(BaggingClassifier(SVC(kernel='rbf'), max_samples=1.0/50, n_estimators=n_estimators))
svc4.fit(train_set_X, train_set_Y)
y_pred_4 = svc4.predict(test_set_X)

print(confusion_matrix(test_set_Y, y_pred_4))


[[14610  5656  2322  2365  6187]
 [ 9854  3647  1438  1193  3319]
 [ 5580  1684   965  1338  2819]
 [    0     0     0     0     0]
 [12206  3547  1881  2302  5608]]

In [ ]:


In [16]:
svc5 = OneVsRestClassifier(BaggingClassifier(SVC(kernel='sigmoid'), max_samples=1.0/50, n_estimators=n_estimators))
svc5.fit(train_set_X, train_set_Y)
y_pred_5 = svc5.predict(test_set_X)

print(confusion_matrix(test_set_Y, y_pred_5))


[[26216  1988     0  2827   109]
 [16901  1239     0  1263    48]
 [10551   766     0  1018    51]
 [    0     0     0     0     0]
 [21397  1648     0  2406    93]]

In [ ]:


In [17]:
rfc = RandomForestClassifier(min_samples_leaf=20)
rfc.fit(train_set_X, train_set_Y)

y_pred_6 = rfc.predict(test_set_X)
print(confusion_matrix(test_set_Y, y_pred_6))


[[13250  7652  5042  2703  2493]
 [ 9090  5503  1906  2203   749]
 [ 5503  1850  1091  2625  1317]
 [    0     0     0     0     0]
 [15069  4226  1480  2500  2269]]

In [ ]:


In [ ]: