In [19]:
import pandas as pd
import numpy as np
import re
from sklearn.cluster import DBSCAN, KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline

# Read the pre-cleaned card table and keep every column except the first one,
# selected by position (presumably a saved index column -- TODO confirm).
data = pd.read_csv("cleaned_AllCards.csv").iloc[:, 1:]

# Preview the first rows as the cell's displayed output.
data.head()


Out[19]:
loyalty power starter toughness white black red green blue redCost ... type_Enchantment type_Instant type_Planeswalker type_Scariest type_You'll type_Ever type_See type_Sorcery type_Tribal isLegendary
0 0 0 0 0 0 0 1 1 0 2 ... 1 0 0 0 0 0 0 0 0 0
1 0 0 0 0 1 1 1 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 1 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
3 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
4 0 3 0 2 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 447 columns


In [20]:
# Hold out 20% of the rows and fit a random forest that predicts the
# `isLegendary` column from all remaining columns.
SEED = 42  # fixed so the split, the forest and the matrix below are repeatable

# Drop the label by name rather than by position, so a change in column order
# cannot leak the label into the features.
X = data.drop('isLegendary', axis=1)
y = data['isLegendary']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED)

clf = RandomForestClassifier(n_jobs=-1, n_estimators=150, random_state=SEED)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# confusion_matrix: rows are true classes, columns are predicted classes.
# The call form of print behaves the same on Python 2 and also runs on Python 3.
print(confusion_matrix(y_test, y_pred))


[[2857   19]
 [ 105   23]]

In [21]:
# Density-based clustering of the full table with Manhattan (cityblock)
# distance and otherwise default DBSCAN settings.
# NOTE(review): `data` still contains the `isLegendary` column, so it takes part
# in the distance computation. If every feature is integer-valued (as the
# preview suggests -- confirm), the default eps can only link identical rows.
clust = DBSCAN(metric='cityblock')
clust.fit(data)

# One label per row; -1 marks points DBSCAN treats as noise.
dbscan_labels = clust.labels_

# Displayed output: (distinct labels, number of rows carrying each label).
np.unique(dbscan_labels, return_counts=True)


Out[21]:
(array([ -1,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
         12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
         25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
         38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
         51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
         64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
         77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
         90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
        103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
        116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
        129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
        142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
        155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
        168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180,
        181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193,
        194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206,
        207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
        220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232,
        233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245,
        246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258,
        259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271,
        272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
        285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297,
        298, 299, 300, 301, 302, 303, 304, 305, 306, 307]),
 array([8731,   74,    6,   11,    6,   12,  112,  103,   13,   35,   76,
          65,   28,   23,   35,    5,  224,   30,   78,    5,   42,    5,
         111,   36,  164,   45,   43,   17,   15,   19,    9,    7,    6,
          50,   44,   19,  194,   55,   76,   62,   39,   39,   18,    6,
          51,   22,   29,   74,   41,   18,   35,    6,   57,   91,   12,
           6,    6,   22,    9,   14,   65,    5,   12,  101,   13,    8,
          16,   17,   16,   13,    5,    9,   28,   35,    5,    5,    8,
          40,   21,   33,   22,   26,   44,   17,    5,   38,   27,    9,
           8,   40,   25,   27,   35,   42,   25,    8,    8,   10,   47,
           8,   38,   17,   13,   50,   51,   13,   13,   27,    5,   14,
          45,    5,    9,   27,   51,   43,    6,   50,    5,   54,   69,
          17,   13,   65,   13,   32,    9,   32,   30,   24,    8,    6,
          25,   31,   22,   55,   40,   18,   21,    7,   11,    9,    7,
           7,    7,    6,   22,   22,   12,    9,   20,    5,    9,    5,
           6,   11,   10,    6,    5,    6,    5,   20,   11,   11,    8,
          33,   14,    5,   13,   13,   20,    5,   14,    7,    9,   20,
           8,    6,    6,   31,    5,   12,   15,    9,   19,    5,   14,
          16,    9,    6,   30,    7,   28,    9,    6,    6,   29,   17,
           7,    6,   23,   10,    7,   23,   11,    9,   24,   14,   12,
          29,   14,    5,   47,   18,    5,    5,    9,    5,   42,   12,
           6,   11,    6,   15,    6,   11,    5,    5,    5,    6,    6,
           9,   12,   20,    7,    7,    5,    6,   11,    5,   14,    5,
           9,   11,    6,   11,    7,    5,    6,    5,    7,   10,    5,
          27,    6,    9,   13,    6,    5,    5,    5,   18,   13,    9,
           5,    7,    7,    5,    9,    5,    6,    5,    8,    9,    5,
          11,    7,    6,   12,    6,    8,   26,    5,    9,    6,    6,
          11,    7,    6,    7,    5,    5,    7,    7,   14,    6,    5,
           5,    6,    8,    5,    5,    5,    5,    6,    5,    5,    6,
           5]))

In [22]:
# Partition the full table into three k-means clusters and report their sizes.
# random_state pins the centroid initialisation so the cluster ids and counts
# are the same on every run.
# NOTE(review): `data` still contains the `isLegendary` column, so it takes part
# in the distance computation.
clust = KMeans(n_clusters=3, random_state=42)
kmeans_labels = clust.fit_predict(data)

# Displayed output: (distinct labels, number of rows carrying each label).
np.unique(kmeans_labels, return_counts=True)


Out[22]:
(array([0, 1, 2], dtype=int32), array([5716, 7119, 2183]))

In [ ]: