In [27]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from qsprLib import *  # local helper library; provides buildModel and pcAnalysis, and presumably re-exports the sklearn classes used below

pd.options.display.mpl_style = 'default'
%matplotlib inline

data_id = 21
Xtrain = pd.read_csv('/home/loschen/Desktop/datamining-kaggle/numerai/data/numerai_datasets_'+str(data_id)+'/numerai_training_data.csv')
ytrain = Xtrain['target']
Xtrain.drop(['target'],axis=1,inplace=True)
Xtest = pd.read_csv('/home/loschen/Desktop/datamining-kaggle/numerai/data/numerai_datasets_'+str(data_id)+'/numerai_tournament_data.csv')
Xtest.drop(['t_id'],axis=1,inplace=True)
# adversarial-validation setup: flag each row with its origin (0 = train, 1 = test)
Xtrain['test'] = 0
Xtest['test'] = 1
# test rows first, train rows after; the slicing further down relies on this order
Xall = pd.concat([Xtest, Xtrain], ignore_index=True)
print Xtrain.shape
print Xtest.shape
# the origin flag is the classification target, so remove it from the features
y = Xall['test']
Xall.drop(['test'],axis=1,inplace=True)
Xtrain.drop(['test'],axis=1,inplace=True)
print Xall.head(5)


(96320, 22)
(36069, 22)
   feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0  0.998597  0.962135  0.474424  0.397017  0.747835  0.490444  0.348610   
1  0.640561  0.535377  0.747906  0.058272  0.319811  0.479179  0.715161   
2  0.806886  0.535228  0.233341  0.869270  0.642157  0.380417  0.645031   
3  0.723529  0.511598  0.491868  0.086773  0.233891  0.639297  0.268560   
4  0.981611  0.715084  0.515943  0.330296  0.145273  0.434277  0.946169   

   feature8  feature9  feature10    ...      feature12  feature13  feature14  \
0  0.927024  0.754964   0.601444    ...       0.812625   0.294137   0.634636   
1  0.274235  0.008252   0.002401    ...       0.799245   0.253041   0.729576   
2  0.689936  0.831643   0.912811    ...       0.750655   0.126558   0.263562   
3  0.507851  0.087346   0.055295    ...       0.648740   0.233070   0.382944   
4  0.104733  0.025793   0.124876    ...       0.974505   0.039401   0.702375   

   feature15  feature16  feature17  feature18  feature19  feature20  feature21  
0   0.879167   0.054591   0.587397   0.977265   0.237652   0.685975   0.412006  
1   0.273463   0.977826   0.415618   0.932069   0.309055   0.935144   0.942583  
2   0.534873   0.568492   0.546145   0.687379   0.150463   0.727110   0.553446  
3   0.277411   0.002835   0.294322   0.569867   0.360368   0.022967   0.070465  
4   0.494245   0.301699   0.899162   0.860829   0.488071   0.421172   0.330065  

[5 rows x 21 columns]
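
Before training a classifier on the origin label, a per-feature look at the train/test shift can be useful. This is a minimal sketch (not part of the original run) using scipy's two-sample Kolmogorov-Smirnov test:

In [ ]:
from scipy.stats import ks_2samp
# compare each feature's distribution between the train and the test file
for col in Xtrain.columns:
    stat, pval = ks_2samp(Xtrain[col].values, Xtest[col].values)
    print col, stat, pval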

In [3]:
Xtrain.columns


Out[3]:
Index([u'feature1', u'feature2', u'feature3', u'feature4', u'feature5',
       u'feature6', u'feature7', u'feature8', u'feature9', u'feature10',
       u'feature11', u'feature12', u'feature13', u'feature14', u'feature15',
       u'feature16', u'feature17', u'feature18', u'feature19', u'feature20',
       u'feature21'],
      dtype='object')

In [ ]:
from sklearn.cluster import DBSCAN
# cluster the training data with default parameters (eps=0.5, min_samples=5)
db = DBSCAN().fit(Xtrain)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
print "In core:",db.core_sample_indices_.shape
labels = db.labels_
# DBSCAN labels noise points -1
noise_labels = labels == -1
print "Noise:",noise_labels.sum()

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print "cluster:",n_clusters_
plt.hist(labels)
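
The cell above runs DBSCAN with its defaults. A common way to choose eps is the sorted k-distance plot; this sketch assumes scikit-learn's NearestNeighbors, and the elbow of the resulting curve suggests a reasonable eps:

In [ ]:
from sklearn.neighbors import NearestNeighbors
# distance of every point to its 5th nearest neighbor (min_samples=5 is the DBSCAN default)
nn = NearestNeighbors(n_neighbors=5).fit(Xtrain)
dist, _ = nn.kneighbors(Xtrain)
plt.plot(np.sort(dist[:, -1]))
plt.ylabel('5-NN distance')
plt.show()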

In [28]:
Xtrain['clusters'] = labels
grouped = Xtrain.groupby('clusters')

In [29]:
grouped.groups


Out[29]:
{-1: [120, 1033, 1744, ..., 93452],
 0: [0, 1, 2, 3, ...],
 1: [81, 7634, 9851, ..., 95129],
 2: [128, 11521, 16609, ..., 89766],
 3: [142, 950, 9133, ..., 78003],
 4: [279, 10063, 15699, ..., 91484],
 5: [327, 5508, 8207, ..., 93602],
 6: [467, 3474, 11322, ..., 91875],
 7: [497, 4562, 5454, ..., 92200],
 8: [668, 8992, 11401, ..., 87689],
 9: [686, 18215, 26244, ..., 89600],
 10: [778, 6818, 7469, ..., 75521],
 11: [920, 7447, 14630, ..., 92152],
 12: [934, 2554, 4524, ..., 96215],
 13: [1205, 2217, 3341, ..., 95765],
 14: [1267, 10335, 14240, ..., 93539],
 15: [1310, 1766, 19227, 25600, 40710, 48534, 69074, 71056, 79766, 89939],
 16: [1366, 1857, 4193, ..., 94621],
 17: [1510, 18757, 40999, ..., 95268],
 18: [1537, 6971, 18691, 23760, 28753, 50530, 62193, 78874, 81422, 89810],
 19: [1895, 16218, 36197, 40360, 48358, 55470, 63876, 65267, 75892, 90341],
 20: [1919, 2488, 14108, ..., 95619],
 21: [2858, 10343, 15416, ..., 95515],
 22: [2954, 3161, 26003, 51804, 58147, 62975, 80516, 82562, 89502, 95733],
 23: [3042, 12849, 30938, 30999, 31100, 54598, 60852, 82916, 83903, 95124],
 24: [3050, 6289, 7112, ..., 93288],
 25: [3908, 12064, 29461, ..., 89364],
 26: [4073, 7798, 10712, ..., 87249],
 27: [4239, 5946, 7408, 12365, 29314, 49991, 50350, 55935, 72079, 82849],
 28: [4773, 16903, 22358, ..., 95242],
 29: [4856, 5855, 7870, ..., 78296],
 30: [4938, 23369, 47239, 56920, 58105, 83586, 85785, 89098, 91430, 92199],
 31: [5622, 43258, 49282, ..., 84355],
 32: [6316, 12469, 23525, 24711, 35633, 45766, 47315, 48889, 52559, 77816],
 33: [6575, 15203, 15338, 43936, 49227, 53169, 54450, 58300, 58961, 86485],
 34: [6702, 8408, 14476, ..., 96204],
 35: [8847, 33778, 59417, 68300, 76754, 81814, 82633, 93250, 94251, 95024],
 36: [9538, 17119, 25936, ..., 96301],
 37: [10606, 12283, 15538, ..., 93638],
 38: [10741, 13755, 14820, ..., 90270],
 39: [10847, 19660, 22972, ..., 95471],
 40: [12228, 25012, 40193, 43093, 70978, 73890, 76713, 80221, 86278, 95910],
 41: [12748, 13545, 14955, ..., 94177],
 42: [16086, 18172, 18578, ..., 95301],
 43: [16205, 24239, 25379, ..., 75597],
 44: [19321, 23499, 27603, ..., 93607],
 45: [21380, 25608, 26964, ..., 95394]}
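
The full groups dict above is unwieldy; grouped.size() gives the per-cluster member counts at a glance:

In [ ]:
print grouped.size()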

In [28]:
# adversarial validation: a classifier tries to tell train rows (0) from test rows (1);
# an AUC well above 0.5 means the two sets come from different distributions
cv = StratifiedShuffleSplit(y,n_iter=5,test_size=0.2)

#model = LogisticRegression(C=1.0,penalty='l2')
model = RandomForestClassifier(n_estimators=50)
buildModel(model,Xall,y,cv=cv, scoring='roc_auc', n_jobs=1,trainFull=False,verbose=True)


[CV] no parameters to be set .........................................
[CV] ................................ no parameters to be set -  21.8s
[CV] no parameters to be set .........................................
[CV] ................................ no parameters to be set -  21.1s
[CV] no parameters to be set .........................................
[CV] ................................ no parameters to be set -  21.7s
[CV] no parameters to be set .........................................
[CV] ................................ no parameters to be set -  22.1s
[CV] no parameters to be set .........................................
[CV] ................................ no parameters to be set -  21.6s
cv-score: 0.8212 +/- 0.0029
all scores: array([ 0.82190009,  0.82244834,  0.81610481,  0.82049188,  0.82490862])
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.8min finished
Out[28]:
array([ 0.82190009,  0.82244834,  0.81610481,  0.82049188,  0.82490862])
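
buildModel is a helper from the local qsprLib. An equivalent adversarial-validation check with plain scikit-learn (same py2-era cross_validation API as the StratifiedShuffleSplit call above) would be:

In [ ]:
from sklearn.cross_validation import StratifiedShuffleSplit, cross_val_score
from sklearn.ensemble import RandomForestClassifier
cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2)
model = RandomForestClassifier(n_estimators=50)
scores = cross_val_score(model, Xall.values, y.values, scoring='roc_auc', cv=cv)
print "cv-score:", scores.mean(), "+/-", scores.std()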

In [29]:
model.fit(Xall,y)
Xall['sim'] = model.predict_proba(Xall)[:,1]

Xtrain = Xall[len(Xtest.index):]
Xtest = Xall[:len(Xtest.index)]
print Xtrain['sim'].mean()
print Xtest['sim'].mean()


Xtrain['sim'].hist(bins=30)
Xtest['sim'].hist(bins=30)
y.hist(bins=30)
plt.show()


0.0891605066446
0.775759793729
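
The gap between the two means shows how cleanly the model separates train from test. A possible follow-up, hypothetical and not part of this notebook, is to reuse 'sim' as a sample weight so that test-like training rows count more when fitting a model on the actual target:

In [ ]:
# hypothetical: weight training rows by their similarity to the test set
w = Xtrain['sim'].values
clf = RandomForestClassifier(n_estimators=50)
clf.fit(Xtrain.drop(['sim'], axis=1), ytrain, sample_weight=w)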

In [24]:
# probability that each training row looks like a test row
ypred = model.predict_proba(Xtrain)

In [28]:
# keep training rows that have at least a 10% predicted chance of being test-like
train_mask = ypred[:,1] > 0.1
print train_mask
plt.hist(ypred[:,1],bins=100)


[False  True  True ..., False False False]
Out[28]:
(array([  3.90740000e+04,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   3.56320000e+04,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.59990000e+04,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.51500000e+03,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          9.47000000e+02,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   1.31000000e+02,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   2.10000000e+01,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00]),
 array([ 0.   ,  0.007,  0.014,  0.021,  0.028,  0.035,  0.042,  0.049,
         0.056,  0.063,  0.07 ,  0.077,  0.084,  0.091,  0.098,  0.105,
         0.112,  0.119,  0.126,  0.133,  0.14 ,  0.147,  0.154,  0.161,
         0.168,  0.175,  0.182,  0.189,  0.196,  0.203,  0.21 ,  0.217,
         0.224,  0.231,  0.238,  0.245,  0.252,  0.259,  0.266,  0.273,
         0.28 ,  0.287,  0.294,  0.301,  0.308,  0.315,  0.322,  0.329,
         0.336,  0.343,  0.35 ,  0.357,  0.364,  0.371,  0.378,  0.385,
         0.392,  0.399,  0.406,  0.413,  0.42 ,  0.427,  0.434,  0.441,
         0.448,  0.455,  0.462,  0.469,  0.476,  0.483,  0.49 ,  0.497,
         0.504,  0.511,  0.518,  0.525,  0.532,  0.539,  0.546,  0.553,
         0.56 ,  0.567,  0.574,  0.581,  0.588,  0.595,  0.602,  0.609,
         0.616,  0.623,  0.63 ,  0.637,  0.644,  0.651,  0.658,  0.665,
         0.672,  0.679,  0.686,  0.693,  0.7  ]),
 <a list of 100 Patch objects>)

In [22]:
reducer = KMeans(init='k-means++', n_clusters=7, n_init=3,n_jobs=4)
pcAnalysis(Xtrain, Xtest, y=y, w=None, ncomp=7, transform=False, classification=False,reducer = reducer)


KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=7, n_init=3,
    n_jobs=4, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)
PC analysis for train/test
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-22-3063eccba440> in <module>()
      1 reducer = KMeans(init='k-means++', n_clusters=7, n_init=3,n_jobs=4)
----> 2 pcAnalysis(Xtrain, Xtest, y=y, w=None, ncomp=7, transform=False, classification=False,reducer = reducer)

/home/loschen/calc/amimanera/qsprLib.pyc in pcAnalysis(X, Xtest, y, w, ncomp, transform, classification, reducer)
    925         plt.scatter(X_r[len(Xtest.index):, 0], X_r[len(Xtest.index):, 1], c='r', label="train", alpha=0.5)
    926         plt.scatter(X_r[:len(Xtest.index), 0], X_r[:len(Xtest.index), 1], c='g', label="test", alpha=0.5)
--> 927         print("Total variance:", np.sum(pca.explained_variance_ratio_))
    928         print("Explained variance:", pca.explained_variance_ratio_)
    929         plt.legend()

AttributeError: 'KMeans' object has no attribute 'explained_variance_ratio_'
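
The traceback shows the cause: pcAnalysis assumes a PCA-like reducer that exposes explained_variance_ratio_, which KMeans does not. Re-running with an actual PCA reducer should work; a sketch with the other arguments unchanged:

In [ ]:
from sklearn.decomposition import PCA
reducer = PCA(n_components=7)
pcAnalysis(Xtrain, Xtest, y=y, w=None, ncomp=7, transform=False, classification=False, reducer=reducer)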

In [23]:
# (re-run of the DBSCAN cell above; its printed output follows)

In core: (93872,)
Noise: 804
cluster: 14

In [24]:
plt.hist(labels>0)

print db.core_sample_indices_
print n_clusters_


[    0     1     2 ..., 96317 96318 96319]
14

In [33]:
print db.labels_.shape


(96320,)

In [36]:
# black is reserved for noise points
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        # black used for noise
        col = 'k'

    class_member_mask = (labels == k)

    # plot core samples (large markers) in the plane of the first two features
    xy = Xtrain.values[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)

    # non-core members of each cluster get smaller markers
    xy = Xtrain.values[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()



In [ ]:
from tsne import bh_sne
# 2-D t-SNE embedding of the training data, colored by the target
# (bh_sne expects a plain float64 array, not a DataFrame)
X_2d = bh_sne(Xtrain.values.astype(np.float64))
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=ytrain)
plt.show()
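
bh_sne scales poorly with the number of samples; on ~96k rows a random subsample keeps the embedding tractable (a sketch; the subsample size of 5000 is arbitrary):

In [ ]:
# embed a random subsample instead of all ~96k training rows
idx = np.random.choice(len(Xtrain), 5000, replace=False)
X_2d = bh_sne(Xtrain.values[idx].astype(np.float64))
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=ytrain.values[idx])
plt.show()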