In [1]:
import scipy
from scipy.io import arff
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score

In [2]:
data, meta = arff.loadarff('yeast/yeast-train.arff')
df = pd.DataFrame(data)

In [39]:
meta


Out[39]:
Dataset: MultiLabelData
	Att1's type is numeric
	Att2's type is numeric
	Att3's type is numeric
	... (Att4 through Att101 omitted; all numeric) ...
	Att102's type is numeric
	Att103's type is numeric
	Class1's type is nominal, range is ('0', '1')
	Class2's type is nominal, range is ('0', '1')
	Class3's type is nominal, range is ('0', '1')
	Class4's type is nominal, range is ('0', '1')
	Class5's type is nominal, range is ('0', '1')
	Class6's type is nominal, range is ('0', '1')
	Class7's type is nominal, range is ('0', '1')
	Class8's type is nominal, range is ('0', '1')
	Class9's type is nominal, range is ('0', '1')
	Class10's type is nominal, range is ('0', '1')
	Class11's type is nominal, range is ('0', '1')
	Class12's type is nominal, range is ('0', '1')
	Class13's type is nominal, range is ('0', '1')
	Class14's type is nominal, range is ('0', '1')

In [5]:
df.head()


Out[5]:
Att1 Att2 Att3 Att4 Att5 Att6 Att7 Att8 Att9 Att10 ... Class5 Class6 Class7 Class8 Class9 Class10 Class11 Class12 Class13 Class14
0 0.093700 0.139771 0.062774 0.007698 0.083873 -0.119156 0.073305 0.005510 0.027523 0.043477 ... 0 0 0 0 0 0 0 0 0 0
1 -0.022711 -0.050504 -0.035691 -0.065434 -0.084316 -0.378560 0.038212 0.085770 0.182613 -0.055544 ... 0 0 1 1 0 0 0 1 1 0
2 -0.090407 0.021198 0.208712 0.102752 0.119315 0.041729 -0.021728 0.019603 -0.063853 -0.053756 ... 0 0 0 0 0 0 0 1 1 0
3 -0.085235 0.009540 -0.013228 0.094063 -0.013592 -0.030719 -0.116062 -0.131674 -0.165448 -0.123053 ... 0 0 0 0 0 0 0 1 1 1
4 -0.088765 -0.026743 0.002075 -0.043819 -0.005465 0.004306 -0.055865 -0.071484 -0.159025 -0.111348 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 117 columns


In [31]:
df.dtypes


Out[31]:
Att1       float64
Att2       float64
Att3       float64
Att4       float64
Att5       float64
Att6       float64
Att7       float64
Att8       float64
Att9       float64
Att10      float64
Att11      float64
Att12      float64
Att13      float64
Att14      float64
Att15      float64
Att16      float64
Att17      float64
Att18      float64
Att19      float64
Att20      float64
Att21      float64
Att22      float64
Att23      float64
Att24      float64
Att25      float64
Att26      float64
Att27      float64
Att28      float64
Att29      float64
Att30      float64
            ...   
Att88      float64
Att89      float64
Att90      float64
Att91      float64
Att92      float64
Att93      float64
Att94      float64
Att95      float64
Att96      float64
Att97      float64
Att98      float64
Att99      float64
Att100     float64
Att101     float64
Att102     float64
Att103     float64
Class1      object
Class2      object
Class3      object
Class4      object
Class5      object
Class6      object
Class7      object
Class8      object
Class9      object
Class10     object
Class11     object
Class12     object
Class13     object
Class14     object
Length: 117, dtype: object

In [19]:
len(df.columns)


Out[19]:
117

In [23]:
labels_df = df.iloc[:, 103:117]   # .ix is deprecated; .iloc slices the 14 Class columns by position
labels_df.head()


Out[23]:
Class1 Class2 Class3 Class4 Class5 Class6 Class7 Class8 Class9 Class10 Class11 Class12 Class13 Class14
0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 1 1 0 0 0 1 1 0
2 0 1 1 0 0 0 0 0 0 0 0 1 1 0
3 0 0 1 1 0 0 0 0 0 0 0 1 1 1
4 1 1 0 0 0 0 0 0 0 0 0 0 0 0

In [33]:
# loadarff returns the nominal Class labels as byte strings (b'0'/b'1'),
# which numpy can parse straight to floats
for i in range(1, 15):
    class_name = 'Class' + str(i)
    labels_df[class_name] = labels_df[class_name].astype('float64')

labels_df.dtypes


Out[33]:
Class1     float64
Class2     float64
Class3     float64
Class4     float64
Class5     float64
Class6     float64
Class7     float64
Class8     float64
Class9     float64
Class10    float64
Class11    float64
Class12    float64
Class13    float64
Class14    float64
dtype: object

In [35]:
# Check the pairwise correlations between labels
# (equivalent to the one-liner labels_df.corr())

labels_df.apply(lambda s: labels_df.corrwith(s))


Out[35]:
Class1 Class2 Class3 Class4 Class5 Class6 Class7 Class8 Class9 Class10 Class11 Class12 Class13 Class14
Class1 1.000000 0.507232 -0.309405 -0.313830 -0.257625 -0.016337 -0.040895 -0.038894 0.005224 -0.116104 -0.113541 -0.142461 -0.131753 -0.081242
Class2 0.507232 1.000000 0.216886 -0.334276 -0.285610 -0.100926 -0.121585 -0.076037 0.053730 0.042503 -0.020448 -0.112380 -0.101452 -0.103496
Class3 -0.309405 0.216886 1.000000 0.435189 -0.310254 -0.284435 -0.240820 -0.180264 -0.005892 0.043173 -0.039968 0.048252 0.056433 0.146345
Class4 -0.313830 -0.334276 0.435189 1.000000 0.151685 -0.330798 -0.292469 -0.280890 -0.122692 -0.149018 -0.182061 0.199855 0.208159 0.160734
Class5 -0.257625 -0.285610 -0.310254 0.151685 1.000000 0.521967 -0.068450 -0.055528 -0.028495 -0.105581 -0.139253 0.119842 0.130416 -0.039531
Class6 -0.016337 -0.100926 -0.284435 -0.330798 0.521967 1.000000 0.426211 0.284887 0.008103 -0.022680 0.104809 0.045314 0.027331 -0.056094
Class7 -0.040895 -0.121585 -0.240820 -0.292469 -0.068450 0.426211 1.000000 0.770091 0.006746 -0.062572 0.096359 0.096621 0.071787 -0.054690
Class8 -0.038894 -0.076037 -0.180264 -0.280890 -0.055528 0.284887 0.770091 1.000000 0.445439 -0.060176 0.019236 0.030030 0.034797 -0.043823
Class9 0.005224 0.053730 -0.005892 -0.122692 -0.028495 0.008103 0.006746 0.445439 1.000000 0.143646 -0.023400 -0.123043 -0.116908 -0.008542
Class10 -0.116104 0.042503 0.043173 -0.149018 -0.105581 -0.022680 -0.062572 -0.060176 0.143646 1.000000 0.781024 -0.025295 -0.028544 -0.041319
Class11 -0.113541 -0.020448 -0.039968 -0.182061 -0.139253 0.104809 0.096359 0.019236 -0.023400 0.781024 1.000000 0.046081 -0.001408 -0.046468
Class12 -0.142461 -0.112380 0.048252 0.199855 0.119842 0.045314 0.096621 0.030030 -0.123043 -0.025295 0.046081 1.000000 0.979002 0.002733
Class13 -0.131753 -0.101452 0.056433 0.208159 0.130416 0.027331 0.071787 0.034797 -0.116908 -0.028544 -0.001408 0.979002 1.000000 0.004889
Class14 -0.081242 -0.103496 0.146345 0.160734 -0.039531 -0.056094 -0.054690 -0.043823 -0.008542 -0.041319 -0.046468 0.002733 0.004889 1.000000
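
To make the strongest relationships easy to spot, you can pull the top off-diagonal pairs out of this matrix. A small helper sketch (illustrative, plain pandas/numpy):

corr = labels_df.corr()
# keep only the upper triangle (k=1 excludes the diagonal), then rank the pairs
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
top_pairs = corr.where(mask).stack().sort_values(ascending=False)
top_pairs.head()   # e.g. (Class12, Class13) ~ 0.979 stands out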

In [1]:
# You can also generate a random multi-label dataset of your own:
# sparse: if True, return the feature matrix X in sparse format
# n_labels: the average number of labels per instance
# return_indicator: if 'sparse', return Y in the sparse binary indicator format
# allow_unlabeled: if True, some instances may not belong to any class

from sklearn.datasets import make_multilabel_classification

X, y = make_multilabel_classification(sparse=True, n_labels=7,
                                      return_indicator='sparse',
                                      allow_unlabeled=False)

In [4]:
# pd.SparseDataFrame was removed in pandas 1.0; on a modern pandas,
# build a sparse-backed DataFrame straight from the scipy matrix instead
matrix_df = pd.DataFrame.sparse.from_spmatrix(X)
matrix_df.head()


Out[4]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
0 2.0 1.0 1.0 6.0 0.0 4.0 2.0 4.0 3.0 1.0 1.0 5.0 3.0 4.0 4.0 3.0 3.0 4.0 2.0 3.0
1 1.0 3.0 2.0 2.0 3.0 3.0 3.0 6.0 1.0 2.0 2.0 0.0 1.0 5.0 3.0 1.0 3.0 1.0 2.0 4.0
2 0.0 3.0 3.0 2.0 2.0 6.0 1.0 1.0 4.0 1.0 2.0 0.0 1.0 3.0 5.0 2.0 2.0 5.0 3.0 1.0
3 5.0 3.0 1.0 3.0 3.0 2.0 2.0 4.0 2.0 1.0 1.0 4.0 0.0 0.0 2.0 5.0 3.0 4.0 2.0 1.0
4 1.0 6.0 4.0 2.0 7.0 5.0 1.0 7.0 3.0 1.0 0.0 0.0 4.0 1.0 2.0 0.0 3.0 2.0 5.0 1.0
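
The sparse label matrix y can be previewed the same way; a densifying one-liner (by default make_multilabel_classification emits n_classes=5 label columns):

pd.DataFrame(y.toarray()).head()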

In [2]:
# Start Multi-Label Experiments From HERE

training_data, train_meta = arff.loadarff('yeast/yeast-train.arff')
train_df = pd.DataFrame(training_data)

testing_data, test_meta = arff.loadarff('yeast/yeast-test.arff')
test_df = pd.DataFrame(testing_data)

In [3]:
train_df.head()


Out[3]:
Att1 Att2 Att3 Att4 Att5 Att6 Att7 Att8 Att9 Att10 ... Class5 Class6 Class7 Class8 Class9 Class10 Class11 Class12 Class13 Class14
0 0.093700 0.139771 0.062774 0.007698 0.083873 -0.119156 0.073305 0.005510 0.027523 0.043477 ... 0 0 0 0 0 0 0 0 0 0
1 -0.022711 -0.050504 -0.035691 -0.065434 -0.084316 -0.378560 0.038212 0.085770 0.182613 -0.055544 ... 0 0 1 1 0 0 0 1 1 0
2 -0.090407 0.021198 0.208712 0.102752 0.119315 0.041729 -0.021728 0.019603 -0.063853 -0.053756 ... 0 0 0 0 0 0 0 1 1 0
3 -0.085235 0.009540 -0.013228 0.094063 -0.013592 -0.030719 -0.116062 -0.131674 -0.165448 -0.123053 ... 0 0 0 0 0 0 0 1 1 1
4 -0.088765 -0.026743 0.002075 -0.043819 -0.005465 0.004306 -0.055865 -0.071484 -0.159025 -0.111348 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 117 columns


In [4]:
test_df.head()


Out[4]:
Att1 Att2 Att3 Att4 Att5 Att6 Att7 Att8 Att9 Att10 ... Class5 Class6 Class7 Class8 Class9 Class10 Class11 Class12 Class13 Class14
0 0.004168 -0.170975 -0.156748 -0.142151 0.058781 0.026851 0.197719 0.041850 0.066938 -0.056617 ... 0 0 1 1 0 0 0 1 1 0
1 -0.103956 0.011879 -0.098986 -0.054501 -0.007970 0.049113 -0.030580 -0.077933 -0.080529 -0.016267 ... 0 0 0 0 0 0 0 0 0 0
2 0.509949 0.401709 0.293799 0.087714 0.011686 -0.006411 -0.006255 0.013646 -0.040666 -0.024447 ... 0 0 0 0 0 0 0 1 1 0
3 0.119092 0.004412 -0.002262 0.072254 0.044512 -0.051467 0.074686 -0.007670 0.079438 0.062184 ... 0 0 0 0 0 0 0 0 0 0
4 0.042037 0.007054 -0.069483 0.081015 -0.048207 0.089446 -0.004947 0.064456 -0.133387 0.068878 ... 1 1 0 0 0 0 0 0 0 0

5 rows × 117 columns


In [5]:
X_train = train_df.iloc[:,:103]
Y_train = train_df.iloc[:,103:]

X_test = test_df.iloc[:,:103]
Y_test = test_df.iloc[:,103:]

In [6]:
# cast the byte-string labels (b'0'/b'1') to numeric
for i in range(1, 15):
    class_name = 'Class' + str(i)
    Y_train[class_name] = Y_train[class_name].astype('float64')
    
Y_train.dtypes


Out[6]:
Class1     float64
Class2     float64
Class3     float64
Class4     float64
Class5     float64
Class6     float64
Class7     float64
Class8     float64
Class9     float64
Class10    float64
Class11    float64
Class12    float64
Class13    float64
Class14    float64
dtype: object

In [7]:
# same cast for the test labels
for i in range(1, 15):
    class_name = 'Class' + str(i)
    Y_test[class_name] = Y_test[class_name].astype('float64')
    
Y_test.dtypes


Out[7]:
Class1     float64
Class2     float64
Class3     float64
Class4     float64
Class5     float64
Class6     float64
Class7     float64
Class8     float64
Class9     float64
Class10    float64
Class11    float64
Class12    float64
Class13    float64
Class14    float64
dtype: object

In [27]:
X_test.head()


Out[27]:
Att1 Att2 Att3 Att4 Att5 Att6 Att7 Att8 Att9 Att10 ... Att94 Att95 Att96 Att97 Att98 Att99 Att100 Att101 Att102 Att103
0 0.004168 -0.170975 -0.156748 -0.142151 0.058781 0.026851 0.197719 0.041850 0.066938 -0.056617 ... 0.006166 -0.012976 -0.014259 -0.015024 -0.010747 0.000411 -0.032056 -0.018312 0.030126 0.124722
1 -0.103956 0.011879 -0.098986 -0.054501 -0.007970 0.049113 -0.030580 -0.077933 -0.080529 -0.016267 ... 0.007680 0.027719 -0.085811 0.111123 0.050541 0.027565 -0.063569 -0.041471 -0.079758 0.017161
2 0.509949 0.401709 0.293799 0.087714 0.011686 -0.006411 -0.006255 0.013646 -0.040666 -0.024447 ... 0.096277 -0.044932 -0.089470 -0.009162 -0.012010 0.308378 -0.028053 0.026710 -0.066565 -0.122352
3 0.119092 0.004412 -0.002262 0.072254 0.044512 -0.051467 0.074686 -0.007670 0.079438 0.062184 ... -0.083809 0.200354 -0.075716 0.196605 0.152758 -0.028484 -0.074207 -0.089227 -0.049913 -0.043893
4 0.042037 0.007054 -0.069483 0.081015 -0.048207 0.089446 -0.004947 0.064456 -0.133387 0.068878 ... -0.060467 0.044351 -0.057209 0.028047 0.029661 -0.050026 0.023248 -0.061539 -0.035160 0.067834

5 rows × 103 columns


In [28]:
Y_test.head()


Out[28]:
Class1 Class2 Class3 Class4 Class5 Class6 Class7 Class8 Class9 Class10 Class11 Class12 Class13 Class14
0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0
1 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0
3 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

In [29]:
# Method 1 - Problem Transformation - Binary Relevance
## Simply treat each label independently: fit one binary classifier per label

from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

classifier = BinaryRelevance(GaussianNB())

# train
classifier.fit(X_train, Y_train)

# predict
predictions = classifier.predict(X_test)

# note: on multi-label output, accuracy_score is subset (exact-match) accuracy,
# so all 14 labels must be correct for a row to count as a hit
accuracy_score(Y_test,predictions)   # 0.10359869138495092, very low accuracy


Out[29]:
0.10359869138495092
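
For intuition, Binary Relevance can be reproduced with plain scikit-learn by fitting one independent GaussianNB per label column and stacking the predictions. A minimal sketch of the idea, not skmultilearn's internals (reuses X_train/Y_train from above):

from sklearn.naive_bayes import GaussianNB

# one independent binary classifier per label, ignoring label correlations
per_label_models = {col: GaussianNB().fit(X_train, Y_train[col]) for col in Y_train.columns}
manual_preds = np.column_stack([per_label_models[col].predict(X_test) for col in Y_train.columns])
accuracy_score(Y_test, manual_preds)   # should match BinaryRelevance(GaussianNB()) above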

In [30]:
# Method 1 - Problem Transformation - Classifier Chains
## Each round predicts one label column, then appends that prediction to the feature columns for the next round

## For this method, higher correlation between labels tends to yield higher accuracy,
## so check the label correlations first, as done here (equivalent to Y_train.corr())

Y_train.apply(lambda s: Y_train.corrwith(s))


Out[30]:
Class1 Class2 Class3 Class4 Class5 Class6 Class7 Class8 Class9 Class10 Class11 Class12 Class13 Class14
Class1 1.000000 0.507232 -0.309405 -0.313830 -0.257625 -0.016337 -0.040895 -0.038894 0.005224 -0.116104 -0.113541 -0.142461 -0.131753 -0.081242
Class2 0.507232 1.000000 0.216886 -0.334276 -0.285610 -0.100926 -0.121585 -0.076037 0.053730 0.042503 -0.020448 -0.112380 -0.101452 -0.103496
Class3 -0.309405 0.216886 1.000000 0.435189 -0.310254 -0.284435 -0.240820 -0.180264 -0.005892 0.043173 -0.039968 0.048252 0.056433 0.146345
Class4 -0.313830 -0.334276 0.435189 1.000000 0.151685 -0.330798 -0.292469 -0.280890 -0.122692 -0.149018 -0.182061 0.199855 0.208159 0.160734
Class5 -0.257625 -0.285610 -0.310254 0.151685 1.000000 0.521967 -0.068450 -0.055528 -0.028495 -0.105581 -0.139253 0.119842 0.130416 -0.039531
Class6 -0.016337 -0.100926 -0.284435 -0.330798 0.521967 1.000000 0.426211 0.284887 0.008103 -0.022680 0.104809 0.045314 0.027331 -0.056094
Class7 -0.040895 -0.121585 -0.240820 -0.292469 -0.068450 0.426211 1.000000 0.770091 0.006746 -0.062572 0.096359 0.096621 0.071787 -0.054690
Class8 -0.038894 -0.076037 -0.180264 -0.280890 -0.055528 0.284887 0.770091 1.000000 0.445439 -0.060176 0.019236 0.030030 0.034797 -0.043823
Class9 0.005224 0.053730 -0.005892 -0.122692 -0.028495 0.008103 0.006746 0.445439 1.000000 0.143646 -0.023400 -0.123043 -0.116908 -0.008542
Class10 -0.116104 0.042503 0.043173 -0.149018 -0.105581 -0.022680 -0.062572 -0.060176 0.143646 1.000000 0.781024 -0.025295 -0.028544 -0.041319
Class11 -0.113541 -0.020448 -0.039968 -0.182061 -0.139253 0.104809 0.096359 0.019236 -0.023400 0.781024 1.000000 0.046081 -0.001408 -0.046468
Class12 -0.142461 -0.112380 0.048252 0.199855 0.119842 0.045314 0.096621 0.030030 -0.123043 -0.025295 0.046081 1.000000 0.979002 0.002733
Class13 -0.131753 -0.101452 0.056433 0.208159 0.130416 0.027331 0.071787 0.034797 -0.116908 -0.028544 -0.001408 0.979002 1.000000 0.004889
Class14 -0.081242 -0.103496 0.146345 0.160734 -0.039531 -0.056094 -0.054690 -0.043823 -0.008542 -0.041319 -0.046468 0.002733 0.004889 1.000000

In [31]:
Y_test.apply(lambda s: Y_test.corrwith(s))


Out[31]:
Class1 Class2 Class3 Class4 Class5 Class6 Class7 Class8 Class9 Class10 Class11 Class12 Class13 Class14
Class1 1.000000 0.530037 -0.253155 -0.293891 -0.273892 -0.090011 -0.153002 -0.148226 -0.057984 -0.091622 -0.073865 -0.133661 -0.135088 -0.040910
Class2 0.530037 1.000000 0.236645 -0.355494 -0.284067 -0.128472 -0.197395 -0.129462 0.091478 0.084874 -0.007370 -0.131636 -0.124922 -0.047932
Class3 -0.253155 0.236645 1.000000 0.416407 -0.340189 -0.284757 -0.212490 -0.153393 0.058039 0.113071 -0.016304 0.005845 0.016443 0.140966
Class4 -0.293891 -0.355494 0.416407 1.000000 0.107819 -0.329391 -0.259612 -0.261487 -0.151287 -0.129381 -0.142475 0.159624 0.167864 0.159937
Class5 -0.273892 -0.284067 -0.340189 0.107819 1.000000 0.598581 0.029576 0.008569 -0.046228 -0.095993 -0.070301 0.164892 0.172030 -0.019682
Class6 -0.090011 -0.128472 -0.284757 -0.329391 0.598581 1.000000 0.398406 0.222871 -0.028150 -0.016785 0.122053 0.086804 0.063857 -0.067171
Class7 -0.153002 -0.197395 -0.212490 -0.259612 0.029576 0.398406 1.000000 0.732111 -0.035740 -0.054122 0.089097 0.128627 0.101206 -0.032685
Class8 -0.148226 -0.129462 -0.153393 -0.261487 0.008569 0.222871 0.732111 1.000000 0.431449 -0.046145 0.009393 0.022942 0.029194 -0.038793
Class9 -0.057984 0.091478 0.058039 -0.151287 -0.046228 -0.028150 -0.035740 0.431449 1.000000 0.244059 -0.012136 -0.160897 -0.155884 -0.037074
Class10 -0.091622 0.084874 0.113071 -0.129381 -0.095993 -0.016785 -0.054122 -0.046145 0.244059 1.000000 0.763278 -0.067295 -0.062639 -0.040046
Class11 -0.073865 -0.007370 -0.016304 -0.142475 -0.070301 0.122053 0.089097 0.009393 -0.012136 0.763278 1.000000 -0.010742 -0.048346 -0.039803
Class12 -0.133661 -0.131636 0.005845 0.159624 0.164892 0.086804 0.128627 0.022942 -0.160897 -0.067295 -0.010742 1.000000 0.985657 -0.037370
Class13 -0.135088 -0.124922 0.016443 0.167864 0.172030 0.063857 0.101206 0.029194 -0.155884 -0.062639 -0.048346 0.985657 1.000000 -0.035603
Class14 -0.040910 -0.047932 0.140966 0.159937 -0.019682 -0.067171 -0.032685 -0.038793 -0.037074 -0.040046 -0.039803 -0.037370 -0.035603 1.000000

In [32]:
## The correlations are not very high overall; let's check the accuracy from Classifier Chains

from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB


classifier = ClassifierChain(GaussianNB())

# train
classifier.fit(X_train, Y_train)

# predict
predictions = classifier.predict(X_test)
accuracy_score(Y_test,predictions)   # 0.092693565976008724, even lower...


Out[32]:
0.092693565976008724
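
Conceptually, a chain feeds each earlier prediction in as an extra feature. A minimal two-label sketch of the mechanism (illustrative only, not skmultilearn's code):

from sklearn.naive_bayes import GaussianNB

# step 1: predict Class1 from the original features
m1 = GaussianNB().fit(X_train, Y_train['Class1'])
# step 2: predict Class2 from the features plus Class1 (the true value at train time)
m2 = GaussianNB().fit(np.column_stack([X_train, Y_train['Class1']]), Y_train['Class2'])

# at test time, the predicted Class1 is chained into the features for Class2
c1_pred = m1.predict(X_test)
c2_pred = m2.predict(np.column_stack([X_test, c1_pred]))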

In [33]:
# Method 1 - Problem Transformation - Label Powerset
## Group rows that have exactly the same label set (e.g., two rows whose labels are both (0,1,0,1)) together
## Then give each distinct label set its own class value, turning this into a single-label prediction problem
## But this method may suffer from class imbalance on real-world datasets, since many label sets are rare

from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB


classifier = LabelPowerset(GaussianNB())

# train
classifier.fit(X_train, Y_train)

# predict
predictions = classifier.predict(X_test)
accuracy_score(Y_test,predictions)   # 0.18647764449291168, a little higher...


Out[33]:
0.18647764449291168
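
The powerset transformation is easy to see by hand: map each distinct row of Y to one class id, fit a single multi-class model, and map predictions back. A rough sketch (illustrative, not skmultilearn's implementation):

from sklearn.naive_bayes import GaussianNB

# encode each unique label combination as a single class id
label_tuples = [tuple(row) for row in Y_train.values]
unique_sets = sorted(set(label_tuples))
set_to_id = {s: i for i, s in enumerate(unique_sets)}
y_single = np.array([set_to_id[t] for t in label_tuples])

lp_model = GaussianNB().fit(X_train, y_single)
# decode: each predicted class id expands back into a full 14-label row
decoded = np.array([unique_sets[i] for i in lp_model.predict(X_test)])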

In [40]:
# Method 2 - Algorithm Adaptation
## scikit-multilearn's adapted algorithms: http://scikit.ml/api/api/skmultilearn.adapt.html#module-skmultilearn.adapt

import scipy.sparse
from skmultilearn.adapt import MLkNN

# the adapted algorithms expect dense or sparse matrices as input,
# so convert the DataFrames to CSR sparse matrices
X_train_matrix = scipy.sparse.csr_matrix(X_train.values)
Y_train_matrix = scipy.sparse.csr_matrix(Y_train.values)

X_test_matrix = scipy.sparse.csr_matrix(X_test.values)
Y_test_matrix = scipy.sparse.csr_matrix(Y_test.values)

classifier = MLkNN(k=10)

# train
classifier.fit(X_train_matrix, Y_train_matrix)

# predict
predictions = classifier.predict(X_test_matrix)
accuracy_score(Y_test_matrix,predictions)   # 0.16684841875681569


Out[40]:
0.16684841875681569

In [41]:
# increase k
classifier = MLkNN(k=20)

# train
classifier.fit(X_train_matrix, Y_train_matrix)

# predict
predictions = classifier.predict(X_test_matrix)
accuracy_score(Y_test_matrix,predictions)   # 0.18102508178844057


Out[41]:
0.18102508178844057
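
Rather than nudging k by hand, a small grid search is the usual move; skmultilearn estimators implement get_params/set_params, so sklearn's GridSearchCV works on them (the pattern follows the scikit.ml docs; the exact ranges here are illustrative):

from sklearn.model_selection import GridSearchCV

parameters = {'k': range(5, 26, 5), 's': [0.5, 0.7, 1.0]}
grid = GridSearchCV(MLkNN(), parameters, scoring='f1_macro')
grid.fit(X_train_matrix, Y_train_matrix)
print(grid.best_params_, grid.best_score_)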

In [43]:
from skmultilearn.adapt import BRkNNaClassifier

# BRkNN = Binary Relevance kNN; the "a" variant assigns each label
# that appears among at least half of the k nearest neighbours
classifier = BRkNNaClassifier(k=10)

# train
classifier.fit(X_train_matrix, Y_train_matrix)

# predict
predictions = classifier.predict(X_test_matrix)
accuracy_score(Y_test_matrix,predictions)   # 0.10032715376226826


Out[43]:
0.10032715376226826
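
The "b" variant is a one-line swap if you want to compare: BRkNNb assigns each instance its top-n labels, where n is the average label count among its neighbours.

from skmultilearn.adapt import BRkNNbClassifier

classifier = BRkNNbClassifier(k=10)
classifier.fit(X_train_matrix, Y_train_matrix)
accuracy_score(Y_test_matrix, classifier.predict(X_test_matrix))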

In [9]:
# Method 3 - Ensembles of Multi-Label Classifiers
## scikit-multilearn ensemble approaches: http://scikit.ml/api/classify.html#ensemble-approaches
## tools to install in order to get "graph_tool.all": https://gist.github.com/v-pravin/949fc18d58a560cf85d2

from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.cluster import IGraphLabelCooccurenceClusterer
from skmultilearn.ensemble import LabelSpacePartitioningClassifier

# construct base forest classifier
base_classifier = RandomForestClassifier()

# setup problem transformation approach with sparse matrices for random forest
problem_transform_classifier = LabelPowerset(classifier=base_classifier,
    require_dense=[False, False])

# partition the label space using fastgreedy community detection
# on a weighted label co-occurrence graph with self-loops allowed
clusterer = IGraphLabelCooccurenceClusterer('fastgreedy', weighted=True,
    include_self_edges=True)

# setup the ensemble metaclassifier
classifier = LabelSpacePartitioningClassifier(problem_transform_classifier, clusterer)

# train
classifier.fit(X_train_matrix, Y_train_matrix)

# predict
predictions = classifier.predict(X_test_matrix)
accuracy_score(Y_test_matrix,predictions)


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-9-5a37a197d9a0> in <module>()
      3 from sklearn.ensemble import RandomForestClassifier
      4 from skmultilearn.problem_transform import LabelPowerset
----> 5 from skmultilearn.cluster import IGraphLabelCooccurenceClusterer
      6 from skmultilearn.ensemble import LabelSpacePartitioningClassifier
      7 

build/bdist.macosx-10.7-x86_64/egg/skmultilearn/cluster/__init__.py in <module>()

build/bdist.macosx-10.7-x86_64/egg/skmultilearn/cluster/graphtool.py in <module>()

ImportError: No module named graph_tool.all
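
The import fails because IGraphLabelCooccurenceClusterer depends on graph_tool, which is awkward to install. Newer scikit-multilearn releases (0.2.x) ship a NetworkX-based clusterer that avoids that dependency; a sketch assuming skmultilearn >= 0.2.0:

from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder, NetworkXLabelGraphClusterer
from skmultilearn.ensemble import LabelSpacePartitioningClassifier

# build a weighted label co-occurrence graph, then partition it with Louvain
graph_builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)
clusterer = NetworkXLabelGraphClusterer(graph_builder, method='louvain')

classifier = LabelSpacePartitioningClassifier(
    classifier=LabelPowerset(classifier=RandomForestClassifier(),
                             require_dense=[False, False]),
    clusterer=clusterer)

classifier.fit(X_train_matrix, Y_train_matrix)
predictions = classifier.predict(X_test_matrix)
accuracy_score(Y_test_matrix, predictions)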