In [1]:
import scipy
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
In [2]:
# Load the yeast multi-label training set (ARFF format) into a DataFrame.
# `meta` carries the attribute names/types declared in the ARFF header.
data, meta = arff.loadarff('yeast/yeast-train.arff')
df = pd.DataFrame(data)
In [39]:
# Inspect the ARFF metadata: attribute names and declared types.
meta
Out[39]:
In [5]:
# Peek at the first rows of the raw frame.
df.head()
Out[5]:
In [31]:
# Column dtypes — note the Class* label columns are not numeric yet
# (they get converted to float64 in a later cell).
df.dtypes
Out[31]:
In [19]:
# 117 columns total: 103 feature columns + 14 label columns (Class1..Class14).
len(df.columns)
Out[19]:
In [23]:
# Select the 14 label columns (positions 103..116).
# `DataFrame.ix` was removed in pandas 1.0 — use position-based `.iloc`.
labels_df = df.iloc[:, 103:117]
labels_df.head()
Out[23]:
In [33]:
# Convert the label columns Class1..Class14 to float64.
# One vectorized astype (rebinding labels_df) replaces the per-column loop,
# which mutated a slice of `df` and triggers SettingWithCopyWarning
# (and silently does nothing under pandas copy-on-write).
class_cols = ['Class' + str(i) for i in range(1, 15)]
labels_df = labels_df.astype({col: 'float64' for col in class_cols})
labels_df.dtypes
Out[33]:
In [35]:
# Check the correlations between labels.
# DataFrame.corr() computes the same Pearson correlation matrix as the
# corrwith-per-column apply, in a single pass instead of one corrwith per label.
labels_df.corr()
Out[35]:
In [1]:
# You can also generate a random multi-label dataset on your own.
from sklearn.datasets import make_multilabel_classification

# random_state pins the generated data so the notebook reproduces under
# Restart Kernel -> Run All; sparse=True and return_indicator='sparse'
# make both X and y scipy sparse matrices.
X, y = make_multilabel_classification(sparse=True, n_labels=7,
                                      return_indicator='sparse',
                                      allow_unlabeled=False,
                                      random_state=42)
In [4]:
# Preview the generated sparse feature matrix as a DataFrame.
# pd.SparseDataFrame / pd.SparseSeries were removed in pandas 1.0;
# build a sparse-backed DataFrame directly from the scipy sparse matrix.
matrix_df = pd.DataFrame.sparse.from_spmatrix(X)
matrix_df.head()
Out[4]:
In [2]:
# Start Multi-Label Experiments From HERE
# Load the train and test splits of the yeast dataset from their ARFF files.
training_data, train_meta = arff.loadarff('yeast/yeast-train.arff')
train_df = pd.DataFrame(training_data)
testing_data, test_meta = arff.loadarff('yeast/yeast-test.arff')
test_df = pd.DataFrame(testing_data)
# Parameter notes for make_multilabel_classification (used above):
# sparse: if True, returns a sparse matrix (a matrix with mostly zero elements).
# n_labels: the average number of labels per instance.
# return_indicator: if 'sparse', return Y in the sparse binary indicator format.
# allow_unlabeled: if True, some instances might not belong to any class.
In [3]:
# First rows of the training split.
train_df.head()
Out[3]:
In [4]:
# First rows of the test split.
test_df.head()
Out[4]:
In [5]:
# Split each frame into features and labels: the first 103 columns are
# features, everything after is the 14 Class* label columns.
n_features = 103
X_train = train_df.iloc[:, :n_features]
Y_train = train_df.iloc[:, n_features:]
X_test = test_df.iloc[:, :n_features]
Y_test = test_df.iloc[:, n_features:]
In [6]:
# Convert Y_train's label columns Class1..Class14 to float64 in one
# vectorized astype. The original per-column loop mutated an iloc slice of
# train_df, which raises SettingWithCopyWarning and silently fails under
# pandas copy-on-write.
Y_train = Y_train.astype({'Class' + str(i): 'float64' for i in range(1, 15)})
Y_train.dtypes
Out[6]:
In [7]:
# Same conversion for the test labels: one vectorized astype instead of
# mutating an iloc slice column-by-column (SettingWithCopyWarning; no-op
# under pandas copy-on-write).
Y_test = Y_test.astype({'Class' + str(i): 'float64' for i in range(1, 15)})
Y_test.dtypes
Out[7]:
In [27]:
# Sanity-check the test features.
X_test.head()
Out[27]:
In [28]:
# Sanity-check the (now float64) test labels.
Y_test.head()
Out[28]:
In [29]:
# Method 1 - Problem Transformation - Binary Relevance
# Treats every label as an independent single-label problem (one GaussianNB
# per label column).
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

br_classifier = BinaryRelevance(GaussianNB())
br_classifier.fit(X_train, Y_train)               # train
br_predictions = br_classifier.predict(X_test)    # predict
# Subset accuracy was 0.10359869138495092 — very low.
accuracy_score(Y_test, br_predictions)
Out[29]:
In [30]:
# Method 1 - Problem Transformation - Classifier Chains
# Each round predicts 1 label column and feeds it in as a feature for the
# next round's prediction, so higher accuracy needs higher correlation
# between labels — check the label correlations first.
# DataFrame.corr() yields the same Pearson matrix as applying corrwith per
# column, in a single pass.
Y_train.corr()
Out[30]:
In [31]:
# Same Pearson correlation matrix for the test labels, via the built-in.
Y_test.corr()
Out[31]:
In [32]:
# The labels are not very highly correlated — evaluate Classifier Chains anyway.
from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB

cc_classifier = ClassifierChain(GaussianNB())
cc_classifier.fit(X_train, Y_train)               # train
cc_predictions = cc_classifier.predict(X_test)    # predict
# Subset accuracy was 0.092693565976008724 — even lower than Binary Relevance.
accuracy_score(Y_test, cc_predictions)
Out[32]:
In [33]:
# Method 1 - Problem Transformation - Label Powerset
# Rows sharing exactly the same label set (e.g. two rows both labelled
# (0,1,0,1)) are grouped, and each distinct label set becomes one class —
# turning this into a single-label prediction problem. On real-world data
# this can suffer from class imbalance.
from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB

lp_classifier = LabelPowerset(GaussianNB())
lp_classifier.fit(X_train, Y_train)               # train
lp_predictions = lp_classifier.predict(X_test)    # predict
# Subset accuracy was 0.18647764449291168 — a little higher.
accuracy_score(Y_test, lp_predictions)
Out[33]:
In [40]:
# Method 2 - Adapt Algorithms
## Scikit-multilearn docs: http://scikit.ml/api/api/skmultilearn.adapt.html#module-skmultilearn.adapt
from skmultilearn.adapt import MLkNN
# `import scipy` alone does not guarantee the sparse submodule is loaded on
# older scipy versions — import csr_matrix explicitly.
from scipy.sparse import csr_matrix

# Scikit-multilearn adapted algorithms require dense/sparse matrix input
# rather than DataFrames.
X_train_matrix = csr_matrix(X_train.values)
Y_train_matrix = csr_matrix(Y_train.values)
X_test_matrix = csr_matrix(X_test.values)
Y_test_matrix = csr_matrix(Y_test.values)

classifier = MLkNN(k=10)
# train
classifier.fit(X_train_matrix, Y_train_matrix)
# predict
predictions = classifier.predict(X_test_matrix)
accuracy_score(Y_test_matrix, predictions)  # 0.16684841875681569
Out[40]:
In [41]:
# Increase the neighbourhood size to k=20 and re-evaluate MLkNN.
knn20 = MLkNN(k=20)
knn20.fit(X_train_matrix, Y_train_matrix)          # train
knn20_predictions = knn20.predict(X_test_matrix)   # predict
# Subset accuracy was 0.18102508178844057.
accuracy_score(Y_test_matrix, knn20_predictions)
Out[41]:
In [43]:
# Binary-Relevance kNN (variant a), k=10.
from skmultilearn.adapt import BRkNNaClassifier

brknn = BRkNNaClassifier(k=10)
brknn.fit(X_train_matrix, Y_train_matrix)          # train
brknn_predictions = brknn.predict(X_test_matrix)   # predict
# Subset accuracy was 0.10032715376226826.
accuracy_score(Y_test_matrix, brknn_predictions)
Out[43]:
In [9]:
# Method 3 - Ensembling Multi-Label
## Scikit-multilearn ensembling: http://scikit.ml/api/classify.html#ensemble-approaches
## tools to install in order to get "graph_tool.all": https://gist.github.com/v-pravin/949fc18d58a560cf85d2
# NOTE(review): IGraphLabelCooccurenceClusterer and this positional-argument
# constructor belong to an older skmultilearn API — confirm against the
# installed version before re-running (newer releases renamed the clusterers).
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.cluster import IGraphLabelCooccurenceClusterer
from skmultilearn.ensemble import LabelSpacePartitioningClassifier
# construct the base forest classifier
# NOTE(review): no random_state is set, so results vary between runs.
base_classifier = RandomForestClassifier()
# set up the problem-transformation approach with sparse matrices for the
# random forest (require_dense=[False, False] keeps X and Y sparse)
problem_transform_classifier = LabelPowerset(classifier=base_classifier,
require_dense=[False, False])
# partition the label space using fastgreedy community detection
# on a weighted label co-occurrence graph with self-loops allowed
clusterer = IGraphLabelCooccurenceClusterer('fastgreedy', weighted=True,
include_self_edges=True)
# set up the ensemble metaclassifier: one LabelPowerset per label-space partition
classifier = LabelSpacePartitioningClassifier(problem_transform_classifier, clusterer)
# train
classifier.fit(X_train_matrix, Y_train_matrix)
# predict
predictions = classifier.predict(X_test_matrix)
accuracy_score(Y_test_matrix,predictions)