In [3]:
import os

import numpy as np
import scipy.sparse
import pandas as pd
from pandas import DataFrame,Series
from scipy.sparse import lil_matrix,vstack
from skmultilearn.dataset import Dataset

In [4]:
# Location of the IMDB-F ARFF file. Set the IMDB_ARFF_PATH environment
# variable to point at a local copy of the dataset; when it is unset the
# original hard-coded location is used, so existing setups keep working.
path_to_imdb_dataset = os.environ.get(
    "IMDB_ARFF_PATH",
    "/home/felipe/auto-tagger/data/imdb/IMDB-F.arff",
)

In [9]:
# Read the ARFF file into a feature matrix X and a label matrix Y, both kept
# sparse. labelcount=28 tells the loader how many of the ARFF attributes are
# labels (the recorded output shows Y with 28 columns).
X, Y = Dataset.load_arff_to_numpy(
    path_to_imdb_dataset,
    labelcount=28,
    load_sparse=True,
)
# Show both matrices' reprs (shape, dtype, number of stored elements).
(X, Y)


Out[9]:
(<120919x1001 sparse matrix of type '<type 'numpy.float64'>'
 	with 2343710 stored elements in LInked List format>,
 <120919x28 sparse matrix of type '<type 'numpy.int64'>'
 	with 241798 stored elements in LInked List format>)

In [10]:
# Densify the sparse label matrix into a DataFrame: one row per example,
# one column per label.
df = DataFrame.from_records(Y.toarray())

# Dropping duplicate rows leaves one row per distinct label combination,
# so the remaining row count is the number of unique labelsets.
num_unique_labelsets = df.drop_duplicates().shape[0]
num_unique_labelsets


Out[10]:
4503

In [12]:
# Give every example a 0-based running number in an 'id' column. The length
# is taken from the frame itself instead of a hard-coded row count, so the
# cell stays correct if the dataset changes size.
# NOTE: this adds the column to df in place; the groupby/count cells further
# down rely on 'id' being present.
indices = Series(np.arange(len(df)), index=df.index)
df['id'] = indices
df


Out[12]:
0 1 2 3 4 5 6 7 8 9 ... 19 20 21 22 23 24 25 26 27 id
0 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 1
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 2
3 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 3
4 0 0 0 0 0 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 4
5 1 0 0 1 0 0 0 0 0 1 ... 0 0 0 0 0 1 0 0 0 5
6 1 0 0 1 0 0 0 0 0 1 ... 0 0 0 0 0 1 0 0 0 6
7 1 0 0 1 0 1 0 0 0 1 ... 0 0 0 0 0 1 0 0 0 7
8 1 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 8
9 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 9
10 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 10
11 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 11
12 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 12
13 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 13
14 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 14
15 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 15
16 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 16
17 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 17
18 0 0 0 0 1 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 18
19 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 19
20 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 20
21 0 0 1 0 0 1 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 21
22 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 22
23 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 23
24 0 0 0 1 0 1 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 24
25 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 25
26 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 26
27 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 27
28 1 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 28
29 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 29
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
120889 1 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120889
120890 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120890
120891 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120891
120892 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120892
120893 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120893
120894 0 0 1 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120894
120895 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120895
120896 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 120896
120897 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120897
120898 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120898
120899 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120899
120900 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120900
120901 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120901
120902 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120902
120903 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 120903
120904 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 120904
120905 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120905
120906 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 120906
120907 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 120907
120908 1 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 120908
120909 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120909
120910 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 120910
120911 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 120911
120912 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 120912
120913 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120913
120914 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120914
120915 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120915
120916 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120916
120917 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120917
120918 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 120918

120919 rows × 29 columns


In [16]:
# Group the examples by their complete label combination and count the rows
# in each group. The grouping keys are every column except the helper 'id'
# column, so the number of labels is not hard-coded here; after count(), the
# 'id' column holds the number of examples sharing each labelset.
label_columns = [col for col in df.columns if col != 'id']
grouped = df.groupby(label_columns).count()

In [17]:
# grouped['id'] is the number of examples per labelset; a labelset is a
# singleton when that count is exactly 1. Summing the boolean mask counts them.
num_labelsets_appearing_only_once = int((grouped['id'] == 1).sum())
num_labelsets_appearing_only_once


Out[17]:
2263