In [3]:
import os

import numpy as np
import pandas as pd
import scipy.sparse
from pandas import DataFrame,Series
from scipy.sparse import lil_matrix,vstack
from skmultilearn.dataset import Dataset
In [4]:
# Location of the IMDB multi-label ARFF file read by the loading cell.
# Set the IMDB_ARFF_PATH environment variable to point at a local copy of
# the file; when it is unset, the original author's path is used, so the
# previous behaviour is unchanged.
path_to_imdb_dataset = os.environ.get(
    "IMDB_ARFF_PATH",
    "/home/felipe/auto-tagger/data/imdb/IMDB-F.arff",
)
In [9]:
# Load the ARFF file as a pair of sparse matrices: X holds the features and
# Y the label indicators. In the recorded output X is 120919x1001 and Y is
# 120919x28, i.e. Y has one column per requested label (labelcount=28).
X, Y = Dataset.load_arff_to_numpy(
    path_to_imdb_dataset,
    labelcount=28,
    load_sparse=True,
)
# Show both matrix summaries (shape, dtype, number of stored elements).
(X, Y)
Out[9]:
(<120919x1001 sparse matrix of type '<type 'numpy.float64'>'
with 2343710 stored elements in LInked List format>,
<120919x28 sparse matrix of type '<type 'numpy.int64'>'
with 241798 stored elements in LInked List format>)
In [10]:
# Densify the sparse label matrix into a DataFrame with one row per sample
# and one integer-named column per label.
df = DataFrame.from_records(Y.toarray())

# Each distinct row is one distinct combination of labels (a "labelset"),
# so the number of rows left after de-duplication is the labelset count.
num_unique_labelsets = df.drop_duplicates().shape[0]
num_unique_labelsets
Out[10]:
4503
In [12]:
# Give every row a positional id (0 .. len(df) - 1). The later
# groupby(...).count() cell reads the per-group count of this 'id' column
# to find out how many samples share each labelset.
# The length is taken from df itself instead of a hard-coded row count, so
# the cell stays correct if a different (or subsampled) dataset is loaded.
indices = Series(np.arange(len(df)), index=df.index)
df['id'] = indices
df
Out[12]:
0
1
2
3
4
5
6
7
8
9
...
19
20
21
22
23
24
25
26
27
id
0
0
0
0
0
1
0
0
0
0
0
...
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
1
0
0
0
0
1
2
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
2
3
0
0
0
0
0
0
0
0
0
1
...
0
0
0
0
0
0
0
0
0
3
4
0
0
0
0
0
1
1
0
0
0
...
0
0
0
0
0
0
0
0
0
4
5
1
0
0
1
0
0
0
0
0
1
...
0
0
0
0
0
1
0
0
0
5
6
1
0
0
1
0
0
0
0
0
1
...
0
0
0
0
0
1
0
0
0
6
7
1
0
0
1
0
1
0
0
0
1
...
0
0
0
0
0
1
0
0
0
7
8
1
0
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
8
9
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
1
0
0
0
0
9
10
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
1
0
0
10
11
0
0
0
0
0
0
1
0
0
0
...
0
0
0
0
0
0
0
0
0
11
12
0
0
0
0
0
0
0
0
0
0
...
1
0
0
0
0
0
0
0
0
12
13
0
0
0
0
1
0
0
0
0
0
...
0
0
0
0
0
0
1
0
0
13
14
0
0
0
0
0
1
0
0
0
0
...
0
0
0
0
0
0
0
0
0
14
15
0
0
0
0
0
0
0
0
0
0
...
1
0
0
0
0
0
0
0
0
15
16
0
1
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
16
17
0
0
0
0
0
0
0
0
0
0
...
0
0
0
1
0
0
0
0
0
17
18
0
0
0
0
1
0
0
0
0
0
...
1
0
0
0
0
0
0
0
0
18
19
0
0
0
0
0
0
0
0
0
0
...
1
0
0
0
0
0
0
0
0
19
20
0
0
0
0
0
0
0
0
0
0
...
1
0
0
0
0
0
0
0
0
20
21
0
0
1
0
0
1
0
0
0
0
...
0
1
0
0
0
0
0
0
0
21
22
0
0
0
0
0
0
0
0
0
0
...
1
0
0
0
0
0
0
0
0
22
23
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
1
0
23
24
0
0
0
1
0
1
0
0
0
0
...
0
0
0
1
0
0
0
0
0
24
25
0
0
1
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
25
26
0
0
0
0
1
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
26
27
0
0
0
0
0
0
0
0
0
0
...
1
0
0
0
0
0
0
0
0
27
28
1
1
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
28
29
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
29
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
120889
1
0
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120889
120890
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120890
120891
0
0
0
0
0
1
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120891
120892
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120892
120893
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120893
120894
0
0
1
0
0
1
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120894
120895
0
0
1
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120895
120896
0
0
0
0
0
0
0
0
0
1
...
0
0
0
0
0
0
0
0
0
120896
120897
0
0
0
0
0
1
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120897
120898
0
0
1
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120898
120899
0
0
0
0
0
1
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120899
120900
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120900
120901
0
0
1
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120901
120902
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120902
120903
0
0
0
0
0
0
0
0
0
0
...
1
0
0
0
0
0
0
0
0
120903
120904
0
0
0
0
0
0
0
0
0
0
...
0
0
0
1
0
0
0
0
0
120904
120905
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120905
120906
0
0
0
0
0
0
0
0
0
0
...
0
0
1
0
0
0
0
0
0
120906
120907
0
0
0
0
0
0
0
0
0
0
...
1
0
0
0
0
0
0
0
0
120907
120908
1
0
0
0
0
0
0
1
0
0
...
0
0
0
0
0
0
0
0
0
120908
120909
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120909
120910
0
0
0
0
0
0
0
0
0
0
...
1
0
0
0
0
0
0
0
0
120910
120911
0
0
0
0
0
0
0
0
0
0
...
1
0
0
0
0
0
0
0
0
120911
120912
0
0
0
0
0
0
0
0
0
0
...
1
0
0
0
0
0
0
0
0
120912
120913
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120913
120914
0
0
0
0
0
1
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120914
120915
0
0
0
0
0
1
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120915
120916
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120916
120917
0
0
0
0
0
1
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120917
120918
0
0
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
120918
120919 rows × 29 columns
In [16]:
# Group samples that carry exactly the same labelset and count them.
# Every column except the helper 'id' column is a label indicator, so the
# grouping keys are derived from the frame rather than from a hard-coded
# label count; for the 28-label IMDB data this is the same as columns 0..27.
# After count(), grouped['id'] holds the number of samples per labelset.
label_columns = [col for col in df.columns if col != 'id']
grouped = df.groupby(label_columns).count()
In [17]:
# A labelset whose group contains a single 'id' is carried by exactly one
# sample; count how many such groups there are.
appears_once = grouped['id'] == 1
num_labelsets_appearing_only_once = int(appears_once.sum())
num_labelsets_appearing_only_once
Out[17]:
2263
Content source: queirozfcom/python-sandbox
Similar notebooks: