In [1]:

    
%matplotlib inline
%load_ext sql



In [2]:

    
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



In [3]:

    
sns.set(rc={"figure.figsize": (10, 6)})
sns.set_style("whitegrid")
sns.set_context("notebook")



In [4]:

    
posts = pd.read_hdf("metadata.h5", "posts")
tag_count = pd.read_hdf("metadata.h5", "tag_count")

Manually remove some tags that are not obvious from the images

Or that do not matter for our learning goals.



In [5]:

    
useless_tags = [
    "lowres",
    "highres",
    "bad_id",
    "bad_pixiv_id",
    "monochrome",
    "censored",
    "alternate_costume",
    "hetero",
    "sketch",
    "yuri",
    "character_name",
    "greyscale",
    "artist_name",
    "artist_request",
    "artist_request",
    "copyright_request",
    "absurdres",
    "dated",
    "signature",
    "cosplay",
    "translated",
    "copyright_name",
    "traditional_media",
    "twitter_username",
    "alternate_hairstyle",
    "mosaic_censoring",
    "parody",
    "english",
    "gradient",
    "couple",
    "sisters",
    "cover_page",
    "crossover",
    "uncensored",
    "official_art",
    "letterboxed",
    "translation_request",
    "scan",
    "game_cg",
    "remodel_(kantai_collection)",
    "depth_of_field",
    "convenient_censoring",
    "foreshortening",
    "watermark",
    "genderswap",
    "adapted_costume",
    "pov",
    "wallpaper",
    "pokemon_(creature)",
    "text",
    "kemonomimi_mode",
    "shinkaisei-kan",
    "genderswap_(mtf)",
    "personification",
    "blurry",
    "wind",
    "younger",
    "cover",
    # Tags for the background
    "white_background",
    "grey_background",
    "gradient_background",
    "blue_background",
    "pink_background",
    # And this is about the "camera" position
    "cowboy shot",
    "dutch_angle",
    "full_body",
    "upper_body"
]



In [6]:

    
# The strange indices here are necessary because I want the 'kiss' tag that wouldn't be included otherwise.
tag_count_filtered = tag_count[tag_count.name.isin(useless_tags) == False].iloc[:509]
tag_count_filtered.index = pd.RangeIndex(1, len(tag_count_filtered) + 1)
# Also add the three ratings to the tags!



In [7]:

    
print(tag_count_filtered.to_string())









    



           id                           name    count
1           4                          1girl  1523817
2          36                           solo  1314438
3          18                      long_hair   951022
4         210                        breasts   628220
5           8                          blush   601991
6          59                          smile   587298
7         184                     short_hair   581007
8          20              looking_at_viewer   466356
9         412                     open_mouth   451815
10         69                      blue_eyes   442677
11         94                    blonde_hair   436842
12         10                     brown_hair   404691
13        180                 multiple_girls   394516
14         35                          skirt   379425
15        134                     thighhighs   348087
16         28                       red_eyes   337376
17        100                            hat   334600
18         93                     black_hair   332624
19        225                  large_breasts   311943
20         30                         ribbon   278870
21        169                         2girls   270690
22        785                         gloves   266168
23         45                          dress   264096
24         99                  hair_ornament   256368
25        427                     brown_eyes   255377
26          9                            bow   246489
27        114                      twintails   241604
28         31                 school_uniform   229344
29         87                      underwear   229247
30         34              simple_background   226689
31         90                           1boy   225371
32        407                     green_eyes   217244
33        124                          navel   210300
34        574                      blue_hair   207161
35        383                        sitting   201977
36         81                        panties   198803
37        281                 medium_breasts   195581
38       1344                       cleavage   191663
39        606                    animal_ears   188988
40        289                          shirt   173592
41         55                    purple_eyes   169097
42        789                        jewelry   165226
43        557                         weapon   163563
44        438                 very_long_hair   162347
45        232                        nipples   156102
46        484                 bare_shoulders   155967
47        147                    hair_ribbon   154948
48         19                   long_sleeves   154779
49        329                    purple_hair   148267
50          6                  black_legwear   146693
51        197                          bangs   145232
52        465                      pink_hair   145095
53        934                    yellow_eyes   142528
54         84                       ponytail   141141
55        358                    closed_eyes   137079
56         57                    silver_hair   133115
57       2496                         flower   131860
58        649                           tail   130156
59         14                       hair_bow   129047
60       1649                       swimsuit   128628
61         68                            ass   124217
62        667                          boots   120821
63        501                       red_hair   119591
64        917                          wings   119431
65        151                      pantyhose   118776
66         74                     green_hair   117396
67       1207                       hairband   116572
68        526                          braid   114917
69        491                 one_eye_closed   106445
70       7818                     male_focus   103141
71        305                          ahoge   102941
72       2331               detached_sleeves   102704
73         97                        glasses   101661
74       4902               japanese_clothes   100950
75        395                             :d    99338
76       6166                          heart    98451
77        183                       serafuku    97791
78         48                        holding    97469
79        198                       barefoot    96782
80        404                           food    96370
81        462                           nude    96132
82         61                       standing    94281
83        953                        necktie    92022
84         21                          lying    90858
85        619                     collarbone    88468
86       1629                         bikini    88331
87        588                  multiple_boys    87310
88        416                          shoes    86116
89        555                          sword    83992
90       3492                     white_hair    82783
91       6546                         jacket    80592
92       1896                        midriff    80587
93        335                  small_breasts    80370
94        153                  pleated_skirt    79969
95        646                        striped    79775
96        137                  white_legwear    77954
97        784                   elbow_gloves    77537
98        334                            sky    76060
99        315  eyebrows_visible_through_hair    75110
100      4447                         frills    74022
101       290                  short_sleeves    73655
102      1139                          tears    72525
103        79                   looking_back    72269
104       783                       earrings    71109
105       245                          sweat    70034
106      1233                   open_clothes    67010
107      4423                    pointy_ears    65306
108       879                           fang    65213
109       595                         shorts    64656
110       314                            day    64169
111       220              hair_between_eyes    63336
112       830                          pussy    63318
113       616                       cat_ears    62933
114      1782                       hairclip    61823
115      1994                          penis    61295
116       243                     solo_focus    60048
117      2100                         choker    59712
118       653                         tongue    58902
119       269                    cowboy_shot    58741
120      2809                           belt    58393
121       718                      pink_eyes    58038
122       313                          cloud    57784
123       445                         3girls    57062
124      1025                     flat_chest    54366
125     10907              fingerless_gloves    54178
126      2698                          scarf    54034
127       571                          2boys    53787
128       300                 zettai_ryouiki    53402
129       267                   closed_mouth    51480
130      4275                           cape    51471
131        43                          chibi    51200
132       106                          socks    51162
133       499                  puffy_sleeves    51093
134       664                      aqua_eyes    51051
135       138                  white_panties    51044
136      6367                          horns    50403
137       108                           star    50212
138      9202                    hair_flower    49547
139       473                          water    49137
140      4903                         kimono    48663
141      7024                     black_eyes    48500
142       327                    parted_lips    48245
143       792              multicolored_hair    48127
144      2117                            cum    47573
145       471                  side_ponytail    47377
146      3320                   white_gloves    47275
147       521                          armor    46953
148      1108                    spread_legs    46403
149       326                       outdoors    46101
150       652                         thighs    45909
151      2066                    twin_braids    45612
152       493                    orange_hair    45183
153      5734                        uniform    45086
154       193                             :o    44368
155       481                        armpits    44031
156        33                      sidelocks    43666
157      4901                   huge_breasts    43414
158       505                     sleeveless    43263
159      1227                            bra    43061
160      2504                       necklace    42420
161       219                           feet    42258
162      1399                        on_back    42141
163     12081                   wide_sleeves    41855
164        92                            bag    41559
165       782                covered_nipples    41440
166      1711                     bunny_ears    41309
167       282                      miniskirt    41249
168      2631                           grin    40882
169       817                    from_behind    40402
170       654                     tongue_out    39622
171       963                          apron    39467
172      7435                           hood    38492
173       759                      aqua_hair    38093
174      1549                           legs    38060
175       229                           lips    37976
176      2711                            sex    37944
177      1962                          teeth    37883
178       296                           vest    37558
179      9811                      dark_skin    37475
180       733                           book    37328
181      6153                   black_gloves    36969
182       299                    white_shirt    36928
183     13686                       bracelet    36870
184       670                      grey_hair    35648
185       175                            hug    35567
186      1234                     open_shirt    35375
187        16                      kneehighs    35356
188       950                           maid    34570
189      5380                            gun    34476
190      1490                         petals    34434
191       820                     high_heels    34309
192      4919                          shiny    34163
193     15835                          pants    33879
194       618                       cat_tail    33645
195      1783                       kneeling    33426
196      2345                    two_side_up    33413
197       391                           tree    33158
198      2379              hair_over_one_eye    33100
199      2269                    wrist_cuffs    32788
200      2053             one-piece_swimsuit    32646
201      3846                   magical_girl    32308
202       597                      sweatdrop    31677
203        26                          plaid    31418
204       264                    blunt_bangs    31135
205      3664                    nail_polish    30819
206      2637                         no_bra    30487
207      8952                       military    30471
208       352                         bowtie    30430
209      2276                        arms_up    30403
210       765                      pantyshot    30268
211      2124                           loli    30055
212      1307                    hand_on_hip    29964
213       793                     no_panties    29923
214      2714                        vaginal    29892
215      8096                     headphones    29813
216      4346                         collar    29769
217      3037                          ascot    29654
218      4918                           sash    29632
219       406                          fruit    29513
220       254                            wet    29248
221       537                       eyebrows    29152
222      1674                      witch_hat    29106
223      1739                        sweater    28799
224       185                       siblings    28719
225      8474                striped_legwear    27994
226       951                 maid_headdress    27684
227      2343                   torn_clothes    27679
228      2091                      underboob    27164
229      3865                  holding_hands    26532
230      2659                     bottomless    26514
231      1167                       sideboob    26124
232       310                          blood    26082
233      8953               military_uniform    26078
234      2975                         4girls    26074
235       201                            bed    26060
236      1670                    orange_eyes    25987
237      1210                         pillow    25837
238       869                       umbrella    25751
239      2723                       bodysuit    25709
240     21821                    white_dress    25590
241      2430                           mole    25515
242     11320                   off_shoulder    25339
243       509                              v    25260
244       307                         arm_up    25232
245      7179                          groin    24803
246      4584                    see-through    24789
247      9635                          chain    24705
248      6549             looking_at_another    24699
249       968                            cup    24660
250      4898                       headband    24622
251       429                     hair_tubes    24580
252     33514                  sailor_collar    24550
253       402                     drill_hair    24451
254      3443                           rose    24446
255      2699                   single_braid    24185
256        12                        capelet    24069
257      1403                         saliva    23859
258       890                           moon    23851
259      1230                       lingerie    23837
260       316                      from_side    23764
261       702                         wariza    23740
262      1421                  black_panties    23674
263       837                    thigh_boots    23584
264      2059                school_swimsuit    23582
265      3104                        6+girls    23560
266       224                        indoors    23525
267      1096                       cameltoe    23437
268      2252                      bat_wings    23382
269      1194                     mouth_hold    23261
270      4921                     shiny_skin    23245
271      2796                    pussy_juice    23227
272      2328                    black_skirt    23054
273       237                     pubic_hair    23012
274       976                        profile    22999
275       252                        topless    22747
276      1987                     double_bun    22625
277      3307                        leotard    22540
278       280                         makeup    22440
279       900            puffy_short_sleeves    22313
280     10137                        glowing    21971
281      1644                side-tie_bikini    21528
282       548                 holding_weapon    21463
283      1616                     skirt_lift    21163
284      2630                    dress_shirt    21057
285      2849                             :3    21003
286       648                striped_panties    20993
287      4081                           bell    20976
288      8255                          night    20969
289       283                    neckerchief    20926
290        98                   hair_bobbles    20867
291      2332                       eyepatch    20698
292      2500                           leaf    20687
293       846                            ^_^    20660
294     28071                       headgear    20636
295      8086                      bare_legs    20619
296     16217                           coat    20580
297      7436                         hoodie    20477
298       857                     hat_ribbon    20454
299      2181                     turtleneck    20443
300       199                           bdsm    20423
301     77462                      eyelashes    20403
302       119                           bird    20375
303      2087                     shirt_lift    20368
304      1779                           face    20323
305     19118                           mask    20238
306      4475                    black_dress    20113
307         5                    arm_support    20078
308       319                      head_tilt    19999
309      8468                    plaid_skirt    19870
310       206                          bound    19819
311      4452                leaning_forward    19788
312     24972           symbol-shaped_pupils    19669
313     85640                  two-tone_hair    19419
314      1877                 underwear_only    19341
315      1482                       fox_ears    19246
316       690                     blue_skirt    19238
317      1765                        sparkle    19234
318      1547                         katana    19119
319  15934715              v-shaped_eyebrows    19064
320      7904                           back    18988
321      1250                     undressing    18922
322       986                         window    18694
323      6169                     knee_boots    18610
324      2943                      grey_eyes    18596
325      1483                       fox_tail    18531
326      3568                  heterochromia    18387
327       818                  garter_straps    18311
328      1063                     from_above    18301
329     12075                      skirt_set    18287
330      2355                            cat    18195
331      1959                short_twintails    18189
332      3670                        sandals    18158
333       311                 blush_stickers    18143
334       204                        bondage    18034
335      6599                         eating    17978
336      1693              outstretched_arms    17941
337      2324                       bandages    17891
338       111                    stuffed_toy    17879
339      4481                         formal    17859
340       693                    light_smile    17849
341     15838                         tattoo    17733
342      6538                         blazer    17708
343       457                       grabbing    17626
344       272                          frown    17596
345        25                        on_side    17549
346     10827                          beret    17469
347      1892                   crossed_arms    17460
348      2756                   cum_in_pussy    17435
349       196               arms_behind_back    17231
350      3048                chinese_clothes    17195
351       195                        areolae    17182
352     70536                       blue_sky    17167
353     15102                       lipstick    17153
354      9816                    fingernails    17143
355       373                        red_bow    17074
356      1348                    embarrassed    17012
357      5741                           anus    16964
358      1511                        polearm    16899
359      4911                            obi    16881
360       650                      thigh_gap    16843
361       903                     red_ribbon    16745
362    136030                      tokin_hat    16736
363       268                 collared_shirt    16732
364       594                   short_shorts    16551
365       385                       sleeping    16517
366      1533                          happy    16507
367       786                  gradient_hair    16423
368     72416                      wolf_ears    16422
369      1163                     panty_pull    16317
370      9346                          beach    16001
371      1420                      bent_over    15988
372       109                 stuffed_animal    15964
373      9127                       crop_top    15952
374       450                    breast_grab    15915
375      2851                             ;d    15905
376      1479                cherry_blossoms    15904
377       889                        mob_cap    15753
378       357                          chair    15719
379       270                 expressionless    15691
380     13636                     from_below    15671
381     18751                     suspenders    15652
382     13199                          ocean    15635
383      6151                    bike_shorts    15627
384      2431                 mole_under_eye    15582
385       524                     blue_dress    15578
386       496                      polka_dot    15524
387      8203                          staff    15496
388        32                         shadow    15492
389     37428                     wavy_mouth    15489
390      2299                   antenna_hair    15466
391    166327                      third_eye    15460
392       374                   red_neckwear    15449
393      3029                         5girls    15445
394      2352                         animal    15437
395       665                 black_footwear    15369
396      4415                     instrument    15286
397      3303                          hands    15273
398       297                      wavy_hair    15256
399       249                           toes    15253
400       592                    otoko_no_ko    15241
401     11791                        stomach    15105
402     11936                   crossed_legs    14997
403     10892                          child    14979
404      8118                         casual    14929
405      2991                       crescent    14928
406        80                       no_pants    14850
407      2948                          knife    14840
408     30737                    facial_mark    14824
409       832                    short_dress    14822
410     15101                         helmet    14796
411      4491                           suit    14705
412      1648                      strapless    14662
413      1486                 multiple_tails    14585
414      2279                           fire    14568
415        83                   pink_panties    14524
416     12950                       fur_trim    14410
417       212                      checkered    14375
418      8330                detached_collar    14333
419      2118                    cum_on_body    14262
420      8934                cleavage_cutout    14126
421      9948                          fangs    14112
422      6291                    facial_hair    14107
423      1937                   looking_away    14006
424     11326            sleeves_past_wrists    13907
425      3342                       feathers    13874
426      9326                       covering    13852
427     12641            looking_to_the_side    13842
428      1993                           oral    13833
429        11                        buttons    13825
430     11422                       backpack    13819
431      3545               side-tie_panties    13793
432       542                      gauntlets    13776
433       113                       sunlight    13761
434       136                        upskirt    13727
435     18063                           scar    13711
436     12923                     head_wings    13639
437      2547                      bed_sheet    13599
438      1322                            tan    13502
439      1671                       pointing    13462
440     39842                   monster_girl    13446
441      6864               outstretched_arm    13435
442      3136                     mary_janes    13366
443      4236                          3boys    13300
444       413                       pink_bow    13294
445       107                      squatting    13272
446     34230                      butterfly    13255
447      1488                    no_headwear    13134
448      8332                            fan    13082
449       806                      all_fours    13056
450       768                    thigh_strap    13032
451     10743                      bunnysuit    13013
452        29                      red_skirt    13009
453     19656                   floral_print    12969
454      9707                        goggles    12964
455      1423                   breast_press    12954
456      1812                           lace    12926
457      7425                          denim    12901
458      1717                     microphone    12860
459      2145                             :<    12804
460      3348                      pale_skin    12774
461      2049                  covered_navel    12752
462       881                      full_moon    12740
463      1529                         blouse    12646
464     51172                       cardigan    12591
465      4215                     on_stomach    12552
466      4107                         nature    12534
467     24856                     looking_up    12488
468      2941                          crown    12440
469      3955                          grass    12400
470        44                          cross    12386
471      2153                          broom    12336
472      1487                         no_hat    12319
473     20861                  skindentation    12308
474     84687                     bikini_top    12246
475       472                          towel    12208
476      1634                   dual_persona    12169
477      9680                         sheath    12149
478      3507                          plant    11982
479      1927                    twin_drills    11959
480     17892                      red_dress    11955
481      3683                         bottle    11865
482     36789                between_breasts    11851
483      1066                        loafers    11830
484       361                 ground_vehicle    11820
485      2008                       hair_bun    11817
486     12642                  low_twintails    11796
487     16151                          soles    11762
488      1857                  front-tie_top    11680
489     13999                     halterneck    11650
490     19574                           ring    11614
491     17803                   looking_down    11568
492      6547                           kiss    11562
493       636                   parted_bangs    11509
494     18715                  lavender_hair    11506
495     24967            heart-shaped_pupils    11472
496      2102                     demon_girl    11469
497    111143                        headset    11442
498      9579                       tank_top    11428
499      4014                 multiple_views    11353
500      6527                          tiara    11348
501     16056                    open_jacket    11278
502      7027                        bob_cut    11233
503      8160                         muscle    11220
504     21763                          phone    11189
505     24561               sleeveless_dress    11180
506      7923                     strap_slip    11158
507      2946                        hat_bow    11126
508      4948                      scrunchie    11121
509      1988                       fellatio    11119

And now let's build the training targets

Using some simple SQL queries.



In [8]:

    
def get_dbconnect_string(json_path, db_interface="postgresql+psycopg2"):
    with open(json_path, "rb") as fd:
        configuration = json.loads(fd.read())
    return db_interface + "://{user}:{password}@{host}:{port}/{database}".format(**configuration)
db_configuration = get_dbconnect_string("database.json")



In [9]:

    
%%sql $db_configuration
        SELECT MIN(id), MAX(id), COUNT(*) FROM posts









    



1 rows affected.






    Out[9]:





    
        min
        max
        count
    
    
        1
        2900000
        2867507



In [10]:

    
post_id_list = posts.id.tolist()
tag_id_list = tag_count_filtered.id.tolist()



In [11]:

    
# This took a very long time
tagged = %sql SELECT * FROM tagged WHERE tag_id = ANY(:tag_id_list) AND post_id = ANY(:post_id_list)









    



32776310 rows affected.



In [12]:

    
# And this took also quite long because my memory was full
tagged = np.fromiter(map(tuple, tagged), dtype=[("tag_id", np.int32), ("post_id", np.int32)], count=len(tagged))



In [13]:

    
tagged_series = pd.Series(tagged["tag_id"], index=tagged["post_id"])
tagged_series.sort_index(inplace=True)



In [14]:

    
tagged_series.to_hdf("metadata.h5", "tagged", mode="a", complevel=9, complib="bzip2")

Now we need targets with fixed dimension

Having the data is nice and all but we need it in the right format.



In [15]:

    
tagged = pd.read_hdf("metadata.h5", "tagged")



In [16]:

    
tagged_count = tagged.groupby(level=0).count()
tagged_descr = tagged_count.describe()
print(tagged_descr.to_string())









    



count    2.061472e+06
mean     1.589947e+01
std      7.888640e+00
min      1.000000e+00
25%      1.000000e+01
50%      1.500000e+01
75%      2.000000e+01
max      2.070000e+02



In [17]:

    
tagged_count.groupby(by=tagged_count).count().plot()
plt.xlim(tagged_descr.loc["min"], tagged_descr.loc["max"])
plt.axvspan(tagged_descr.loc["mean"] - tagged_descr.loc["std"], tagged_descr.loc["mean"] + tagged_descr.loc["std"], color="green", alpha=0.3)
plt.legend(["distribution", "standard derivation"])
plt.xlabel("Number of tags per image")
plt.ylabel("Number of images")
plt.title("Distribution of relevant tags per image")









    Out[17]:





Text(0.5,1,'Distribution of relevant tags per image')

This is a really interesting plot since the tag count distribution is more than obviously normal distributed and thus one could integrate over the plot to e.g. remove outliers. However, I don't deem this necessary here.



In [18]:

    
# This is for creating the target matrix
nr_posts = int(tagged_descr.loc["count"])
nr_tags = len(tag_count_filtered)
target = np.zeros((nr_posts, nr_tags), dtype=np.bool)
tag_ids = tag_count_filtered.sort_values("name").id.values
for i, post_id in enumerate(tagged_count.index):
    target[i] = np.isin(tag_ids, tagged.loc[post_id])



In [19]:

    
assert target.sum() == tagged.isin(tag_count_filtered.id).sum()



In [20]:

    
index = pd.MultiIndex.from_arrays(tag_count_filtered.sort_values("name").iloc[:,0:2].T.values,
                                  names=("id", "name"))
target_frame = pd.DataFrame(target, index=tagged_count.index, columns=index)



In [21]:

    
assert (target_frame.dtypes == np.bool).all()
assert target_frame.sum().sum() == tagged.isin(tag_count_filtered.id).sum()



In [22]:

    
target_frame.to_hdf("metadata.h5", "target", mode="a", complevel=9, complib="bzip2")

Manually remove some tags that are not obvious from the images

And now let's build the training targets

Now we need targets with fixed dimension

Done