In [1]:
%matplotlib inline
%load_ext sql
In [2]:
import json
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
In [3]:
# Global plot appearance for the whole notebook. The calls are order-dependent:
# sns.set() installs seaborn's defaults plus the figure size override, and the
# two calls after it then select the grid style and the scaling context.
sns.set(rc={"figure.figsize": (10, 6)})
sns.set_style("whitegrid")
sns.set_context("notebook")
In [4]:
# Read the two tables this notebook starts from out of the local HDF5 store:
# the post metadata and the per-tag usage counts.
posts = pd.read_hdf("metadata.h5", key="posts")
tag_count = pd.read_hdf("metadata.h5", key="tag_count")
In [5]:
# Tag names that are excluded before the most frequent tags are selected.
# Names must match the tag table exactly, i.e. words are joined by underscores.
useless_tags = [
    "lowres",
    "highres",
    "bad_id",
    "bad_pixiv_id",
    "monochrome",
    "censored",
    "alternate_costume",
    "hetero",
    "sketch",
    "yuri",
    "character_name",
    "greyscale",
    "artist_name",
    "artist_request",
    "copyright_request",
    "absurdres",
    "dated",
    "signature",
    "cosplay",
    "translated",
    "copyright_name",
    "traditional_media",
    "twitter_username",
    "alternate_hairstyle",
    "mosaic_censoring",
    "parody",
    "english",
    "gradient",
    "couple",
    "sisters",
    "cover_page",
    "crossover",
    "uncensored",
    "official_art",
    "letterboxed",
    "translation_request",
    "scan",
    "game_cg",
    "remodel_(kantai_collection)",
    "depth_of_field",
    "convenient_censoring",
    "foreshortening",
    "watermark",
    "genderswap",
    "adapted_costume",
    "pov",
    "wallpaper",
    "pokemon_(creature)",
    "text",
    "kemonomimi_mode",
    "shinkaisei-kan",
    "genderswap_(mtf)",
    "personification",
    "blurry",
    "wind",
    "younger",
    "cover",
    # Tags for the background
    "white_background",
    "grey_background",
    "gradient_background",
    "blue_background",
    "pink_background",
    # And this is about the "camera" position
    # ("cowboy_shot" was previously spelled with a space and therefore never matched)
    "cowboy_shot",
    "dutch_angle",
    "full_body",
    "upper_body",
]
In [6]:
# Drop the excluded tags and keep the first 509 remaining rows. The odd cut-off
# is deliberate: it is the smallest window that still contains the 'kiss' tag.
is_relevant = ~tag_count.name.isin(useless_tags)
tag_count_filtered = tag_count[is_relevant].iloc[:509]
# Re-label the surviving rows 1..N so the index reads as a rank.
tag_count_filtered.index = pd.RangeIndex(1, len(tag_count_filtered) + 1)
# Also add the three ratings to the tags!
In [7]:
# Print the complete filtered tag table; to_string() renders every row instead
# of the shortened default representation.
print(tag_count_filtered.to_string())
In [8]:
def get_dbconnect_string(json_path, db_interface="postgresql+psycopg2"):
    """Build a database connection URL from a JSON configuration file.

    Parameters
    ----------
    json_path : str
        Path to a JSON file containing at least the keys ``user``,
        ``password``, ``host``, ``port`` and ``database``. Additional keys
        are ignored.
    db_interface : str, optional
        Dialect/driver prefix placed in front of ``://``.

    Returns
    -------
    str
        ``<db_interface>://<user>:<password>@<host>:<port>/<database>``

    Raises
    ------
    KeyError
        If one of the required keys is missing from the configuration.
    """
    with open(json_path, "rb") as fd:
        configuration = json.load(fd)
    # User name and password may contain characters that are reserved inside a
    # URL ('@', ':', '/', ...). Percent-encode them so they cannot be mistaken
    # for URL delimiters; values without such characters are left unchanged.
    url_parts = dict(configuration)
    for key in ("user", "password"):
        url_parts[key] = quote_plus(str(configuration[key]))
    return db_interface + "://{user}:{password}@{host}:{port}/{database}".format(**url_parts)
# Connection URL assembled from the local "database.json" file; it is passed to
# the %%sql magic below via $db_configuration.
db_configuration = get_dbconnect_string("database.json")
In [9]:
%%sql $db_configuration
SELECT MIN(id), MAX(id), COUNT(*) FROM posts
Out[9]:
In [10]:
# Plain Python lists of the post ids and of the selected tag ids, used as bind
# parameters by the SQL query in the next cell.
post_id_list = posts["id"].tolist()
tag_id_list = tag_count_filtered["id"].tolist()
In [11]:
# Fetch every row of the `tagged` table whose tag is one of the selected tags and
# whose post is one of the known posts. The :tag_id_list / :post_id_list
# placeholders are expected to be bound to the Python lists of the same name.
# This took a very long time
tagged = %sql SELECT * FROM tagged WHERE tag_id = ANY(:tag_id_list) AND post_id = ANY(:post_id_list)
In [12]:
# Pack the query result into a structured array with two int32 fields.
# On the original run this step was slow as well because memory was full.
# NOTE(review): assumes each result row is (tag_id, post_id) in that order -
# confirm against the column order of the `tagged` table.
pair_dtype = [("tag_id", np.int32), ("post_id", np.int32)]
tagged = np.fromiter(
    (tuple(row) for row in tagged),
    dtype=pair_dtype,
    count=len(tagged),
)
In [13]:
# One entry per (post, tag) pair: the index holds the post id, the value holds
# the tag id. Sorted by post id.
tagged_series = pd.Series(tagged["tag_id"], index=tagged["post_id"]).sort_index()
In [14]:
# Append the association series to the HDF5 store under the key "tagged",
# using the highest bzip2 compression level.
tagged_series.to_hdf(
    "metadata.h5",
    key="tagged",
    mode="a",
    complevel=9,
    complib="bzip2",
)
In [15]:
# Read the association series back from the HDF5 store (presumably so the
# notebook can resume from here without repeating the database query).
tagged = pd.read_hdf("metadata.h5", key="tagged")
In [16]:
# Number of selected tags attached to each post (index level 0 is the post id),
# followed by summary statistics of that per-post count.
tags_by_post = tagged.groupby(level=0)
tagged_count = tags_by_post.count()
tagged_descr = tagged_count.describe()
print(tagged_descr.to_string())
In [17]:
# x axis: number of selected tags on an image; y axis: how many images carry
# exactly that many tags.
ax = tagged_count.groupby(by=tagged_count).count().plot()
ax.set_xlim(tagged_descr.loc["min"], tagged_descr.loc["max"])
# Shade the band of one standard deviation on either side of the mean.
tags_mean = tagged_descr.loc["mean"]
tags_std = tagged_descr.loc["std"]
ax.axvspan(tags_mean - tags_std, tags_mean + tags_std, color="green", alpha=0.3)
# Legend label corrected from "standard derivation".
ax.legend(["distribution", "standard deviation"])
ax.set_xlabel("Number of tags per image")
ax.set_ylabel("Number of images")
ax.set_title("Distribution of relevant tags per image")
Out[17]:
This is a really interesting plot: the distribution of tag counts looks very close to a normal distribution, so one could, for example, integrate over the curve to choose cut-offs and remove outliers. However, I don't deem this necessary here.
In [18]:
# Build the boolean target matrix: one row per post (in the order of
# tagged_count.index) and one column per selected tag (ordered by tag name).
nr_posts = int(tagged_descr.loc["count"])
nr_tags = len(tag_count_filtered)
# Use the builtin bool: the np.bool alias was deprecated in NumPy 1.20 and
# removed in NumPy 1.24, where it raises AttributeError.
target = np.zeros((nr_posts, nr_tags), dtype=bool)
tag_ids = tag_count_filtered.sort_values("name").id.values
for i, post_id in enumerate(tagged_count.index):
    # Mark every column whose tag id occurs among the tags of this post.
    target[i] = np.isin(tag_ids, tagged.loc[post_id])
In [19]:
# Sanity check: the number of set cells in the matrix must equal the number of
# association entries whose tag is one of the selected tags.
expected_hits = tagged.isin(tag_count_filtered.id).sum()
assert target.sum() == expected_hits
In [20]:
# Label the matrix: rows by post id, columns by an (id, name) pair per tag, in
# the same name-sorted order used when the matrix was filled.
# NOTE(review): relies on the first two columns of tag_count_filtered being the
# tag id and the tag name - confirm against the stored table.
tags_by_name = tag_count_filtered.sort_values("name")
id_and_name = tags_by_name.iloc[:, 0:2].T.values
index = pd.MultiIndex.from_arrays(id_and_name, names=("id", "name"))
target_frame = pd.DataFrame(target, index=tagged_count.index, columns=index)
In [21]:
# Every column must still be boolean after wrapping the matrix in a DataFrame.
# Compare against the builtin bool: np.bool no longer exists in NumPy >= 1.24.
assert (target_frame.dtypes == bool).all()
# The frame must contain exactly as many set cells as there are association
# entries for the selected tags.
assert target_frame.sum().sum() == tagged.isin(tag_count_filtered.id).sum()
In [22]:
# Append the labelled target matrix to the HDF5 store under the key "target",
# using the highest bzip2 compression level.
target_frame.to_hdf(
    "metadata.h5",
    key="target",
    mode="a",
    complevel=9,
    complib="bzip2",
)