First, we load the AVA dataset, as provided on the website.
In [1]:
import os
import pandas
import numpy as np
# Directory holding the AVA dataset files read by load_ava_df
# (tags.txt, challenges.txt and AVA.txt); a relative path.
AVA_PATH = 'data_static/AVA_dataset'
def load_ava_df(ava_path=None):
    """
    Load the whole AVA dataset as a DataFrame indexed by image id.

    Columns:
    ratings: numpy array of 10 ints (vote counts for the scores 1..10),
    mean_rating: float (vote-weighted mean score),
    semantic_tag_X_id: int, semantic_tag_X_name: string (for X in [1, 2]),
    challenge_id: int, challenge_name: string.

    Ids that do not appear in the id-name files get NaN as their name
    (the names are attached with a left join).

    Ex. Get ratings for all images with tag of 'Macro':
    >>> ind = ((df['semantic_tag_1_name'] == 'Macro') |
    ...        (df['semantic_tag_2_name'] == 'Macro'))
    >>> X = np.vstack(df[ind]['ratings'])
    >>> X.shape
    (19171, 10)
    >>> X.dtype
    dtype('int64')

    Parameters
    ----------
    ava_path: str, optional
        Directory containing AVA.txt, tags.txt and challenges.txt.
        Defaults to the module-level AVA_PATH.

    Returns
    -------
    df: pandas.DataFrame
    """
    if ava_path is None:
        ava_path = AVA_PATH

    def load_ids_and_names(filename, column_name):
        """Read '<id> <name ...>' lines into a one-column frame indexed by id."""
        ids, names = [], []
        with open(filename, 'r') as f:
            for line in f:
                # example of an (id, name) line: "37 Diptych / Triptych"
                fields = line.split()
                if not fields:
                    # Blank lines (e.g. trailing newlines) carry no mapping.
                    continue
                ids.append(int(fields[0]))
                names.append(' '.join(fields[1:]))
        # Explicit int64 index so the join keys match the integer id columns
        # even when the file holds no entries.
        return pandas.DataFrame(
            {column_name: names},
            index=pandas.Index(ids, dtype='int64'), dtype=str)

    # Load the tag and challenge id-name mapping.
    tags_df = load_ids_and_names(
        os.path.join(ava_path, 'tags.txt'), 'semantic_tag_name')
    challenges_df = load_ids_and_names(
        os.path.join(ava_path, 'challenges.txt'), 'challenge_name')

    # Load the main data, transposed so that X[i] is the i-th file column.
    # Column 0 is not used here. atleast_2d keeps a single-row file 2-D.
    X = np.atleast_2d(
        np.genfromtxt(os.path.join(ava_path, 'AVA.txt'), dtype=int)).T
    image_id = X[1]
    ratings = X[2:12].T

    # Weight each score 1..10 by its share of the image's votes.
    scores = np.arange(1, 11.)
    mean_rating = (scores * ratings / ratings.sum(1)[:, np.newaxis]).sum(1)

    df = pandas.DataFrame({
        'ratings': list(ratings), 'mean_rating': mean_rating,
        'semantic_tag_1_id': X[12], 'semantic_tag_2_id': X[13],
        'challenge_id': X[14]
    }, index=image_id)

    # Store the names of the tags and challenges along with the ids.
    for tag_column in ('semantic_tag_1', 'semantic_tag_2'):
        df[tag_column + '_name'] = df.join(
            tags_df, on=tag_column + '_id', how='left')['semantic_tag_name']
    df = df.join(challenges_df, on='challenge_id', how='left')
    return df
# Load the dataset once; the cells below all work from this DataFrame.
df = load_ava_df()
In [2]:
# Preview the first ten images; a bare expression gets the notebook's rich
# table rendering instead of the plain-text repr that print() produces.
df.head(10)
The tag frequency plot agrees with the paper.
In [3]:
# A tag name can appear in either of the two tag columns.
tag_1 = df['semantic_tag_1_name']
tag_2 = df['semantic_tag_2_name']
# NaN marks ids with no name in the tag mapping; drop it so it is not
# counted as a tag of its own.
semantic_tags = set(tag_1.dropna()).union(set(tag_2.dropna()))
# Sorted list gives a deterministic order shared by the counts and the index.
tag_names = sorted(semantic_tags)
# An image counts once for a tag even when both columns carry that tag.
tag_frequencies = [int(((tag_1 == tag) | (tag_2 == tag)).sum()) for tag in tag_names]
tag_df = pandas.DataFrame({'frequency': tag_frequencies}, index=tag_names)
# DataFrame.sort(column=...) was removed from pandas; sort_values replaces it.
tag_df = tag_df.sort_values(by='frequency', ascending=False)
# Bar chart of the 30 most frequent tags.
tag_df.iloc[:30].plot(figsize=(8, 4), kind='bar')
Out[3]:
But the statistics based on delta (the distance of an image's mean rating from 5) do not agree with the paper. Here is the histogram of mean ratings.
In [4]:
# Draw the histogram through pandas so the cell does not rely on pylab names
# (hist, plot, ...) having been injected into the namespace.
# grid=False matches the look of a plain matplotlib histogram.
df['mean_rating'].hist(bins=20, grid=False)
# Overall mean of the per-image mean ratings (shown as the cell output).
df['mean_rating'].mean()
Out[4]:
And here is the number of images whose mean rating is at least delta away from 5, plotted against delta.
In [5]:
deltas = [0, .5, 1, 1.5, 2]
mean_ratings = df['mean_rating']
# Count images whose mean rating lies at least delta away from 5, on either
# side; summing the boolean mask avoids building a filtered frame per delta.
num_images = [
    int(((mean_ratings <= 5 - delta) | (mean_ratings >= 5 + delta)).sum())
    for delta in deltas]
# Plot through pandas and label via the returned Axes, so the cell does not
# rely on pylab names (plot, xlabel, ylabel) being present in the namespace.
ax = pandas.Series(num_images, index=deltas).plot(style='s--k')
ax.set_xlabel('delta')
ax.set_ylabel('# images in dataset')
Out[5]: