First, we load the AVA dataset, as provided on the website.


In [1]:
import os
import pandas
import numpy as np

# Directory holding the AVA dataset files read by load_ava_df()
# (tags.txt, challenges.txt and AVA.txt).
AVA_PATH = 'data_static/AVA_dataset'

def load_ava_df(ava_path=None):
    """
    Load the whole AVA dataset as a DataFrame indexed by image id, with columns
        ratings: array of 10 ints (vote counts for the scores 1..10),
        mean_rating: float (vote-weighted mean of the scores 1..10),
        semantic_tag_X_id: int, semantic_tag_X_name: string (for X in [1, 2]),
        challenge_id: int, challenge_name: string.

    A name is NaN when its id does not appear in the corresponding
    id-name file.

    Ex. Get ratings for all images with tag of 'Macro':

    >>> ind = (df['semantic_tag_1_name'] == 'Macro') | \
              (df['semantic_tag_2_name'] == 'Macro')
    >>> X = np.vstack(df[ind]['ratings'])
    >>> X.shape
    (19171, 10)
    >>> X.dtype
    dtype('int64')

    Parameters
    ----------
    ava_path: str, optional
        Directory containing tags.txt, challenges.txt and AVA.txt.
        Defaults to the module-level AVA_PATH.

    Returns
    -------
    df: pandas.DataFrame
    """
    if ava_path is None:
        ava_path = AVA_PATH

    def load_ids_and_names(filename, column_name):
        """Read "<id> <name>" lines into a Series of names indexed by int id."""
        ids, names = [], []
        with open(filename, 'r') as f:
            for line in f:
                # example of an (id, name) line: "37 Diptych / Triptych"
                fields = line.split()
                if not fields:
                    # Blank lines (e.g. at the end of the file) carry no entry.
                    continue
                ids.append(int(fields[0]))
                names.append(' '.join(fields[1:]))
        return pandas.Series(names, index=ids, name=column_name, dtype=str)

    # Load the tag and challenge id-name mapping.
    tag_names = load_ids_and_names(
        os.path.join(ava_path, 'tags.txt'), 'semantic_tag_name')
    challenge_names = load_ids_and_names(
        os.path.join(ava_path, 'challenges.txt'), 'challenge_name')

    # Load the main data. genfromtxt returns a 1-D array for a one-line file;
    # atleast_2d keeps it as a single row so the column slicing below holds.
    X = np.atleast_2d(
        np.genfromtxt(os.path.join(ava_path, 'AVA.txt'), dtype=int)).T
    # Column 0 of the file is not used; column 1 is the image id.
    image_id = X[1]
    ratings = X[2:12].T
    # Weight each score 1..10 by its share of the image's votes.
    mean_rating = (
        np.arange(1, 11.) * ratings / ratings.sum(1)[:, np.newaxis]).sum(1)
    df = pandas.DataFrame({
        'ratings': list(ratings), 'mean_rating': mean_rating,
        'semantic_tag_1_id': X[12], 'semantic_tag_2_id': X[13],
        'challenge_id': X[14]
    }, index=image_id)

    # Store the names of the tags and challenges along with the ids.
    # map() looks each id up in the mapping and yields NaN for unknown ids,
    # without relying on index alignment between two frames.
    df['semantic_tag_1_name'] = df['semantic_tag_1_id'].map(tag_names)
    df['semantic_tag_2_name'] = df['semantic_tag_2_id'].map(tag_names)
    df['challenge_name'] = df['challenge_id'].map(challenge_names)

    return df

# Load the dataset once; the cells below all work on this DataFrame.
df = load_ava_df()

In [2]:
# Show the first ten rows as a quick sanity check of the loaded columns.
print(df.head(10))


        challenge_id  mean_rating                               ratings  \
953619          1396     5.637097    [0, 1, 5, 17, 38, 36, 15, 6, 5, 1]   
953958          1396     4.698413  [10, 7, 15, 26, 26, 21, 10, 8, 1, 2]   
954184          1396     5.674603     [0, 0, 4, 8, 41, 56, 10, 3, 4, 0]   
954113          1396     5.773438     [0, 1, 4, 6, 48, 37, 23, 5, 2, 2]   
953980          1396     5.209302     [0, 3, 6, 15, 57, 39, 6, 1, 1, 1]   
954175          1396     5.600000    [0, 0, 5, 13, 40, 53, 14, 1, 3, 1]   
953349          1396     6.101562    [1, 1, 1, 7, 27, 46, 28, 13, 4, 0]   
953645          1396     6.007874     [0, 0, 0, 8, 33, 51, 27, 3, 3, 2]   
953897          1396     6.523438    [0, 0, 0, 5, 19, 46, 29, 22, 5, 2]   
953841          1396     5.984496     [0, 0, 3, 8, 37, 44, 22, 9, 4, 2]   

        semantic_tag_1_id  semantic_tag_2_id semantic_tag_1_name  \
953619                  1                 22            Abstract   
953958                  1                 21            Abstract   
954184                  0                  0                 NaN   
954113                 15                 21              Nature   
953980                 22                 38               Macro   
954175                 15                 65              Nature   
953349                 16                 21              Candid   
953645                  0                  0                 NaN   
953897                  7                 14                 Sky   
953841                 14                 53           Landscape   

             semantic_tag_2_name challenge_name  
953619                     Macro     100_Meters  
953958           Black and White     100_Meters  
954184                       NaN     100_Meters  
954113           Black and White     100_Meters  
953980                    Floral     100_Meters  
954175              Insects, etc     100_Meters  
953349           Black and White     100_Meters  
953645                       NaN     100_Meters  
953897                 Landscape     100_Meters  
953841  High Dynamic Range (HDR)     100_Meters  

The tag frequency plot agrees with the paper.


In [3]:
# Collect every tag name that occurs in either tag slot. NaN (an image with no
# tag in that slot) is dropped so it is not counted as a tag, and the names are
# sorted because a set is unordered and cannot serve as a DataFrame index.
semantic_tags = sorted(
    set(df['semantic_tag_1_name'].dropna()).union(
        set(df['semantic_tag_2_name'].dropna())))
# An image counts once for a tag even if the tag appears in both slots.
tag_frequencies = [
    df[(df['semantic_tag_1_name'] == tag) | (df['semantic_tag_2_name'] == tag)].shape[0]
    for tag in semantic_tags
]
tag_df = pandas.DataFrame(tag_frequencies, index=semantic_tags, columns=['frequency'])
# DataFrame.sort(column=...) is deprecated and absent from later pandas
# releases; sort_values is its replacement (requires pandas >= 0.17).
tag_df = tag_df.sort_values(by='frequency', ascending=False)
# Bar chart of the 30 most frequent tags.
tag_df.iloc[:30].plot(figsize=(8, 4), kind='bar')


/Users/karayev/anaconda/lib/python2.7/site-packages/pandas/core/frame.py:3095: FutureWarning: column is deprecated, use columns
  warnings.warn("column is deprecated, use columns", FutureWarning)
Out[3]:
<matplotlib.axes.AxesSubplot at 0x11b6bf610>

However, the number of images that remain at each delta threshold does not agree with the paper. First, here is the histogram of mean ratings.


In [4]:
# Histogram of the per-image mean rating, in 20 bins.
# NOTE(review): `hist` is not defined in this file -- presumably it comes from
# a pylab-style namespace import in the notebook session; confirm.
_ = hist(df['mean_rating'], bins=20)
# Last expression of the cell: the average of the per-image mean ratings.
df['mean_rating'].mean()


Out[4]:
5.3833261234315826

And here is the number of images whose mean rating is at least delta away from 5, plotted against delta.


In [5]:
# For each delta, count the images whose mean rating lies at or below
# 5 - delta, or at or above 5 + delta.
deltas = [0, .5, 1, 1.5, 2]
num_images = [
    int(((df['mean_rating'] <= 5 - delta) | (df['mean_rating'] >= 5 + delta)).sum())
    for delta in deltas
]
# NOTE(review): `plot`, `xlabel` and `ylabel` are not defined in this file --
# presumably they come from a pylab-style namespace import; confirm.
plot(deltas, num_images, 's--k')
xlabel('delta')
ylabel('# images in dataset')


Out[5]:
<matplotlib.text.Text at 0x11b784750>