Exploring Label Distribution and Converting Distribution to Bins of Values


In [1]:
import math
import itertools
from collections import Counter

import numpy as np
import scipy as sp
from pymongo import collection
import matplotlib.pyplot as plt
%matplotlib inline

from src import *
from src.mongodb import *
from src.datasets import *
from src.experiments import *
from data import APPID_DICT

Games


In [2]:
list(APPID_DICT.keys())


Out[2]:
['Garrys_Mod',
 'Counter_Strike',
 'Counter_Strike_Global_Offensive',
 'Football_Manager_2015',
 'Warframe',
 'Grand_Theft_Auto_V',
 'Team_Fortress_2',
 'The_Elder_Scrolls_V',
 'Dota_2',
 'Arma_3',
 'sample',
 'Sid_Meiers_Civilization_5']

Labels


In [9]:
list(LABELS)


Out[9]:
['total_game_hours_bin',
 'num_reviews',
 'found_helpful_percentage',
 'num_voted_helpfulness',
 'num_achievements_percentage',
 'num_games_owned',
 'num_workshop_items',
 'num_found_funny',
 'num_achievements_attained',
 'num_found_helpful',
 'num_guides',
 'num_friends',
 'num_screenshots',
 'num_found_unhelpful',
 'total_game_hours_last_two_weeks',
 'num_comments',
 'total_game_hours',
 'num_achievements_possible',
 'friend_player_level',
 'num_groups',
 'num_badges']

Issues

  • The main concern is that, in order to know whether it will be potentially interesting and worth exploration to do experiments with a certain label, it is necessary to know
    1. if it can be used as is (raw values), which is unlikely, and, if not,
    2. how its distribution can be carved up (specifically, what values for nbins and bin_factor to use in learn, etc.), and
    3. whether or not the current algorithm for deciding on the range of included values (i.e., excluding outliers) and making the value bins works or if it needs to be automated somehow (i.e., even potentially using some kind of cluster analysis, perhaps)

Proposed Plan of Action

  • Some of this information can be collected via functions in the experiments extension, specifically distributional_info and evenly_distribute_samples
  • Collect data on the distributions of all of the labels for a subset of games and explore the way that the values are distributed, considering alternate ways that the values could be clustered together

In [4]:
# Connect to reviews collection
db = connect_to_db(host='localhost', port=37017)

In [15]:
def do_some_distributional_research(db: collection, game: str,
                                    labels: list = LABELS,
                                    partition: str = 'all'):
    """
    Run the `distributional_info` function and then apply some
    transformations, `nbins`/`bin_factor` values, etc., to the
    results.

    Generates distributional information for each combination
    of label, number of bins, bin factor, and transformation.

    :param db: MongoDB collection
    :type db: collection
    :param game: name of game
    :type game: str
    :param labels: list of labels
    :type labels: list
    :param partition: name of data partition (or 'all' to use all
                      data)
    :type partition: str

    :yields: tuple of dictionary containing label value
             distribution information and a list of the original
             label values
    :ytype: tuple
    """

    # Get distributional data for each label via the
    # `distributional_info` function and make some plots, etc.
    transformations = {'None': None,
                       'ln': lambda x: np.log(x) if x > 1 else 0.0,
                       '**5': lambda x: x**5.0,
                       '**2': lambda x: x**2.0,
                       '**0.5': lambda x: x**0.5,
                       '**0.25': lambda x: x**0.25}
    nbins_values = [None, 2, 3, 4, 5]
    bin_factor_values = [None, 0.25, 0.5, 5.0, 8.0, 10.0]
    filtered_nbins_bin_factor_product = \
        filter(lambda x: ((x[0] == None and x[1] == None)
                          or (x[0] != None)),
               itertools.product(nbins_values, bin_factor_values))

    transformations_dict = {transformation: {} for transformation
                            in transformations}
    stats_dicts = {str(label): dict(transformations_dict)
                   for label in labels}
    for label in labels:

        # Get all raw label values and convert to floats
        raw_label_values = \
            (list(distributional_info(db,
                                      label,
                                      [game],
                                      partition)
                                      ['id_strings_labels_dict'].values()))
        raw_label_values = np.array([float(val) for val in raw_label_values])
        raw_values_to_return = raw_label_values

        # If the label has percentage values, i.e., values between
        # 0.0 and 1.0 (inclusive), multiply the values by 100 before
        # doing anything else
        # Note: Define these specific labels somewhere!
        if label in LABELS_WITH_PCT_VALUES:
            raw_label_values *= 100.0

        # Apply various types of transformations to the data and
        # measure the normality of the resulting distribution, etc.
        for transformation, transformer in transformations.items():
            if transformer:
                label_values = np.array([transformer(x)
                                         for x in raw_label_values])
            else:
                label_values = np.array(raw_label_values)

            # Apply various combinations of `nbins`/`bin_factor`
            # values (including not specifying those values)
            label_transformation_string = '{0}_{1}'.format(label, transformation)
            for nbins, bin_factor in filtered_nbins_bin_factor_product:
                nbins_bin_factor_string = '{0}_{1}'.format(nbins, bin_factor)
                stats_dict = {}

                # Don't bin the values if `nbins` and `bin_factor` are
                # unspecified
                if not nbins and not bin_factor:
                    pass
                else:

                    # Get min/max values
                    _min = np.floor(label_values.min())
                    _max = np.ceil(label_values.max())

                    # If `bin_factor` is unspecified, use the default
                    # value, 1.0
                    bin_factor = bin_factor if bin_factor else 1.0

                    # Get bin range tuples and validate
                    try:
                        bin_ranges = get_bin_ranges(_min, _max, nbins,
                                                    bin_factor)
                    except ValueError as e:
                        print('Encountered invalid bin_ranges:\n\t'
                              'nbins: {0}\n\tbin_factor: {1}\n\tmin: '
                              '{2}\n\tmax: {3}\n\ttransformation: {4}'
                              '\n\tlabel: {5}'
                              .format(nbins, bin_factor, _min, _max,
                                      transformation, label))
                        continue

                    # Convert raw values
                    stats_dict['bin_ranges'] = bin_ranges
                    label_values = np.array([get_bin(bin_ranges, val)
                                             for val in label_values])
                    stats_dict['label_values'] = label_values

                # Collect some stats and measurements
                stats_dict.update({'min': label_values.min(),
                                   'max': label_values.max(),
                                   'std': label_values.std(),
                                   'mean': label_values.mean(),
                                   'median': np.median(label_values),
                                   'mode': sp.stats.mode(label_values).mode[0],
                                   'normaltest': sp.stats.normaltest(label_values)})

            yield ({label_transformation_string: {nbins_bin_factor_string: stats_dict}},
                   raw_values_to_return)

In [ ]:
# Let's build up a dictionary of distributional information for each label and
# for each in a random subset of 3 games
# Execute a number of times until you get the subset you want
games_subset = list(np.random.choice([game for game in APPID_DICT
                                      if not game.startswith('sample')],
                                      3, replace=False))
dist_info_dict = {}
for game in games_subset:
    try:
        if dist_info_dict.get(game):
            continue
        dist_info_dict[game] = do_some_distributional_research(db, game)
    except ValueError as e:
        continue

In [ ]:
# Each game will have 21 different outputs, so let's break things up a bit
dist_info_dict_Arma_3 = dist_info_dict['Arma_3']
dist_info_dict_Team_Fortress_2 = dist_info_dict['Team_Fortress_2']
dist_info_dict_Counter_Strike = dist_info_dict['Counter_Strike']

In [13]:
Arma_3_stats_dicts_all_labels_all_data = do_some_distributional_research(db, 'Arma_3')

In [17]:
next(Arma_3_stats_dicts_all_labels_all_data)


Out[17]:
({'total_game_hours_bin_None': {'5_10.0': {}}},
 array([ 2.,  1.,  3., ...,  1.,  1.,  2.]))

Examining the Distribution of Labels for Arma 3


In [ ]:
dist_info_dict_Arma_3.keys()

num_reviews


In [ ]:
dist_info_dict_Arma_3['num_reviews']['labels_counter']

In [ ]:
# Use `get_bin_ranges` to determine the ranges of bins
num_reviews_Arma_3 = dist_info_dict_Arma_3['num_reviews']['labels_counter']
num_reviews_Arma_3_values = np.array(list(num_reviews_Arma_3.keys()))
num_reviews_Arma_3_min_value = num_reviews_Arma_3_values.min()
num_reviews_Arma_3_max_value = num_reviews_Arma_3_values.max()
num_reviews_Arma_3_bin_ranges_3_1 = get_bin_ranges(num_reviews_Arma_3_min_value,
                                                   num_reviews_Arma_3_max_value,
                                                   nbins=3,
                                                   factor=1.0)
num_reviews_Arma_3_bin_ranges_3_1_5 = get_bin_ranges(num_reviews_Arma_3_min_value,
                                                     num_reviews_Arma_3_max_value,
                                                     nbins=3,
                                                     factor=1.5)
num_reviews_Arma_3_bin_ranges_3_2 = get_bin_ranges(num_reviews_Arma_3_min_value,
                                                   num_reviews_Arma_3_max_value,
                                                   nbins=3,
                                                   factor=2.0)
num_reviews_Arma_3_bin_ranges_3_3 = get_bin_ranges(num_reviews_Arma_3_min_value,
                                                   num_reviews_Arma_3_max_value,
                                                   nbins=3,
                                                   factor=3.0)
num_reviews_Arma_3_bin_ranges_2_3 = get_bin_ranges(num_reviews_Arma_3_min_value,
                                                   num_reviews_Arma_3_max_value,
                                                   nbins=2,
                                                   factor=3.0)
num_reviews_Arma_3_bin_ranges_2_10 = get_bin_ranges(num_reviews_Arma_3_min_value,
                                                   num_reviews_Arma_3_max_value,
                                                   nbins=2,
                                                   factor=10.0)
print("bins = 3, bin_factor = 1.0: {}".format(num_reviews_Arma_3_bin_ranges_3_1))
print("bins = 3, bin_factor = 1.5: {}".format(num_reviews_Arma_3_bin_ranges_3_1_5))
print("bins = 3, bin_factor = 2.0: {}".format(num_reviews_Arma_3_bin_ranges_3_2))
print("bins = 3, bin_factor = 3.0: {}".format(num_reviews_Arma_3_bin_ranges_3_3))
print("bins = 2, bin_factor = 3.0: {}".format(num_reviews_Arma_3_bin_ranges_2_3))
print("bins = 2, bin_factor = 10.0: {}".format(num_reviews_Arma_3_bin_ranges_2_10))

In [ ]:
num_reviews_raw_label_values_Arma_3 = list(dist_info_dict_Arma_3['num_reviews']['id_strings_labels_dict'].values())

In [ ]:
plt.hist(list(np.random.normal(200, 100, 1000)))
plt.title("Normal Distribution Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

In [ ]:
plt.hist(num_reviews_raw_label_values_Arma_3)
plt.title("Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

In [ ]:
plt.hist(num_reviews_raw_label_values_Arma_3, normed=True)
plt.title("Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

In [ ]:
plt.hist([np.log(x) for x in num_reviews_raw_label_values_Arma_3 if x != 0])
plt.title("Log Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

In [ ]:
plt.hist([np.log(x) for x in num_reviews_raw_label_values_Arma_3 if x != 0],
         normed=True)
plt.title("Log Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

In [ ]:
plt.hist([np.log(x) for x in num_reviews_raw_label_values_Arma_3 if x != 0],
         normed=True, cumulative=True)
plt.title("Log Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

In [ ]:
plt.hist([np.log(x + 1) for x in num_reviews_raw_label_values_Arma_3])
plt.title("Log(x + 1) Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

In [ ]:
plt.hist([np.log2(x + 1) for x in num_reviews_raw_label_values_Arma_3])
plt.title("Log2(x + 1) Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

In [ ]:
plt.hist([np.log10(x + 1) for x in num_reviews_raw_label_values_Arma_3])
plt.title("Log10(x + 1) Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

In [ ]:
sp.stats.mstats.zscore(num_reviews_raw_label_values_Arma_3)

In [ ]:
plt.hist(sp.stats.mstats.zscore(num_reviews_raw_label_values_Arma_3))
plt.title("z-score num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

In [ ]:
plt.hist([math.sqrt(x) for x in num_reviews_raw_label_values_Arma_3])
plt.title("sqrt(x) Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

In [ ]:
plt.hist([x**2 for x in num_reviews_raw_label_values_Arma_3])
plt.title("x^2 Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

total_game_hours_bin


In [ ]:
dist_info_dict_Arma_3['total_game_hours_bin']['labels_counter']

total_game_hours


In [ ]:
dist_info_dict_Arma_3['total_game_hours']['labels_counter']

In [ ]:
total_game_hours_raw_label_values_Arma_3 = list(dist_info_dict_Arma_3['total_game_hours']['id_strings_labels_dict'].values())

In [ ]:
plt.hist([x**0.25 for x in total_game_hours_raw_label_values_Arma_3])
plt.title("Log x Arma_3 total_game_hours Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

total_game_hours_last_two_weeks


In [ ]:
dist_info_dict_Arma_3['total_game_hours_last_two_weeks']['labels_counter']

num_found_helpful


In [ ]:
dist_info_dict_Arma_3['num_found_helpful']['labels_counter']

num_found_unhelpful


In [ ]:
dist_info_dict_Arma_3['num_found_unhelpful']['labels_counter']

found_helpful_percentage


In [ ]:
dist_info_dict_Arma_3['found_helpful_percentage']['labels_counter']

num_voted_helpfulness


In [ ]:
dist_info_dict_Arma_3['num_voted_helpfulness']['labels_counter']

num_achievements_attained


In [ ]:
dist_info_dict_Arma_3['num_achievements_attained']['labels_counter']

In [ ]:
num_achievements_attained_raw_label_values_Arma_3 = list(dist_info_dict_Arma_3['num_achievements_attained']['id_strings_labels_dict'].values())

In [ ]:
plt.hist([np.log(x) for x in num_achievements_attained_raw_label_values_Arma_3 if x != 0])
plt.title("Log Arma_3 num_achievements_attained Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

num_achievements_percentage


In [ ]:
dist_info_dict_Arma_3['num_achievements_percentage']['labels_counter']

In [ ]:
num_achievements_percentage_raw_label_values_Arma_3 = list(dist_info_dict_Arma_3['num_achievements_percentage']['id_strings_labels_dict'].values())

In [ ]:
plt.hist(num_achievements_percentage_raw_label_values_Arma_3)
plt.title("Arma_3 num_achievements_percentage Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

num_achievements_possible


In [ ]:
dist_info_dict_Arma_3['num_achievements_possible']['labels_counter']

num_guides


In [ ]:
dist_info_dict_Arma_3['num_guides']['labels_counter']

num_workshop_items


In [ ]:
dist_info_dict_Arma_3['num_workshop_items']['labels_counter']

num_friends


In [ ]:
num_friends_raw_label_values_Arma_3 = list(dist_info_dict_Arma_3['num_friends']['id_strings_labels_dict'].values())

In [ ]:
plt.hist([np.log(x) for x in num_friends_raw_label_values_Arma_3 if x != 0])
plt.title("Log Arma_3 num_friends Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

num_games_owned


In [ ]:
dist_info_dict_Arma_3['num_games_owned']['labels_counter']

num_comments


In [ ]:
dist_info_dict_Arma_3['num_comments']['labels_counter']

friend_player_level


In [ ]:
dist_info_dict_Arma_3['friend_player_level']['labels_counter']

num_groups


In [ ]:
dist_info_dict_Arma_3['num_groups']['labels_counter']

num_screenshots


In [ ]:
dist_info_dict_Arma_3['num_screenshots']['labels_counter']

num_badges


In [ ]:
dist_info_dict_Arma_3['num_badges']['labels_counter']

num_found_funny


In [ ]:
dist_info_dict_Arma_3['num_found_funny']['labels_counter']

Examining the Distribution of Labels for Team Fortress 2


In [ ]:
for label in dist_info_dict_Team_Fortress_2:
    print("Label = {}\n".format(label))
    print("{}\n".format(dist_info_dict_Team_Fortress_2[label]['labels_counter']))

Examining the Distribution of Labels for Counter Strike


In [ ]:
for label in dist_info_dict_Counter_Strike:
    print("Label = {}\n".format(label))
    print("{}\n".format(dist_info_dict_Counter_Strike[label]['labels_counter']))