Exploring Label Distribution and Converting Distribution to Bins of Values



In [1]:

    
import math
import itertools
from collections import Counter

import numpy as np
import scipy as sp
from pymongo import collection
import matplotlib.pyplot as plt
%matplotlib inline

from src import *
from src.mongodb import *
from src.datasets import *
from src.experiments import *
from data import APPID_DICT

Games



In [2]:

    
list(APPID_DICT.keys())









    Out[2]:





['Garrys_Mod',
 'Counter_Strike',
 'Counter_Strike_Global_Offensive',
 'Football_Manager_2015',
 'Warframe',
 'Grand_Theft_Auto_V',
 'Team_Fortress_2',
 'The_Elder_Scrolls_V',
 'Dota_2',
 'Arma_3',
 'sample',
 'Sid_Meiers_Civilization_5']

Labels



In [9]:

    
list(LABELS)









    Out[9]:





['total_game_hours_bin',
 'num_reviews',
 'found_helpful_percentage',
 'num_voted_helpfulness',
 'num_achievements_percentage',
 'num_games_owned',
 'num_workshop_items',
 'num_found_funny',
 'num_achievements_attained',
 'num_found_helpful',
 'num_guides',
 'num_friends',
 'num_screenshots',
 'num_found_unhelpful',
 'total_game_hours_last_two_weeks',
 'num_comments',
 'total_game_hours',
 'num_achievements_possible',
 'friend_player_level',
 'num_groups',
 'num_badges']

Issues

The main concern is that, in order to know whether it will be potentially interesting and worth exploration to do experiments with a certain label, it is necessary to know
1. if it can be used as is (raw values), which is unlikely, and, if not,
2. how its distribution can be carved up (specifically, what values for nbins and bin_factor to use in learn, etc.), and
3. whether or not the current algorithm for deciding on the range of included values (i.e., excluding outliers) and making the value bins works or if it needs to be automated somehow (i.e., even potentially using some kind of cluster analysis, perhaps)

Proposed Plan of Action

Some of this information can be collected via functions in the experiments extension, specifically distributional_info and evenly_distribute_samples
Collect data on the distributions of all of the labels for a subset of games and explore the way that the values are distributed, considering alternate ways that the values could be clustered together



In [4]:

    
# Connect to reviews collection
db = connect_to_db(host='localhost', port=37017)



In [15]:

    
def do_some_distributional_research(db: collection, game: str,
                                    labels: list = LABELS,
                                    partition: str = 'all'):
    """
    Run the `distributional_info` function and then apply some
    transformations, `nbins`/`bin_factor` values, etc., to the
    results.

    Generates distributional information for each combination
    of label, number of bins, bin factor, and transformation.

    :param db: MongoDB collection
    :type db: collection
    :param game: name of game
    :type game: str
    :param labels: list of labels
    :type labels: list
    :param partition: name of data partition (or 'all' to use all
                      data)
    :type partition: str

    :yields: tuple of dictionary containing label value
             distribution information and a list of the original
             label values
    :ytype: tuple
    """

    # Get distributional data for each label via the
    # `distributional_info` function and make some plots, etc.
    transformations = {'None': None,
                       'ln': lambda x: np.log(x) if x > 1 else 0.0,
                       '**5': lambda x: x**5.0,
                       '**2': lambda x: x**2.0,
                       '**0.5': lambda x: x**0.5,
                       '**0.25': lambda x: x**0.25}
    nbins_values = [None, 2, 3, 4, 5]
    bin_factor_values = [None, 0.25, 0.5, 5.0, 8.0, 10.0]
    filtered_nbins_bin_factor_product = \
        filter(lambda x: ((x[0] == None and x[1] == None)
                          or (x[0] != None)),
               itertools.product(nbins_values, bin_factor_values))

    transformations_dict = {transformation: {} for transformation
                            in transformations}
    stats_dicts = {str(label): dict(transformations_dict)
                   for label in labels}
    for label in labels:

        # Get all raw label values and convert to floats
        raw_label_values = \
            (list(distributional_info(db,
                                      label,
                                      [game],
                                      partition)
                                      ['id_strings_labels_dict'].values()))
        raw_label_values = np.array([float(val) for val in raw_label_values])
        raw_values_to_return = raw_label_values

        # If the label has percentage values, i.e., values between
        # 0.0 and 1.0 (inclusive), multiply the values by 100 before
        # doing anything else
        # Note: Define these specific labels somewhere!
        if label in LABELS_WITH_PCT_VALUES:
            raw_label_values *= 100.0

        # Apply various types of transformations to the data and
        # measure the normality of the resulting distribution, etc.
        for transformation, transformer in transformations.items():
            if transformer:
                label_values = np.array([transformer(x)
                                         for x in raw_label_values])
            else:
                label_values = np.array(raw_label_values)

            # Apply various combinations of `nbins`/`bin_factor`
            # values (including not specifying those values)
            label_transformation_string = '{0}_{1}'.format(label, transformation)
            for nbins, bin_factor in filtered_nbins_bin_factor_product:
                nbins_bin_factor_string = '{0}_{1}'.format(nbins, bin_factor)
                stats_dict = {}

                # Don't bin the values if `nbins` and `bin_factor` are
                # unspecified
                if not nbins and not bin_factor:
                    pass
                else:

                    # Get min/max values
                    _min = np.floor(label_values.min())
                    _max = np.ceil(label_values.max())

                    # If `bin_factor` is unspecified, use the default
                    # value, 1.0
                    bin_factor = bin_factor if bin_factor else 1.0

                    # Get bin range tuples and validate
                    try:
                        bin_ranges = get_bin_ranges(_min, _max, nbins,
                                                    bin_factor)
                    except ValueError as e:
                        print('Encountered invalid bin_ranges:\n\t'
                              'nbins: {0}\n\tbin_factor: {1}\n\tmin: '
                              '{2}\n\tmax: {3}\n\ttransformation: {4}'
                              '\n\tlabel: {5}'
                              .format(nbins, bin_factor, _min, _max,
                                      transformation, label))
                        continue

                    # Convert raw values
                    stats_dict['bin_ranges'] = bin_ranges
                    label_values = np.array([get_bin(bin_ranges, val)
                                             for val in label_values])
                    stats_dict['label_values'] = label_values

                # Collect some stats and measurements
                stats_dict.update({'min': label_values.min(),
                                   'max': label_values.max(),
                                   'std': label_values.std(),
                                   'mean': label_values.mean(),
                                   'median': np.median(label_values),
                                   'mode': sp.stats.mode(label_values).mode[0],
                                   'normaltest': sp.stats.normaltest(label_values)})

            yield ({label_transformation_string: {nbins_bin_factor_string: stats_dict}},
                   raw_values_to_return)



In [ ]:

    
# Let's build up a dictionary of distributional information for each label and
# for each in a random subset of 3 games
# Execute a number of times until you get the subset you want
games_subset = list(np.random.choice([game for game in APPID_DICT
                                      if not game.startswith('sample')],
                                      3, replace=False))
dist_info_dict = {}
for game in games_subset:
    try:
        if dist_info_dict.get(game):
            continue
        dist_info_dict[game] = do_some_distributional_research(db, game)
    except ValueError as e:
        continue



In [ ]:

    
# Each game will have 21 different outputs, so let's break things up a bit
dist_info_dict_Arma_3 = dist_info_dict['Arma_3']
dist_info_dict_Team_Fortress_2 = dist_info_dict['Team_Fortress_2']
dist_info_dict_Counter_Strike = dist_info_dict['Counter_Strike']



In [13]:

    
Arma_3_stats_dicts_all_labels_all_data = do_some_distributional_research(db, 'Arma_3')



In [17]:

    
next(Arma_3_stats_dicts_all_labels_all_data)









    Out[17]:





({'total_game_hours_bin_None': {'5_10.0': {}}},
 array([ 2.,  1.,  3., ...,  1.,  1.,  2.]))

Examining the Distribution of Labels for Arma 3



In [ ]:

    
dist_info_dict_Arma_3.keys()

num_reviews



In [ ]:

    
dist_info_dict_Arma_3['num_reviews']['labels_counter']



In [ ]:

    
# Use `get_bin_ranges` to determine the ranges of bins
num_reviews_Arma_3 = dist_info_dict_Arma_3['num_reviews']['labels_counter']
num_reviews_Arma_3_values = np.array(list(num_reviews_Arma_3.keys()))
num_reviews_Arma_3_min_value = num_reviews_Arma_3_values.min()
num_reviews_Arma_3_max_value = num_reviews_Arma_3_values.max()
num_reviews_Arma_3_bin_ranges_3_1 = get_bin_ranges(num_reviews_Arma_3_min_value,
                                                   num_reviews_Arma_3_max_value,
                                                   nbins=3,
                                                   factor=1.0)
num_reviews_Arma_3_bin_ranges_3_1_5 = get_bin_ranges(num_reviews_Arma_3_min_value,
                                                     num_reviews_Arma_3_max_value,
                                                     nbins=3,
                                                     factor=1.5)
num_reviews_Arma_3_bin_ranges_3_2 = get_bin_ranges(num_reviews_Arma_3_min_value,
                                                   num_reviews_Arma_3_max_value,
                                                   nbins=3,
                                                   factor=2.0)
num_reviews_Arma_3_bin_ranges_3_3 = get_bin_ranges(num_reviews_Arma_3_min_value,
                                                   num_reviews_Arma_3_max_value,
                                                   nbins=3,
                                                   factor=3.0)
num_reviews_Arma_3_bin_ranges_2_3 = get_bin_ranges(num_reviews_Arma_3_min_value,
                                                   num_reviews_Arma_3_max_value,
                                                   nbins=2,
                                                   factor=3.0)
num_reviews_Arma_3_bin_ranges_2_10 = get_bin_ranges(num_reviews_Arma_3_min_value,
                                                   num_reviews_Arma_3_max_value,
                                                   nbins=2,
                                                   factor=10.0)
print("bins = 3, bin_factor = 1.0: {}".format(num_reviews_Arma_3_bin_ranges_3_1))
print("bins = 3, bin_factor = 1.5: {}".format(num_reviews_Arma_3_bin_ranges_3_1_5))
print("bins = 3, bin_factor = 2.0: {}".format(num_reviews_Arma_3_bin_ranges_3_2))
print("bins = 3, bin_factor = 3.0: {}".format(num_reviews_Arma_3_bin_ranges_3_3))
print("bins = 2, bin_factor = 3.0: {}".format(num_reviews_Arma_3_bin_ranges_2_3))
print("bins = 2, bin_factor = 10.0: {}".format(num_reviews_Arma_3_bin_ranges_2_10))



In [ ]:

    
num_reviews_raw_label_values_Arma_3 = list(dist_info_dict_Arma_3['num_reviews']['id_strings_labels_dict'].values())



In [ ]:

    
plt.hist(list(np.random.normal(200, 100, 1000)))
plt.title("Normal Distribution Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")



In [ ]:

    
plt.hist(num_reviews_raw_label_values_Arma_3)
plt.title("Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")



In [ ]:

    
plt.hist(num_reviews_raw_label_values_Arma_3, normed=True)
plt.title("Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")



In [ ]:

    
plt.hist([np.log(x) for x in num_reviews_raw_label_values_Arma_3 if x != 0])
plt.title("Log Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")



In [ ]:

    
plt.hist([np.log(x) for x in num_reviews_raw_label_values_Arma_3 if x != 0],
         normed=True)
plt.title("Log Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")



In [ ]:

    
plt.hist([np.log(x) for x in num_reviews_raw_label_values_Arma_3 if x != 0],
         normed=True, cumulative=True)
plt.title("Log Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")



In [ ]:

    
plt.hist([np.log(x + 1) for x in num_reviews_raw_label_values_Arma_3])
plt.title("Log(x + 1) Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")



In [ ]:

    
plt.hist([np.log2(x + 1) for x in num_reviews_raw_label_values_Arma_3])
plt.title("Log2(x + 1) Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")



In [ ]:

    
plt.hist([np.log10(x + 1) for x in num_reviews_raw_label_values_Arma_3])
plt.title("Log10(x + 1) Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")



In [ ]:

    
sp.stats.mstats.zscore(num_reviews_raw_label_values_Arma_3)



In [ ]:

    
plt.hist(sp.stats.mstats.zscore(num_reviews_raw_label_values_Arma_3))
plt.title("z-score num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")



In [ ]:

    
plt.hist([math.sqrt(x) for x in num_reviews_raw_label_values_Arma_3])
plt.title("sqrt(x) Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")



In [ ]:

    
plt.hist([x**2 for x in num_reviews_raw_label_values_Arma_3])
plt.title("x^2 Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

total_game_hours_bin



In [ ]:

    
dist_info_dict_Arma_3['total_game_hours_bin']['labels_counter']

total_game_hours



In [ ]:

    
dist_info_dict_Arma_3['total_game_hours']['labels_counter']



In [ ]:

    
total_game_hours_raw_label_values_Arma_3 = list(dist_info_dict_Arma_3['total_game_hours']['id_strings_labels_dict'].values())



In [ ]:

    
plt.hist([x**0.25 for x in total_game_hours_raw_label_values_Arma_3])
plt.title("Log x Arma_3 total_game_hours Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

total_game_hours_last_two_weeks



In [ ]:

    
dist_info_dict_Arma_3['total_game_hours_last_two_weeks']['labels_counter']

num_found_helpful



In [ ]:

    
dist_info_dict_Arma_3['num_found_helpful']['labels_counter']

num_found_unhelpful



In [ ]:

    
dist_info_dict_Arma_3['num_found_unhelpful']['labels_counter']

found_helpful_percentage



In [ ]:

    
dist_info_dict_Arma_3['found_helpful_percentage']['labels_counter']

num_voted_helpfulness



In [ ]:

    
dist_info_dict_Arma_3['num_voted_helpfulness']['labels_counter']

num_achievements_attained



In [ ]:

    
dist_info_dict_Arma_3['num_achievements_attained']['labels_counter']



In [ ]:

    
num_achievements_attained_raw_label_values_Arma_3 = list(dist_info_dict_Arma_3['num_achievements_attained']['id_strings_labels_dict'].values())



In [ ]:

    
plt.hist([np.log(x) for x in num_achievements_attained_raw_label_values_Arma_3 if x != 0])
plt.title("Log Arma_3 num_achievements_attained Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

num_achievements_percentage



In [ ]:

    
dist_info_dict_Arma_3['num_achievements_percentage']['labels_counter']



In [ ]:

    
num_achievements_percentage_raw_label_values_Arma_3 = list(dist_info_dict_Arma_3['num_achievements_percentage']['id_strings_labels_dict'].values())



In [ ]:

    
plt.hist(num_achievements_percentage_raw_label_values_Arma_3)
plt.title("Arma_3 num_achievements_percentage Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

num_achievements_possible



In [ ]:

    
dist_info_dict_Arma_3['num_achievements_possible']['labels_counter']

num_guides



In [ ]:

    
dist_info_dict_Arma_3['num_guides']['labels_counter']

num_workshop_items



In [ ]:

    
dist_info_dict_Arma_3['num_workshop_items']['labels_counter']

num_friends



In [ ]:

    
num_friends_raw_label_values_Arma_3 = list(dist_info_dict_Arma_3['num_friends']['id_strings_labels_dict'].values())



In [ ]:

    
plt.hist([np.log(x) for x in num_friends_raw_label_values_Arma_3 if x != 0])
plt.title("Log Arma_3 num_friends Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

num_games_owned



In [ ]:

    
dist_info_dict_Arma_3['num_games_owned']['labels_counter']

num_comments



In [ ]:

    
dist_info_dict_Arma_3['num_comments']['labels_counter']

friend_player_level



In [ ]:

    
dist_info_dict_Arma_3['friend_player_level']['labels_counter']

num_groups



In [ ]:

    
dist_info_dict_Arma_3['num_groups']['labels_counter']

num_screenshots



In [ ]:

    
dist_info_dict_Arma_3['num_screenshots']['labels_counter']

num_badges



In [ ]:

    
dist_info_dict_Arma_3['num_badges']['labels_counter']

num_found_funny



In [ ]:

    
dist_info_dict_Arma_3['num_found_funny']['labels_counter']

Examining the Distribution of Labels for Team Fortress 2



In [ ]:

    
for label in dist_info_dict_Team_Fortress_2:
    print("Label = {}\n".format(label))
    print("{}\n".format(dist_info_dict_Team_Fortress_2[label]['labels_counter']))

Examining the Distribution of Labels for Counter Strike



In [ ]:

    
for label in dist_info_dict_Counter_Strike:
    print("Label = {}\n".format(label))
    print("{}\n".format(dist_info_dict_Counter_Strike[label]['labels_counter']))