In [1]:
import math
import itertools
from collections import Counter
import numpy as np
import scipy as sp
from pymongo import collection
import matplotlib.pyplot as plt
%matplotlib inline
from src import *
from src.mongodb import *
from src.datasets import *
from src.experiments import *
from data import APPID_DICT
In [2]:
list(APPID_DICT.keys())
Out[2]:
In [9]:
list(LABELS)
Out[9]:
`nbins` and `bin_factor` to use in `learn`, etc.), and the `experiments` extension, specifically `distributional_info` and `evenly_distribute_samples`
In [4]:
# Connect to reviews collection
db = connect_to_db(host='localhost', port=37017)
In [15]:
def do_some_distributional_research(db: collection, game: str,
                                    labels: list = LABELS,
                                    partition: str = 'all'):
    """
    Run the `distributional_info` function and then apply some
    transformations, `nbins`/`bin_factor` values, etc., to the
    results.

    Generates distributional information for each combination
    of label, number of bins, bin factor, and transformation.

    Every (label, transformation) pair is evaluated against the
    full set of `nbins`/`bin_factor` combinations, and each
    combination is binned from the transformed (un-binned) values.

    :param db: MongoDB collection
    :type db: collection
    :param game: name of game
    :type game: str
    :param labels: list of labels
    :type labels: list
    :param partition: name of data partition (or 'all' to use all
                      data)
    :type partition: str

    :yields: tuple of dictionary containing label value
             distribution information and an array of the original
             (unscaled, untransformed) label values
    :ytype: tuple
    """

    # Transformations to try on the label values ('None' means the
    # values are used as they are)
    transformations = {'None': None,
                       'ln': lambda x: np.log(x) if x > 1 else 0.0,
                       '**5': lambda x: x**5.0,
                       '**2': lambda x: x**2.0,
                       '**0.5': lambda x: x**0.5,
                       '**0.25': lambda x: x**0.25}
    nbins_values = [None, 2, 3, 4, 5]
    bin_factor_values = [None, 0.25, 0.5, 5.0, 8.0, 10.0]

    # Keep the "no binning at all" pair plus every pair that specifies
    # `nbins`. This must be a list, not a lazy `filter` object: it is
    # iterated once per (label, transformation) pair and an iterator
    # would be empty after the first pass.
    nbins_bin_factor_pairs = [
        (nbins, bin_factor)
        for nbins, bin_factor
        in itertools.product(nbins_values, bin_factor_values)
        if nbins is not None or bin_factor is None]

    for label in labels:

        # Get all raw label values and convert to floats
        label_info = distributional_info(db, label, [game], partition)
        raw_values_to_return = np.array(
            [float(val) for val
             in label_info['id_strings_labels_dict'].values()])

        # If the label has percentage values, i.e., values between
        # 0.0 and 1.0 (inclusive), multiply the values by 100 before
        # doing anything else. The multiplication builds a new array
        # so that the values handed back to the caller stay unscaled.
        # Note: Define these specific labels somewhere!
        if label in LABELS_WITH_PCT_VALUES:
            scaled_label_values = raw_values_to_return * 100.0
        else:
            scaled_label_values = raw_values_to_return

        # Apply various types of transformations to the data and
        # measure the normality of the resulting distribution, etc.
        for transformation, transformer in transformations.items():
            if transformer:
                transformed_values = np.array([transformer(x)
                                               for x in scaled_label_values])
            else:
                transformed_values = np.array(scaled_label_values)
            label_transformation_string = '{0}_{1}'.format(label,
                                                           transformation)

            # Min/max of the transformed values are the same for every
            # `nbins`/`bin_factor` combination, so compute them once
            _min = np.floor(transformed_values.min())
            _max = np.ceil(transformed_values.max())

            # Apply various combinations of `nbins`/`bin_factor`
            # values (including not specifying those values)
            for nbins, bin_factor in nbins_bin_factor_pairs:
                nbins_bin_factor_string = '{0}_{1}'.format(nbins, bin_factor)
                stats_dict = {}

                # Every combination starts from the transformed values,
                # never from the bins of a previous combination
                label_values = transformed_values

                # Don't bin the values if `nbins` and `bin_factor` are
                # unspecified
                if nbins or bin_factor:

                    # If `bin_factor` is unspecified, use the default
                    # value, 1.0
                    bin_factor = bin_factor if bin_factor else 1.0

                    # Get bin range tuples and validate
                    try:
                        bin_ranges = get_bin_ranges(_min, _max, nbins,
                                                    bin_factor)
                    except ValueError:
                        print('Encountered invalid bin_ranges:\n\t'
                              'nbins: {0}\n\tbin_factor: {1}\n\tmin: '
                              '{2}\n\tmax: {3}\n\ttransformation: {4}'
                              '\n\tlabel: {5}'
                              .format(nbins, bin_factor, _min, _max,
                                      transformation, label))
                        continue

                    # Convert raw values
                    stats_dict['bin_ranges'] = bin_ranges
                    label_values = np.array([get_bin(bin_ranges, val)
                                             for val in transformed_values])
                    stats_dict['label_values'] = label_values

                # Collect some stats and measurements. `mode` may be a
                # scalar or a length-1 array depending on the SciPy
                # version, so normalize it before indexing.
                mode_value = \
                    np.atleast_1d(sp.stats.mode(label_values).mode)[0]
                stats_dict.update({'min': label_values.min(),
                                   'max': label_values.max(),
                                   'std': label_values.std(),
                                   'mean': label_values.mean(),
                                   'median': np.median(label_values),
                                   'mode': mode_value,
                                   'normaltest':
                                       sp.stats.normaltest(label_values)})
                yield ({label_transformation_string:
                            {nbins_bin_factor_string: stats_dict}},
                       raw_values_to_return)
In [ ]:
# Let's build up a dictionary of distributional information for each label and
# for each game in a subset of 3 games.
# The subset is spelled out (rather than drawn at random) because the cells
# below look these three games up by name, so the notebook must produce the
# same subset on every run.
games_subset = ['Arma_3', 'Team_Fortress_2', 'Counter_Strike']
dist_info_dict = {}
for game in games_subset:
    try:
        # One `distributional_info` result per label, keyed by label, which is
        # the shape the cells below index into (e.g.
        # `dist_info_dict['Arma_3']['num_reviews']['labels_counter']`)
        dist_info_dict[game] = {label: distributional_info(db, label, [game],
                                                           'all')
                                for label in LABELS}
    except ValueError as e:
        # Report the game that could not be processed instead of dropping it
        # silently
        print('Skipping {0}: {1}'.format(game, e))
In [ ]:
# Each game will have 21 different outputs, so let's break things up a bit
# (one variable per game; each of these three names must be a key of
# `dist_info_dict`, otherwise the lookup raises a KeyError)
dist_info_dict_Arma_3 = dist_info_dict['Arma_3']
dist_info_dict_Team_Fortress_2 = dist_info_dict['Team_Fortress_2']
dist_info_dict_Counter_Strike = dist_info_dict['Counter_Strike']
In [13]:
Arma_3_stats_dicts_all_labels_all_data = do_some_distributional_research(db, 'Arma_3')
In [17]:
next(Arma_3_stats_dicts_all_labels_all_data)
Out[17]:
In [ ]:
dist_info_dict_Arma_3.keys()
In [ ]:
dist_info_dict_Arma_3['num_reviews']['labels_counter']
In [ ]:
# Work out the bin ranges that `get_bin_ranges` produces for the Arma_3
# `num_reviews` label under several `nbins`/`factor` settings
num_reviews_Arma_3 = dist_info_dict_Arma_3['num_reviews']['labels_counter']
num_reviews_Arma_3_values = np.array(list(num_reviews_Arma_3.keys()))
num_reviews_Arma_3_min_value = num_reviews_Arma_3_values.min()
num_reviews_Arma_3_max_value = num_reviews_Arma_3_values.max()

# The settings are listed in the same order as the names they are unpacked to
(num_reviews_Arma_3_bin_ranges_3_1,
 num_reviews_Arma_3_bin_ranges_3_1_5,
 num_reviews_Arma_3_bin_ranges_3_2,
 num_reviews_Arma_3_bin_ranges_3_3,
 num_reviews_Arma_3_bin_ranges_2_3,
 num_reviews_Arma_3_bin_ranges_2_10) = [
    get_bin_ranges(num_reviews_Arma_3_min_value,
                   num_reviews_Arma_3_max_value,
                   nbins=bin_count,
                   factor=bin_growth)
    for bin_count, bin_growth in [(3, 1.0), (3, 1.5), (3, 2.0),
                                  (3, 3.0), (2, 3.0), (2, 10.0)]]

# One report line per setting
print("\n".join(
    template.format(ranges) for template, ranges in [
        ("bins = 3, bin_factor = 1.0: {}", num_reviews_Arma_3_bin_ranges_3_1),
        ("bins = 3, bin_factor = 1.5: {}", num_reviews_Arma_3_bin_ranges_3_1_5),
        ("bins = 3, bin_factor = 2.0: {}", num_reviews_Arma_3_bin_ranges_3_2),
        ("bins = 3, bin_factor = 3.0: {}", num_reviews_Arma_3_bin_ranges_3_3),
        ("bins = 2, bin_factor = 3.0: {}", num_reviews_Arma_3_bin_ranges_2_3),
        ("bins = 2, bin_factor = 10.0: {}", num_reviews_Arma_3_bin_ranges_2_10)]))
In [ ]:
num_reviews_raw_label_values_Arma_3 = list(dist_info_dict_Arma_3['num_reviews']['id_strings_labels_dict'].values())
In [ ]:
plt.hist(list(np.random.normal(200, 100, 1000)))
plt.title("Normal Distribution Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
In [ ]:
plt.hist(num_reviews_raw_label_values_Arma_3)
plt.title("Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
In [ ]:
# Density-normalised histogram of the raw Arma_3 num_reviews values.
# `density=True` replaces `normed=True`, a keyword that current Matplotlib
# releases no longer accept; the y-axis therefore shows a density rather than
# a count.
plt.hist(num_reviews_raw_label_values_Arma_3, density=True)
plt.title("Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Density")
In [ ]:
plt.hist([np.log(x) for x in num_reviews_raw_label_values_Arma_3 if x != 0])
plt.title("Log Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
In [ ]:
# Density-normalised histogram of the natural log of the non-zero Arma_3
# num_reviews values (zeros are dropped because log(0) is undefined).
# `density=True` replaces `normed=True`, a keyword that current Matplotlib
# releases no longer accept.
plt.hist([np.log(x) for x in num_reviews_raw_label_values_Arma_3 if x != 0],
         density=True)
plt.title("Log Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Density")
In [ ]:
# Cumulative, density-normalised histogram of the natural log of the non-zero
# Arma_3 num_reviews values (zeros are dropped because log(0) is undefined).
# `density=True` replaces `normed=True`, a keyword that current Matplotlib
# releases no longer accept.
plt.hist([np.log(x) for x in num_reviews_raw_label_values_Arma_3 if x != 0],
         density=True, cumulative=True)
plt.title("Log Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Cumulative density")
In [ ]:
plt.hist([np.log(x + 1) for x in num_reviews_raw_label_values_Arma_3])
plt.title("Log(x + 1) Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
In [ ]:
plt.hist([np.log2(x + 1) for x in num_reviews_raw_label_values_Arma_3])
plt.title("Log2(x + 1) Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
In [ ]:
plt.hist([np.log10(x + 1) for x in num_reviews_raw_label_values_Arma_3])
plt.title("Log10(x + 1) Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
In [ ]:
sp.stats.mstats.zscore(num_reviews_raw_label_values_Arma_3)
In [ ]:
plt.hist(sp.stats.mstats.zscore(num_reviews_raw_label_values_Arma_3))
plt.title("z-score num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
In [ ]:
plt.hist([math.sqrt(x) for x in num_reviews_raw_label_values_Arma_3])
plt.title("sqrt(x) Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
In [ ]:
plt.hist([x**2 for x in num_reviews_raw_label_values_Arma_3])
plt.title("x^2 Arma_3 num_reviews Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
In [ ]:
dist_info_dict_Arma_3['total_game_hours_bin']['labels_counter']
In [ ]:
dist_info_dict_Arma_3['total_game_hours']['labels_counter']
In [ ]:
total_game_hours_raw_label_values_Arma_3 = list(dist_info_dict_Arma_3['total_game_hours']['id_strings_labels_dict'].values())
In [ ]:
# Histogram of the fourth root (x**0.25) of the Arma_3 total_game_hours
# values. The title names the transformation that is actually plotted; it
# previously claimed a log transformation.
plt.hist([x**0.25 for x in total_game_hours_raw_label_values_Arma_3])
plt.title("x^0.25 Arma_3 total_game_hours Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
In [ ]:
dist_info_dict_Arma_3['total_game_hours_last_two_weeks']['labels_counter']
In [ ]:
dist_info_dict_Arma_3['num_found_helpful']['labels_counter']
In [ ]:
dist_info_dict_Arma_3['num_found_unhelpful']['labels_counter']
In [ ]:
dist_info_dict_Arma_3['found_helpful_percentage']['labels_counter']
In [ ]:
dist_info_dict_Arma_3['num_voted_helpfulness']['labels_counter']
In [ ]:
dist_info_dict_Arma_3['num_achievements_attained']['labels_counter']
In [ ]:
num_achievements_attained_raw_label_values_Arma_3 = list(dist_info_dict_Arma_3['num_achievements_attained']['id_strings_labels_dict'].values())
In [ ]:
plt.hist([np.log(x) for x in num_achievements_attained_raw_label_values_Arma_3 if x != 0])
plt.title("Log Arma_3 num_achievements_attained Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
In [ ]:
dist_info_dict_Arma_3['num_achievements_percentage']['labels_counter']
In [ ]:
num_achievements_percentage_raw_label_values_Arma_3 = list(dist_info_dict_Arma_3['num_achievements_percentage']['id_strings_labels_dict'].values())
In [ ]:
plt.hist(num_achievements_percentage_raw_label_values_Arma_3)
plt.title("Arma_3 num_achievements_percentage Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
In [ ]:
dist_info_dict_Arma_3['num_achievements_possible']['labels_counter']
In [ ]:
dist_info_dict_Arma_3['num_guides']['labels_counter']
In [ ]:
dist_info_dict_Arma_3['num_workshop_items']['labels_counter']
In [ ]:
num_friends_raw_label_values_Arma_3 = list(dist_info_dict_Arma_3['num_friends']['id_strings_labels_dict'].values())
In [ ]:
plt.hist([np.log(x) for x in num_friends_raw_label_values_Arma_3 if x != 0])
plt.title("Log Arma_3 num_friends Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
In [ ]:
dist_info_dict_Arma_3['num_games_owned']['labels_counter']
In [ ]:
dist_info_dict_Arma_3['num_comments']['labels_counter']
In [ ]:
dist_info_dict_Arma_3['friend_player_level']['labels_counter']
In [ ]:
dist_info_dict_Arma_3['num_groups']['labels_counter']
In [ ]:
dist_info_dict_Arma_3['num_screenshots']['labels_counter']
In [ ]:
dist_info_dict_Arma_3['num_badges']['labels_counter']
In [ ]:
dist_info_dict_Arma_3['num_found_funny']['labels_counter']
In [ ]:
# Print the 'labels_counter' entry of every Team_Fortress_2 label
for label, label_info in dist_info_dict_Team_Fortress_2.items():
    print("Label = {}\n".format(label))
    print("{}\n".format(label_info['labels_counter']))
In [ ]:
# Print the 'labels_counter' entry of every Counter_Strike label
for label, label_info in dist_info_dict_Counter_Strike.items():
    print("Label = {}\n".format(label))
    print("{}\n".format(label_info['labels_counter']))