In [52]:
import dataset
import pandas as pd
import numpy as np
import os
import seaborn as sns
import scipy
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [3]:
db = dataset.connect(os.environ['POSTGRES_URI'])
games_df = pd.read_sql('SELECT * FROM game_crawl_view;', os.environ['POSTGRES_URI'], index_col='steam_app_id',
parse_dates=['release_date', 'crawl_time'])
games_df.head()
Out[3]:
How many apps did we crawl in total?
In [4]:
len(games_df)
Out[4]:
And how many of these did we actually get info for?
In [5]:
games_df = games_df[~pd.isnull(games_df['game_name'])]
len(games_df)
Out[5]:
How many games vs DLC?
In [6]:
sns.countplot(x='is_dlc', data=games_df)
Out[6]:
Looking at only games from now on...
In [7]:
games_df = games_df[games_df['is_dlc'] == False]
Counts by various columns
In [8]:
games_df['publisher'].value_counts()
Out[8]:
In [9]:
games_df['developer'].value_counts()
Out[9]:
Plotting various review metrics
In [45]:
sns.distplot(games_df['metacritic_score'].dropna(), bins=25)
Out[45]:
In [46]:
sns.distplot(games_df['pct_positive_reviews_all_time'].dropna(), bins=25)
Out[46]:
In [14]:
sns.distplot(games_df['pct_positive_reviews_last_30_days'].dropna())
Out[14]:
In [29]:
nonnull_reviews_df = games_df[
~pd.isnull(games_df['metacritic_score'])
& ~pd.isnull(games_df['pct_positive_reviews_all_time'])
]
sns.jointplot(x='metacritic_score', y='pct_positive_reviews_all_time', data=nonnull_reviews_df,
stat_func=scipy.stats.spearmanr)
Out[29]:
Some of those scores are really low... curious what the games are
In [24]:
def find_worst_games(metric):
return (nonnull_reviews_df.sort_values(
metric
)[:5][['game_name', 'pct_positive_reviews_all_time', 'metacritic_score']])
In [25]:
find_worst_games('pct_positive_reviews_all_time')
Out[25]:
In [26]:
find_worst_games('metacritic_score')
Out[26]:
In [49]:
def find_best_games(metric):
return (nonnull_reviews_df.sort_values(
metric, ascending=False
)[:5][['game_name', 'pct_positive_reviews_all_time', 'metacritic_score']])
In [50]:
find_best_games('pct_positive_reviews_all_time')
Out[50]:
In [51]:
find_best_games('metacritic_score')
Out[51]: