In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%pylab inline
pylab.rcParams['figure.figsize'] = (10, 6)
In [2]:
# Load the tab-separated ratings sample into the working DataFrame.
data = pd.read_csv(
    filepath_or_buffer='../data/ratings_sample.tsv',
    sep='\t',
)
In [3]:
# Display the column labels of the loaded ratings table.
data.columns
Out[3]:
In [4]:
# Histogram of the raw player ages (missing values dropped before plotting).
# NOTE(review): `distplot` is deprecated in recent seaborn releases; it is kept
# here so the notebook still runs on the older seaborn it was written against.
with sns.plotting_context("notebook", font_scale=1.5):
    sns.set_style("whitegrid")
    sns.distplot(data["player_age"].dropna(),
                 bins=100,
                 kde=False,
                 color="tomato")
    # Current seaborn no longer exposes pyplot as `sns.plt`; use the `plt`
    # module imported at the top of the notebook instead.
    plt.title("Player Age Distribution")
    plt.ylabel("Count")
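Normalize player age outliers: replace ages below 18 or above 100 with the mean of a plausible age range, and fill missing ages with the overall mean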
In [5]:
# Replace implausibly low ages (< 18) with the mean age of players aged 18-30.
young_mean_age = data.loc[
    (data["player_age"] >= 18) & (data["player_age"] <= 30), "player_age"
].mean(skipna=True)
data.loc[data["player_age"] < 18, "player_age"] = young_mean_age

# Replace implausibly high ages (> 100) with the mean age of players aged 30-100.
older_mean_age = data.loc[
    (data["player_age"] >= 30) & (data["player_age"] <= 100), "player_age"
].mean(skipna=True)
data.loc[data["player_age"] > 100, "player_age"] = older_mean_age

# Fill missing ages with the overall mean. The result is assigned back to the
# column rather than calling `fillna(inplace=True)` on a column selection: that
# chained in-place form operates on a temporary object and leaves `data`
# untouched under pandas Copy-on-Write.
data["player_age"] = data["player_age"].fillna(data["player_age"].mean())

# Store ages as whole years (astype(int) truncates the fractional part).
data["player_age"] = data["player_age"].astype(int)
Histogram after player age normalization
In [7]:
# Histogram of player ages after the outlier normalization step.
with sns.plotting_context("notebook", font_scale=1.5):
    sns.set_style("whitegrid")
    sns.distplot(data["player_age"].dropna(),
                 bins=40,
                 kde=False,
                 color="tomato")
    # Current seaborn no longer exposes pyplot as `sns.plt`; use `plt` directly.
    plt.title("Age Distribution")
    plt.ylabel("Count")
    # Restrict the x axis to the 15-100 range.
    plt.xlim((15, 100))
In [8]:
# Distinct values found in the player_metro column.
player_metro_column = data['player_metro']
unique_player_metros = player_metro_column.unique()
print(unique_player_metros)
In [9]:
# Distinct values found in the subject_metro column.
subject_metro_column = data['subject_metro']
unique_subject_metros = subject_metro_column.unique()
print(unique_subject_metros)
In [10]:
# Distinct values found in the player_location column.
player_location_column = data['player_location']
unique_player_location = player_location_column.unique()
print(unique_player_location)
In [11]:
# Distinct values found in the subject_location column.
subject_location_column = data['subject_location']
unique_subject_location = subject_location_column.unique()
print(unique_subject_location)
In [12]:
# Number of non-null `like` entries per subject_metro (a single 'count' column).
likes_by_subject_metro = data.groupby('subject_metro')['like']
unique_subject_metro_likes = likes_by_subject_metro.agg(['count'])
print(unique_subject_metro_likes)
In [23]:
# Move subject_metro out of the index into a regular column, then store it as
# strings.
metro_likes_flat = unique_subject_metro_likes.reset_index()
metro_likes_flat['subject_metro'] = metro_likes_flat['subject_metro'].astype('str')
unique_subject_metro_likes = metro_likes_flat
Subject Age Histogram
In [13]:
# Histogram of subject ages (missing values dropped before plotting).
with sns.plotting_context("notebook", font_scale=1.5):
    sns.set_style("whitegrid")
    sns.distplot(data["subject_age"].dropna(),
                 bins=80,
                 kde=False,
                 color="green")
    # Current seaborn no longer exposes pyplot as `sns.plt`; use `plt` directly.
    plt.title("Subject Age Distribution")
    plt.ylabel("Count")
Player Height Histogram
In [14]:
# Histogram of player heights (missing values dropped before plotting).
with sns.plotting_context("notebook", font_scale=1.5):
    sns.set_style("whitegrid")
    sns.distplot(data["player_height"].dropna(),
                 bins=40,
                 kde=False,
                 color="blue")
    # Current seaborn no longer exposes pyplot as `sns.plt`; use `plt` directly.
    # The title previously read "Age Distribution" (copy-paste slip); this
    # figure plots player_height.
    plt.title("Player Height Distribution")
    plt.ylabel("Count")
Player Age Minimum Histogram
In [18]:
# Histogram of the player_age_min column (missing values dropped).
with sns.plotting_context("notebook", font_scale=1.5):
    sns.set_style("whitegrid")
    sns.distplot(data["player_age_min"].dropna(),
                 bins=40,
                 kde=False,
                 color="tomato")
    # Current seaborn no longer exposes pyplot as `sns.plt`; use `plt` directly.
    plt.title("Player Age Min Distribution")
    plt.ylabel("Count")
Player Age Max Histogram
In [19]:
# Histogram of the player_age_max column (missing values dropped).
with sns.plotting_context("notebook", font_scale=1.5):
    sns.set_style("whitegrid")
    sns.distplot(data["player_age_max"].dropna(),
                 bins=40,
                 kde=False,
                 color="tomato")
    # Current seaborn no longer exposes pyplot as `sns.plt`; use `plt` directly.
    plt.title("Player Age Max Distribution")
    plt.ylabel("Count")
Subject Age Min Distribution
In [20]:
# Histogram of the subject_age_min column (missing values dropped).
with sns.plotting_context("notebook", font_scale=1.5):
    sns.set_style("whitegrid")
    sns.distplot(data["subject_age_min"].dropna(),
                 bins=40,
                 kde=False,
                 color="green")
    # Current seaborn no longer exposes pyplot as `sns.plt`; use `plt` directly.
    plt.title("Subject Age Min Distribution")
    plt.ylabel("Count")
Subject Age Max Distribution
In [21]:
# Histogram of the subject_age_max column (missing values dropped).
with sns.plotting_context("notebook", font_scale=1.5):
    sns.set_style("whitegrid")
    sns.distplot(data["subject_age_max"].dropna(),
                 bins=40,
                 kde=False,
                 color="green")
    # Current seaborn no longer exposes pyplot as `sns.plt`; use `plt` directly.
    plt.title("Subject Age Max Distribution")
    plt.ylabel("Count")
Distance Distribution
In [22]:
# Histogram of the distance column over all ratings (missing values dropped).
with sns.plotting_context("notebook", font_scale=1.5):
    sns.set_style("whitegrid")
    sns.distplot(data["distance"].dropna(),
                 bins=5,
                 kde=False,
                 color="green")
    # Current seaborn no longer exposes pyplot as `sns.plt`; use `plt` directly.
    plt.title("Distance Distribution")
    plt.ylabel("Count")
Calculate the ratio of likes by player
Calculate the unique players & subjects
Assign the median likes ratio to players whose likes ratio is above 0.9 and who have rated fewer subjects than the median player
In [24]:
# Per player: 'count' = number of non-null `like` entries, 'sum' = their total.
likes_per_player = data.groupby('player_id')['like']
player_likes = likes_per_player.agg(['count', 'sum'])
In [25]:
# Distinct player ids, and how many there are.
player_id_column = data['player_id']
unique_players = player_id_column.unique()
player_total = len(unique_players)
print(player_total)
In [26]:
# Distinct subject ids, and how many there are.
subject_id_column = data['subject_id']
unique_subjects = subject_id_column.unique()
subject_total = len(unique_subjects)
print(subject_total)
In [27]:
# Likes ratio per player: summed `like` values divided by the number of ratings.
likes_total = player_likes['sum']
ratings_total = player_likes['count']
player_likes['likes_ratio'] = likes_total / ratings_total
In [28]:
# Histogram of the per-player likes ratio (missing values dropped).
with sns.plotting_context("notebook", font_scale=1.5):
    sns.set_style("whitegrid")
    sns.distplot(player_likes["likes_ratio"].dropna(),
                 bins=100,
                 kde=False,
                 color="magenta")
    # Current seaborn no longer exposes pyplot as `sns.plt`; use `plt` directly.
    plt.title("Likes Distribution For Players")
    plt.ylabel("Count")
In [29]:
# Median number of ratings made per player.
ratings_per_player = player_likes['count']
player_likes_median = ratings_per_player.median()
In [30]:
# Median of the per-player likes ratio.
ratio_per_player = player_likes['likes_ratio']
player_likes_median_likes_ratio = ratio_per_player.median()
print(player_likes_median_likes_ratio)
In [31]:
# Players with a likes ratio above 0.9 who rated more subjects than the median.
ratio_above_threshold = player_likes['likes_ratio'] > 0.9
rated_more_than_median = player_likes['count'] > player_likes_median
player_likes_1_ratio = player_likes[ratio_above_threshold & rated_more_than_median]
In [32]:
# Overwrite the likes ratio with the median ratio for players whose ratio is
# above 0.9 but who rated fewer subjects than the median player.
# NOTE(review): players whose count equals the median exactly are not matched
# by this strict `<` comparison — confirm that is intended.
ratio_is_high = player_likes['likes_ratio'] > 0.9
rated_fewer_than_median = player_likes['count'] < player_likes_median
player_likes.loc[ratio_is_high & rated_fewer_than_median, 'likes_ratio'] = player_likes_median_likes_ratio
In [33]:
# Keep only the likes_ratio column (still a DataFrame indexed by player_id).
ratio_columns = ['likes_ratio']
player_likes = player_likes[ratio_columns]
Compare the distance distribution for liked and not-liked subjects
In [38]:
# Distance column for the rows where `like` is greater than 0.
liked_rows = data['like'] > 0
player_distance_likes = data.loc[liked_rows, ['distance']]
In [39]:
# Histogram of distance for rows where the subject was liked.
with sns.plotting_context("notebook", font_scale=1.5):
    sns.set_style("whitegrid")
    sns.distplot(player_distance_likes["distance"].dropna(),
                 bins=40,
                 kde=False,
                 color="blue")
    # Current seaborn no longer exposes pyplot as `sns.plt`; use `plt` directly.
    plt.title("Distance Distribution for liked subjects")
    plt.ylabel("Count")
In [40]:
# Distance column for the rows where `like` is less than 1.
player_distance_not_likes = data[data['like'] < 1]
player_distance_not_likes = player_distance_not_likes[['distance']]

# Histogram of distance for rows where the subject was not liked.
with sns.plotting_context("notebook", font_scale=1.5):
    sns.set_style("whitegrid")
    sns.distplot(player_distance_not_likes["distance"].dropna(),
                 bins=40,
                 kde=False,
                 color="blue")
    # Current seaborn no longer exposes pyplot as `sns.plt`; use `plt` directly.
    plt.title("Distance Distribution for not liked subjects")
    plt.ylabel("Count")
Find gender orientations and get the ratio of likes by player gender orientation
In [41]:
# Distinct player_gender_orientation values: print how many, then the values.
orientation_column = data['player_gender_orientation']
player_genders = orientation_column.unique()
for shown in (len(player_genders), player_genders):
    print(shown)
In [42]:
# Per gender orientation: rating count, summed `like` values, and their ratio.
likes_by_orientation = data.groupby('player_gender_orientation')['like']
player_likes_by_gender = likes_by_orientation.agg(['count', 'sum'])
orientation_like_sum = player_likes_by_gender['sum']
orientation_like_count = player_likes_by_gender['count']
player_likes_by_gender['ratio'] = orientation_like_sum / orientation_like_count
print(player_likes_by_gender)
In [43]:
# Keep only the ratio column (still a DataFrame indexed by gender orientation).
gender_ratio_columns = ['ratio']
player_likes_by_gender = player_likes_by_gender[gender_ratio_columns]
Merging the likes ratio and the gender ratio to create a new dataset
In [44]:
# Attach the per-orientation likes ratio to every rating row. With an inner
# join, rows whose orientation is absent from the ratio table are dropped.
data_merge_one = pd.merge(
    left=data,
    right=player_likes_by_gender,
    how='inner',
    left_on='player_gender_orientation',
    right_index=True,
)
In [45]:
# Attach the per-player likes ratio, matching player_id against the index of
# player_likes (inner join).
data_merge_two = pd.merge(
    left=data_merge_one,
    right=player_likes,
    how='inner',
    left_on='player_id',
    right_index=True,
)
In [46]:
# Write the merged dataset to disk as CSV (the index is written as well, since
# `index` is left at its default).
data_merge_two.to_csv(path_or_buf='../data/processed_data.csv')
In [ ]: