In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [2]:
%matplotlib inline

In [3]:
def level_up(n, base, increment):
    """Tabulate a stat that grows linearly with level.

    Parameters
    ----------
    n : int
        Number of levels to tabulate; levels run from 0 to n - 1.
    base : number
        Value of the stat at level 0.
    increment : number
        Amount added to the stat for each level gained.

    Returns
    -------
    pandas.DataFrame
        Two columns: "level" (0 .. n - 1) and "stat"
        (base + increment * level).
    """
    levels = range(n)
    stats = [base + (increment * lvl) for lvl in levels]
    return pd.DataFrame({"level": levels, "stat": stats})

Load yesterday's data


In [4]:
# Read the tab-separated base-stats table and preview its first rows
dat = pd.read_csv("lol_base_stats.tsv", delimiter="\t")
dat.head()


Out[4]:
Champions HP HP+ HP5 HP5+ MP MP+ MP5 MP5+ AD AD+ AS AS+ AR AR+ MR MR+ MS Range href
0 Aatrox 537.80 85.0 6.590 0.50 100.00 0.0 0.00 0.000 60.3760 3.20 0.651 0.03000 24.384 3.8 32.1 1.25 345.0 150.0 http://leagueoflegends.wikia.com/wiki/Aatrox
1 Ahri 514.40 80.0 6.505 0.60 334.00 50.0 6.00 0.800 53.0400 3.00 0.668 0.02000 20.880 3.5 30.0 0.00 330.0 550.0 http://leagueoflegends.wikia.com/wiki/Ahri
2 Akali 587.80 85.0 8.340 0.65 200.00 0.0 50.00 0.000 58.3760 3.20 0.694 0.03100 26.380 3.5 32.1 1.25 350.0 125.0 http://leagueoflegends.wikia.com/wiki/Akali
3 Alistar 613.36 106.0 8.675 0.85 278.84 38.0 8.50 0.800 61.1116 3.62 0.625 0.02125 24.380 3.5 32.1 1.25 330.0 125.0 http://leagueoflegends.wikia.com/wiki/Alistar
4 Amumu 613.12 84.0 8.875 0.85 287.20 40.0 7.38 0.525 53.3840 3.80 0.638 0.02180 23.544 3.8 32.1 1.25 335.0 125.0 http://leagueoflegends.wikia.com/wiki/Amumu

Add class data

Continue scraping the web to add primary role


In [5]:
from bs4 import BeautifulSoup
import requests

# Seconds to wait for each page; without a timeout a stalled server
# would block the notebook indefinitely.
REQUEST_TIMEOUT_S = 30


def fetch_primary_role(url, session):
    """Return the primary role shown on a champion's wiki page.

    Parameters
    ----------
    url : str
        Address of the champion page.
    session : requests.Session
        Session used for the request, so connections are reused across pages.

    Returns
    -------
    str
        Text of the first link in the table inside the
        ``div.champion_info`` element.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page does not contain the expected role markup.
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT_S)
    # Fail on 4xx/5xx here instead of parsing an error page further down
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html5lib")

    info_box = soup.find('div', attrs={'class' : 'champion_info'})
    info_table = info_box.table if info_box is not None else None
    role_link = info_table.a if info_table is not None else None
    if role_link is None:
        raise ValueError("No primary role found on page: {}".format(url))
    return role_link.text


# One role per row of `dat`, in the same order as dat.href
with requests.Session() as session:
    primary_role = [fetch_primary_role(url, session) for url in dat.href]

In [6]:
# Attach the scraped roles as a new column and preview the result
dat = dat.assign(primary_role=primary_role)
dat.head()


Out[6]:
Champions HP HP+ HP5 HP5+ MP MP+ MP5 MP5+ AD ... AS AS+ AR AR+ MR MR+ MS Range href primary_role
0 Aatrox 537.80 85.0 6.590 0.50 100.00 0.0 0.00 0.000 60.3760 ... 0.651 0.03000 24.384 3.8 32.1 1.25 345.0 150.0 http://leagueoflegends.wikia.com/wiki/Aatrox Fighter
1 Ahri 514.40 80.0 6.505 0.60 334.00 50.0 6.00 0.800 53.0400 ... 0.668 0.02000 20.880 3.5 30.0 0.00 330.0 550.0 http://leagueoflegends.wikia.com/wiki/Ahri Mage
2 Akali 587.80 85.0 8.340 0.65 200.00 0.0 50.00 0.000 58.3760 ... 0.694 0.03100 26.380 3.5 32.1 1.25 350.0 125.0 http://leagueoflegends.wikia.com/wiki/Akali Slayer
3 Alistar 613.36 106.0 8.675 0.85 278.84 38.0 8.50 0.800 61.1116 ... 0.625 0.02125 24.380 3.5 32.1 1.25 330.0 125.0 http://leagueoflegends.wikia.com/wiki/Alistar Tank
4 Amumu 613.12 84.0 8.875 0.85 287.20 40.0 7.38 0.525 53.3840 ... 0.638 0.02180 23.544 3.8 32.1 1.25 335.0 125.0 http://leagueoflegends.wikia.com/wiki/Amumu Tank

5 rows × 21 columns


In [79]:
# Write the current table to a tab-separated file, leaving out the row index
dat.to_csv("lol_base_stats-roles.tsv", sep="\t", index=False)

In [8]:
# Define colors
# Role name -> matplotlib color code
my_colors_key = dict(
    Controller='b',
    Fighter='r',
    Mage='m',
    Marksman='g',
    Slayer='k',
    Tank='y',
)
# The same color codes as a plain list, ordered alphabetically by role name
my_colors = [my_colors_key[role] for role in sorted(my_colors_key)]

In [9]:
plt.rcParams["figure.figsize"] = [10,4]

# How many champions of each type?
role_counts = dat.groupby("primary_role")["Champions"].count()

# Look up each bar's color by role name rather than by position, so the bars
# keep the same colors as my_colors_key even if a role is missing from the
# data; roles without an entry fall back to gray.
bar_colors = [my_colors_key.get(role, "gray") for role in role_counts.index]

ax = role_counts.plot.bar(color=bar_colors)
ax.set_ylabel("count")
# Trailing semicolon keeps the Text repr out of the cell output
ax.set_xlabel("Primary role (according to wikia's Base champion stats)");


Out[9]:
<matplotlib.text.Text at 0x1103135c0>

Visualizing high-dimensional data


In [11]:
# Use only complete cases
# Keep only the stat columns by removing the identifier/label columns by name.
# A positional slice would silently pick different columns if the table
# layout changed; dropping by name raises a KeyError instead.
non_feature_cols = ["Champions", "href", "primary_role"]
datc = dat.dropna().drop(non_feature_cols, axis=1)

t-distributed Stochastic Neighbor Embedding (t-SNE)

t-SNE is a tool to visualize high-dimensional data. It converts similarities between data points to joint probabilities and tries to minimize the Kullback-Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data. t-SNE has a cost function that is not convex, i.e. with different initializations we can get different results.


In [14]:
# Plot t-SNE at different perplexities

plt.rcParams["figure.figsize"] = [15,15]
nrows = 4
ncols = 4
fig, ax = plt.subplots(nrows, ncols)

# One perplexity per panel, in ascending order: 5, 8, ..., 50 (16 values)
perplexity = list(range(5, 51, 3))

# Role of every complete-case row. datc is built from the same dat.dropna()
# rows, so this lines up row-for-row with the embedding. Computed once here
# instead of once per role per panel.
complete_roles = dat.dropna()["primary_role"]

xlabel = "TSNE1"
ylabel = "TSNE2"

# ax.flat walks the grid row by row, so panels fill left-to-right, top-to-bottom
for panel, p in zip(ax.flat, perplexity):

    # Run TSNE; random_state is fixed so re-running the cell gives the same layout
    model = TSNE(n_components=2, perplexity=p, random_state=0)
    X = model.fit_transform(datc)

    # One scatter call per role so each role gets its own color and legend entry
    for k, color in my_colors_key.items():
        X_subset = X[(complete_roles == k).values]
        panel.scatter(X_subset[:, 0], X_subset[:, 1], color=color, label=k)

    panel.set(title="perplexity = {}".format(p), xlabel=xlabel, ylabel=ylabel)

# Every panel uses the same role/color mapping, so one shared legend is enough
handles, labels = ax[0, 0].get_legend_handles_labels()
fig.legend(handles, labels, loc="upper right", title="primary role");


Principal component analysis (PCA)

PCA is another dimensionality reduction algorithm.

Principal component analysis (PCA) is a technique used to emphasize variation and bring out strong patterns in a dataset. It's often used to make data easy to explore and visualize. (Victor Powell)


In [15]:
plt.rcParams["figure.figsize"] = [6,6]

fig, ax = plt.subplots(1, 1)

# Run PCA: fit the two leading components on datc, then project datc onto them
pca = PCA(n_components=2)
X = pca.fit(datc).transform(datc)

xlabel = "PC1"
ylabel = "PC2"

# Role of every complete-case row. datc is built from the same dat.dropna()
# rows, so this lines up row-for-row with X. Computed once, not once per role.
complete_roles = dat.dropna()["primary_role"]

# One scatter call per role so each role gets its own color and legend entry
for k, color in my_colors_key.items():
    X_subset = X[(complete_roles == k).values]
    ax.scatter(X_subset[:, 0], X_subset[:, 1], color=color, label=k)

ax.set(xlabel=xlabel, ylabel=ylabel, title="PCA of champion base stats")
# Trailing semicolon keeps the Legend repr out of the cell output
ax.legend(title="primary role");


Out[15]:
[<matplotlib.text.Text at 0x110fed978>, <matplotlib.text.Text at 0x111a5ca20>]