In [1]:
import re
import os
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
from ipywidgets import interact
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import matplotlib
import matplotlib.pyplot as plt
import json
%matplotlib inline
import findspark
findspark.init()
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import min
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pandas.plotting import scatter_matrix
from datetime import datetime, timedelta
# Obtain the Spark session (an already running one is reused, otherwise a new
# one is created) and keep a handle on its underlying SparkContext.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
In [9]:
# Load the two raw tables: Pokémon attributes (p_df) and combat outcomes (c_df),
# then preview the first five rows of each.
p_df, c_df = (pd.read_csv(csv_name) for csv_name in ('pokemon.csv', 'combats.csv'))
display(p_df.head(5), c_df.head(5))
In [8]:
# Summary statistics of the numeric columns of both tables.
display(p_df.describe(), c_df.describe())
In [16]:
# Distinct values taken by the primary and the secondary class columns.
for class_column in ('Class 1', 'Class 2'):
    display(p_df[class_column].unique())
In [17]:
# One histogram per stat: distribution of Attack, then distribution of Defense.
# The value of the last call is what the cell shows as its output.
p_df.hist(column = 'Attack')
p_df.hist(column = 'Defense')
Out[17]:
In [38]:
# Overlay the Sp. Atk and Sp. Def distributions on the same axes; both are drawn
# half-transparent so the overlap stays visible.
ax = p_df.hist(column='Sp. Atk', alpha=0.5)
p_df.hist(column='Sp. Def', ax=ax, alpha=0.5)

# Label the current axes (the one both histograms were drawn on).
overlay_axes = plt.gca()
overlay_axes.legend(['Sp. Atk', 'Sp. Def'])
overlay_axes.set_title("Sp. Atk + Sp. Def")
Out[38]:
In [39]:
# Scatter plot of special defense against special attack, one point per Pokémon.
p_df.plot.scatter(x='Sp. Atk', y='Sp. Def')
Out[39]:
In [55]:
# Add the attack-over-defense ratio as a new column of p_df (later cells rely on
# this column being present).
p_df['Attack/Defense'] = p_df['Attack'].div(p_df['Defense'])

# Rank once per direction and reuse the result for both the table and the names.
highest_ratio = p_df.sort_values(by=['Attack/Defense'], ascending=False)[:3]
display(highest_ratio)
print("list the names of the 3 Pokémon with highest attack-over-defense ratio:\n")
print("\n".join(highest_ratio['Name'].tolist()))

lowest_ratio = p_df.sort_values(by=['Attack/Defense'], ascending=True)[:3]
display(lowest_ratio)
print("list the names of the 3 Pokémon with lowest attack-over-defense ratio:\n")
print("\n".join(lowest_ratio['Name'].tolist()))
In [79]:
display(c_df.head(5))
print('list the names of the 10 Pokémon with the largest number of victories.\n')

# Count victories per winner id and keep the ten largest counts.
top_df = (
    c_df.groupby('Winner')
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
    .head(10)
)

# Translate the winner ids into names by joining on the Pokémon id.
winner_names = top_df.merge(p_df, left_on='Winner', right_on='pid')['Name'].tolist()
print("\n".join(winner_names))
In [110]:
# Membership masks: a Pokémon belongs to a class if either of its two class
# columns carries it.
is_grass = (p_df['Class 1'] == 'Grass') | (p_df['Class 2'] == 'Grass')
is_rock = (p_df['Class 1'] == 'Rock') | (p_df['Class 2'] == 'Rock')

# Keep only Pokémon that belong to exactly one of the two classes.
# The previous expression mixed `|` and `&` without parentheses (`&` binds
# tighter) and negated the wrong condition, so it did not select this set.
grass_class = p_df[is_grass & ~is_rock]
rock_class = p_df[is_rock & ~is_grass]
display(grass_class.head(5))
display(rock_class.head(5))

# Side-by-side box plots of Attack on a shared y axis so the two groups can be
# compared directly; titles tell the panels apart.
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.set_title('Grass')
ax2.set_title('Rock')
grass_class.boxplot(column='Attack', return_type='axes', ax=ax1)
rock_class.boxplot(column='Attack', ax=ax2)
Out[110]:
In [113]:
# Number of victories per Pokémon, most successful first.
# Fixes: the GROUP BY referenced a misspelled alias ("Pokemnon"), and the Winner
# column was qualified with Pokemons although it is a column of Combats.
# NOTE(review): this query expects tables/views named `Combats` and `Pokemons`
# to be registered with the Spark session; no registration is visible in this
# notebook — confirm they exist before running.
spark.sql("""
SELECT Combats.Winner, Pokemons.Name, COUNT(*) AS TotalWins
FROM Combats
INNER JOIN Pokemons ON Pokemons.pid = Combats.Winner
GROUP BY Combats.Winner, Pokemons.Name
ORDER BY TotalWins DESC
""")
In [209]:
# Attach the attributes of both contestants to every combat. The first merge has
# no overlapping column names; the second one does, so the first Pokémon's
# attributes end up suffixed `_x` and the second Pokémon's `_y`.
X_ext = c_df.merge(p_df, left_on='First_pokemon', right_on='pid') \
            .merge(p_df, left_on='Second_pokemon', right_on='pid', suffixes=('_x', '_y'))

# Feature matrix: drop the label, the identifiers, the names and the derived
# ratio columns.
X = X_ext.drop(columns=['Winner', 'First_pokemon', 'Second_pokemon', 'pid_x', 'pid_y',
                        'Name_x', 'Name_y', 'Attack/Defense_x', 'Attack/Defense_y'])

# Shared vocabulary of class names for all four class columns. Missing values
# are removed explicitly instead of assuming they are the last unique value;
# values outside the vocabulary (i.e. missing classes) are encoded as -1.
categories = pd.unique(p_df[['Class 1', 'Class 2']].values.ravel('K'))
categories = categories[~pd.isnull(categories)]
for class_column in ('Class 1_x', 'Class 1_y', 'Class 2_x', 'Class 2_y'):
    X[class_column] = pd.Categorical(X[class_column], categories=categories).codes
display(X.head(5))

# Label: True when the first Pokémon of the combat is the winner.
Y = X_ext['Winner'] == X_ext['First_pokemon']
In [210]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
In [211]:
# Number of samples (rows) in the feature matrix.
N = X.shape[0]
N
Out[211]:
In [212]:
# 90% / 10% train / test split over row positions.
train_size = int(N * 0.9)
test_size = N - train_size

# Shuffle with a fixed seed so the split (and everything derived from it) is
# reproducible across kernel restarts.
permutation = np.random.RandomState(0).permutation(N)
train_set_index = permutation[:train_size]
test_set_index = permutation[train_size:]
print(train_set_index)
print(test_set_index)
In [213]:
# Materialise the split: select features and labels by the shuffled positions.
X_train, Y_train = X.iloc[train_set_index], Y.iloc[train_set_index]
X_test, Y_test = X.iloc[test_set_index], Y.iloc[test_set_index]
In [231]:
# Hyper-parameter grid explored by the search below.
n_estimators = [10, 25, 50, 100]
max_depths = [2, 4, 10]


def k_fold(X, Y, K, random_state=None):
    """Yield K cross-validation splits of (X, Y).

    The rows are shuffled once and partitioned into K folds of (almost) equal
    size; each fold is used exactly once for validation while the remaining
    K - 1 folds form the training part.

    Parameters:
        X: DataFrame of features, indexed positionally with `.iloc`.
        Y: Series of labels aligned with X.
        K: number of folds (at least 2, at most len(X)).
        random_state: optional seed for the shuffle; None gives a fresh shuffle.

    Yields:
        Tuples (X_train, Y_train, X_test, Y_test), one per fold.

    Raises:
        ValueError: if K is smaller than 2 or larger than the number of rows.
    """
    n_samples = len(X)
    if K < 2 or K > n_samples:
        raise ValueError("K must be between 2 and the number of samples")
    shuffled = np.random.RandomState(random_state).permutation(n_samples)
    # array_split tolerates n_samples not being a multiple of K.
    folds = np.array_split(shuffled, K)
    for k in range(K):
        held_out = folds[k]
        remaining = np.concatenate(folds[:k] + folds[k + 1:])
        yield (X.iloc[remaining], Y.iloc[remaining], X.iloc[held_out], Y.iloc[held_out])


# Cross-validate on the training part of the hold-out split only, so the test
# rows never influence the choice of hyper-parameters.
X_dev = X.iloc[train_set_index]
Y_dev = Y.iloc[train_set_index]

best_acc = 0
best_n_est = 0
best_max_depth = 0
for n_estimator in n_estimators:
    for max_depth in max_depths:
        clf = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth, random_state=0)
        accuracies = []
        # The same seed is used for every configuration so that all of them are
        # scored on identical folds. Fold variables deliberately do not reuse
        # the names X_train / Y_train / X_test / Y_test, which hold the
        # hold-out split.
        for (X_fit, Y_fit, X_val, Y_val) in k_fold(X_dev, Y_dev, 5, random_state=0):
            clf.fit(X_fit, Y_fit)
            accuracies.append((clf.predict(X_val) == Y_val).mean())
        accuracy = np.mean(accuracies)
        print(n_estimator, max_depth, accuracy)
        if accuracy > best_acc:
            best_acc = accuracy
            best_n_est = n_estimator
            best_max_depth = max_depth
print('Best accuracy: ', best_acc)
print('Best number of estimators: ', best_n_est)
print('Best max depth: ', best_max_depth)
In [234]:
# Final model with the selected hyper-parameters. The training rows are taken
# directly from the hold-out split indices rather than from X_train / Y_train,
# because those names can be overwritten by the cross-validation loop.
forest = RandomForestClassifier(n_estimators=best_n_est, max_depth=best_max_depth, random_state=0)
forest.fit(X.iloc[train_set_index], Y.iloc[train_set_index])
Out[234]:
In [244]:
# Importance of each feature according to the forest, its spread across the
# individual trees, and the feature positions ordered from most to least
# important.
importances = forest.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in forest.estimators_]
std = np.std(per_tree_importances, axis=0)
indices = np.argsort(importances)[::-1]
n_features = X.shape[1]

# Print the feature ranking
print("Feature ranking:")
for rank, feature_idx in enumerate(indices[:n_features], start=1):
    print("%d. feature %d (%s) (%f)" % (rank, feature_idx, X.columns[feature_idx], importances[feature_idx]))

# Plot the feature importances of the forest; error bars show the per-tree
# standard deviation, tick labels are the feature positions in X.
bar_positions = range(n_features)
plt.figure()
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, indices)
plt.xlim([-1, n_features])
plt.show()
(5 points) Compute the winning ratio (number of wins divided by number of battles) for all Pokémon. Show the 10 Pokémon with the highest ratio and describe what they have in common with respect to their features. Discuss your results about feature importance from question 2.7 in this context.
In [274]:
# Victories per Pokémon id.
win_counts = c_df['Winner'].value_counts()

# Battles per Pokémon id = appearances as first + appearances as second.
# fill_value=0 keeps Pokémon that only ever fought in one of the two positions.
battle_counts = c_df['First_pokemon'].value_counts().add(
    c_df['Second_pokemon'].value_counts(), fill_value=0)

# Attach both counts to the Pokémon table. Ids absent from a count get 0, so a
# Pokémon that fought but never won is kept with a ratio of 0 (the previous
# inner merges silently dropped it).
p_df_ext = p_df.copy()
p_df_ext['WinCount'] = p_df_ext['pid'].map(win_counts).fillna(0).astype(int)
p_df_ext['Battles'] = p_df_ext['pid'].map(battle_counts).fillna(0).astype(int)

# The ratio is undefined for Pokémon that never fought; leave them out.
p_df_ext = p_df_ext[p_df_ext['Battles'] > 0].reset_index(drop=True)

# Column name kept as-is because later cells refer to it.
p_df_ext["WinninRatio"] = p_df_ext['WinCount'] / p_df_ext['Battles']
display(p_df_ext.head(5))
In [277]:
# Ten Pokémon with the highest winning ratio.
p_df_ext.sort_values(by='WinninRatio', ascending=False).head(10)
Out[277]:
In [278]:
# Summary statistics of the extended table, for comparison with the top-ratio rows.
p_df_ext.describe()
Out[278]:
In [293]:
# wins[i][j] = number of combats in which the Pokémon with pid i + 1 defeated
# the Pokémon with pid j + 1 (pids are shifted by one to get array positions).
wins = np.zeros(shape=(800, 800))

first = c_df['First_pokemon'].values
second = c_df['Second_pokemon'].values
first_won = first == c_df['Winner'].values

# When the first Pokémon is not the winner, the second one is taken as winner.
victor = np.where(first_won, first, second) - 1
loser = np.where(first_won, second, first) - 1

# np.add.at accumulates correctly when the same (victor, loser) pair occurs
# several times, unlike `wins[victor, loser] += 1`.
np.add.at(wins, (victor, loser), 1)
In [294]:
# Dominance graph: G[i][j] = 1 when i beat j strictly more often than j beat i,
# 0 otherwise (including ties and pairs that never met). Comparing the matrix
# with its transpose covers every ordered pair at once; the former `elif`
# repeated the `if` condition and could never run.
G = (wins > wins.T).astype(float)

# A[i][j] = direct dominance of i over j plus the number of two-step dominance
# paths i -> k -> j.
A = G + (G @ G)
In [310]:
# Dominance score of each Pokémon: row sum of A.
scores = A.sum(axis=1)

# Row position p of the matrices corresponds to pid p + 1 (the wins matrix was
# filled using pid - 1), so positions must be shifted back before they are
# compared with `pid`. Highest score first.
top_pids = np.argsort(scores)[-10:][::-1] + 1
top_dominance = pd.DataFrame({'pid': top_pids, 'DominanceScore': scores[top_pids - 1]})

# A left merge keeps the ranking order of `top_dominance`.
top_dominance.merge(p_df, on='pid', how='left')
Out[310]:
In [ ]: