In [1]:
import re
import os
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
from ipywidgets import interact

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

import matplotlib
import matplotlib.pyplot as plt
import json
%matplotlib inline

import findspark
findspark.init()

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import min

from pyspark.sql import SparkSession
from pyspark import SparkContext

from pandas.plotting import scatter_matrix
from datetime import datetime, timedelta

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [9]:
p_df = pd.read_csv('pokemon.csv')
c_df = pd.read_csv('combats.csv')

display(p_df.head(5))
display(c_df.head(5))


pid Name Class 1 Class 2 HP Attack Defense Sp. Atk Sp. Def Speed Legendary
0 1 Bulbasaur Grass Poison 45 49 49 65 65 45 False
1 2 Ivysaur Grass Poison 60 62 63 80 80 60 False
2 3 Venusaur Grass Poison 80 82 83 100 100 80 False
3 4 Mega Venusaur Grass Poison 80 100 123 122 120 80 False
4 5 Charmander Fire NaN 39 52 43 60 50 65 False
First_pokemon Second_pokemon Winner
0 266 298 298
1 702 701 701
2 191 668 668
3 237 683 683
4 151 231 151

In [8]:
display(p_df.describe())
display(c_df.describe())


pid HP Attack Defense Sp. Atk Sp. Def Speed
count 800.0000 800.000000 800.000000 800.000000 800.000000 800.000000 800.000000
mean 400.5000 69.258750 79.001250 73.842500 72.820000 71.902500 68.277500
std 231.0844 25.534669 32.457366 31.183501 32.722294 27.828916 29.060474
min 1.0000 1.000000 5.000000 5.000000 10.000000 20.000000 5.000000
25% 200.7500 50.000000 55.000000 50.000000 49.750000 50.000000 45.000000
50% 400.5000 65.000000 75.000000 70.000000 65.000000 70.000000 65.000000
75% 600.2500 80.000000 100.000000 90.000000 95.000000 90.000000 90.000000
max 800.0000 255.000000 190.000000 230.000000 194.000000 230.000000 180.000000
First_pokemon Second_pokemon Winner
count 50000.000000 50000.000000 50000.000000
mean 400.495640 403.159660 408.890100
std 229.549429 230.083644 231.159961
min 1.000000 1.000000 1.000000
25% 203.000000 207.000000 206.000000
50% 399.000000 402.000000 422.000000
75% 597.250000 602.000000 606.000000
max 800.000000 800.000000 800.000000

In [16]:
display(p_df['Class 1'].unique())
display(p_df['Class 2'].unique())


array(['Grass', 'Fire', 'Water', 'Bug', 'Normal', 'Poison', 'Electric',
       'Ground', 'Fairy', 'Fighting', 'Psychic', 'Rock', 'Ghost', 'Ice',
       'Dragon', 'Dark', 'Steel', 'Flying'], dtype=object)
array(['Poison', nan, 'Flying', 'Dragon', 'Ground', 'Fairy', 'Grass',
       'Fighting', 'Psychic', 'Steel', 'Ice', 'Rock', 'Dark', 'Water',
       'Electric', 'Fire', 'Ghost', 'Bug', 'Normal'], dtype=object)

In [17]:
p_df.hist(column = 'Attack')
p_df.hist(column = 'Defense')


Out[17]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1097a0da0>]],
      dtype=object)

In [38]:
ax = p_df.hist(column='Sp. Atk', alpha = 0.5)
p_df.hist(column='Sp. Def', ax = ax, alpha = 0.5)
plt.legend(['Sp. Atk', 'Sp. Def'])
plt.title("Sp. Atk + Sp. Def")


Out[38]:
Text(0.5, 1.0, 'Sp. Atk + Sp. Def')

In [39]:
p_df.plot(kind = 'scatter', x = 'Sp. Atk', y = 'Sp. Def')


Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a24feee48>

In [55]:
p_df['Attack/Defense'] = p_df['Attack'] / p_df['Defense']
display(p_df.sort_values(by=['Attack/Defense'], ascending = False)[:3])
print("list the names of the 3 Pokémon with highest attack-over-defense ratio:\n")
print("\n".join(p_df.sort_values(by=['Attack/Defense'], ascending = False)[:3]['Name'].tolist()))
display(p_df.sort_values(by=['Attack/Defense'], ascending = True)[:3])
print("list the names of the 3 Pokémon with lowest attack-over-defense ratio:\n")
print("\n".join(p_df.sort_values(by=['Attack/Defense'], ascending = True)[:3]['Name'].tolist()))


pid Name Class 1 Class 2 HP Attack Defense Sp. Atk Sp. Def Speed Legendary Attack/Defense
429 430 DeoxysAttack Forme Psychic NaN 50 180 20 180 20 150 True 9.00
347 348 Carvanha Water Dark 45 90 20 65 20 65 False 4.50
19 20 Mega Beedrill Bug Poison 65 150 40 15 80 145 False 3.75
list the names of the 3 Pokémon with highest attack-over-defense ratio:

DeoxysAttack Forme
Carvanha
Mega Beedrill
pid Name Class 1 Class 2 HP Attack Defense Sp. Atk Sp. Def Speed Legendary Attack/Defense
230 231 Shuckle Bug Rock 20 10 230 10 230 5 False 0.043478
139 140 Magikarp Water NaN 20 10 55 15 20 80 False 0.181818
484 485 Bronzor Steel Psychic 57 24 86 24 86 23 False 0.279070
list the names of the 3 Pokémon with lowest attack-over-defense ratio:

Shuckle
Magikarp
Bronzor

In [79]:
display(c_df.head(5))

print('list the names of the 10 Pokémon with the largest number of victories.\n')
top_df = c_df.groupby('Winner').size().reset_index(name='counts').sort_values(by='counts', ascending = False)[:10]
print("\n".join(top_df.merge(p_df, left_on = 'Winner', right_on = 'pid')['Name'].tolist()))


First_pokemon Second_pokemon Winner
0 266 298 298
1 702 701 701
2 191 668 668
3 237 683 683
4 151 231 151
list the names of the 10 Pokémon with the largest number of victories.

Mewtwo
Aerodactyl
Infernape
Jirachi
Deoxys Speed Forme
Slaking
Murkrow
Mega Absol
Mega Houndoom
Mega Aerodactyl

In [110]:
grass_class = p_df[(p_df['Class 1'] == 'Grass') | (p_df['Class 2'] == 'Grass') &
                   ~((p_df['Class 1'] != 'Rock') | (p_df['Class 2'] == 'Rock'))]
rock_class = p_df[(p_df['Class 1'] == 'Rock') | (p_df['Class 2'] == 'Rock') &
                  ~((p_df['Class 1'] != 'Grass') | (p_df['Class 2'] == 'Grass'))]
display(grass_class.head(5))
display(rock_class.head(5))

f, (ax1, ax2) = plt.subplots(1, 2, sharey = True)
grass_class.boxplot(column = 'Attack', return_type='axes', ax = ax1)
rock_class.boxplot(column = 'Attack', ax = ax2)


pid Name Class 1 Class 2 HP Attack Defense Sp. Atk Sp. Def Speed Legendary Attack/Defense
0 1 Bulbasaur Grass Poison 45 49 49 65 65 45 False 1.000000
1 2 Ivysaur Grass Poison 60 62 63 80 80 60 False 0.984127
2 3 Venusaur Grass Poison 80 82 83 100 100 80 False 0.987952
3 4 Mega Venusaur Grass Poison 80 100 123 122 120 80 False 0.813008
48 49 Oddish Grass Poison 45 50 55 75 65 30 False 0.909091
pid Name Class 1 Class 2 HP Attack Defense Sp. Atk Sp. Def Speed Legendary Attack/Defense
80 81 Geodude Rock Ground 40 80 100 30 30 20 False 0.800000
81 82 Graveler Rock Ground 55 95 115 45 45 35 False 0.826087
82 83 Golem Rock Ground 80 120 130 55 65 45 False 0.923077
103 104 Onix Rock Ground 35 45 160 30 45 70 False 0.281250
149 150 Omanyte Rock Water 35 40 100 90 55 35 False 0.400000
Out[110]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a274102b0>

In [113]:
spark.sql("""
    SELECT Pokemons.Winner, Pokemons.Name, COUNT(*) as TotalWins
    FROM Combats
    INNER JOIN Pokemons on Pokemons.pid = Combats.Winner
    GROUP BY Pokemnon.Winner, Pokemons.Name
    ORDER BY TotalWins DESC
""")


-------------------------------------------------------------------
Py4JJavaError                     Traceback (most recent call last)
~/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:

~/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:

Py4JJavaError: An error occurred while calling o23.sql.
: org.apache.spark.sql.AnalysisException: Table or view not found: Combats; line 3 pos 9
	at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:47)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.org$apache$spark$sql$catalyst$analysis$Analyzer$ResolveRelations$$lookupTableFromCatalog(Analyzer.scala:665)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.resolveRelation(Analyzer.scala:617)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$$anonfun$apply$8.applyOrElse(Analyzer.scala:647)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$$anonfun$apply$8.applyOrElse(Analyzer.scala:640)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:289)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:289)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:288)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
	at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
	at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:286)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
	at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
	at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:286)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
	at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
	at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:286)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.apply(Analyzer.scala:640)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.apply(Analyzer.scala:586)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:87)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:84)
	at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:124)
	at scala.collection.immutable.List.foldLeft(List.scala:84)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:84)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:76)
	at scala.collection.immutable.List.foreach(List.scala:381)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:76)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:124)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:118)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:103)
	at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:57)
	at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:55)
	at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:47)
	at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:74)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:642)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.sql.catalyst.analysis.NoSuchTableException: Table or view 'combats' not found in database 'default';
	at org.apache.spark.sql.catalyst.catalog.ExternalCatalog.requireTableExists(ExternalCatalog.scala:46)
	at org.apache.spark.sql.catalyst.catalog.InMemoryCatalog.getTable(InMemoryCatalog.scala:326)
	at org.apache.spark.sql.catalyst.catalog.SessionCatalog.lookupRelation(SessionCatalog.scala:669)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.org$apache$spark$sql$catalyst$analysis$Analyzer$ResolveRelations$$lookupTableFromCatalog(Analyzer.scala:662)
	... 54 more


During handling of the above exception, another exception occurred:

AnalysisException                 Traceback (most recent call last)
<ipython-input-113-6ca29bab6293> in <module>()
      5     GROUP BY Pokemnon.Winner, Pokemons.Name
      6     ORDER BY TotalWins DESC
----> 7 """)

~/spark/python/pyspark/sql/session.py in sql(self, sqlQuery)
    708         [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]
    709         """
--> 710         return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
    711 
    712     @since(2.0)

~/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
-> 1257             answer, self.gateway_client, self.target_id, self.name)
   1258 
   1259         for temp_arg in temp_args:

~/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     67                                              e.java_exception.getStackTrace()))
     68             if s.startswith('org.apache.spark.sql.AnalysisException: '):
---> 69                 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
     70             if s.startswith('org.apache.spark.sql.catalyst.analysis'):
     71                 raise AnalysisException(s.split(': ', 1)[1], stackTrace)

AnalysisException: 'Table or view not found: Combats; line 3 pos 9'

In [209]:
X_ext = c_df.merge(p_df, left_on='First_pokemon', right_on='pid') \
    .merge(p_df, left_on='Second_pokemon', right_on='pid', suffixes=('_x', '_y'))
X = X_ext.drop(columns=['Winner', 'First_pokemon', 'Second_pokemon', 'pid_x', 'pid_y', 'Name_x', 'Name_y', 'Attack/Defense_x', 'Attack/Defense_y'])

categories = pd.unique(p_df[['Class 1', 'Class 2']].values.ravel('K'))[:-1]

X['Class 1_x'] = pd.Categorical(X['Class 1_x'], categories=categories).codes
X['Class 1_y'] = pd.Categorical(X['Class 1_y'], categories=categories).codes

X['Class 2_x'] = pd.Categorical(X['Class 2_x'], categories=categories).codes
X['Class 2_y'] = pd.Categorical(X['Class 2_y'], categories=categories).codes

display(X)
Y = X_ext['Winner'] == X_ext['First_pokemon']


Class 1_x Class 2_x HP_x Attack_x Defense_x Sp. Atk_x Sp. Def_x Speed_x Legendary_x Class 1_y Class 2_y HP_y Attack_y Defense_y Sp. Atk_y Sp. Def_y Speed_y Legendary_y
0 11 7 50 64 50 45 50 41 False 0 15 70 70 40 60 40 60 False
1 9 -1 70 80 50 35 35 35 False 0 15 70 70 40 60 40 60 False
2 2 7 110 78 73 76 71 60 False 0 15 70 70 40 60 40 60 False
3 6 17 50 65 107 105 107 86 False 0 15 70 70 40 60 40 60 False
4 4 -1 73 115 60 60 60 90 False 0 15 70 70 40 60 40 60 False
5 0 16 44 50 91 24 86 10 False 0 15 70 70 40 60 40 60 False
6 1 4 62 50 58 73 54 72 False 0 15 70 70 40 60 40 60 False
7 10 -1 70 55 95 95 110 65 False 0 15 70 70 40 60 40 60 False
8 2 -1 80 92 65 65 80 68 False 0 15 70 70 40 60 40 60 False
9 0 -1 100 100 125 110 50 50 False 0 15 70 70 40 60 40 60 False
10 10 17 72 58 80 103 80 97 False 0 15 70 70 40 60 40 60 False
11 4 -1 85 80 70 135 75 90 False 0 15 70 70 40 60 40 60 False
12 5 -1 100 73 83 73 83 55 False 0 15 70 70 40 60 40 60 False
13 5 -1 100 73 83 73 83 55 False 0 15 70 70 40 60 40 60 False
14 4 17 40 60 30 31 31 70 False 0 15 70 70 40 60 40 60 False
15 10 -1 75 75 75 125 95 40 False 0 15 70 70 40 60 40 60 False
16 5 3 40 50 90 30 55 65 False 0 15 70 70 40 60 40 60 False
17 12 0 85 100 122 58 75 54 False 0 15 70 70 40 60 40 60 False
18 3 12 1 90 45 30 30 40 False 0 15 70 70 40 60 40 60 False
19 1 17 106 130 90 110 154 90 True 0 15 70 70 40 60 40 60 False
20 5 7 90 92 87 75 85 76 False 0 15 70 70 40 60 40 60 False
21 5 2 50 60 60 60 60 30 False 0 15 70 70 40 60 40 60 False
22 0 -1 75 89 85 55 65 36 False 0 15 70 70 40 60 40 60 False
23 3 0 35 70 55 45 55 25 False 0 15 70 70 40 60 40 60 False
24 4 8 50 20 40 20 40 20 False 0 15 70 70 40 60 40 60 False
25 4 8 115 45 20 45 25 20 False 0 15 70 70 40 60 40 60 False
26 1 -1 90 110 80 100 80 95 False 0 15 70 70 40 60 40 60 False
27 2 -1 41 56 40 62 44 71 False 0 15 70 70 40 60 40 60 False
28 2 5 40 40 35 50 100 70 False 0 15 70 70 40 60 40 60 False
29 1 -1 45 60 40 70 50 45 False 0 15 70 70 40 60 40 60 False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
49970 1 -1 65 63 45 45 45 45 False 0 13 60 62 50 62 60 40 False
49971 14 -1 41 64 45 50 50 50 False 0 13 60 62 50 62 60 40 False
49972 12 -1 40 70 130 60 130 25 False 0 13 60 62 50 62 60 40 False
49973 6 -1 75 100 63 80 63 116 False 0 13 60 62 50 62 60 40 False
49974 6 -1 70 75 60 105 60 105 False 0 13 60 62 50 62 60 40 False
49975 3 16 70 150 140 65 100 75 False 0 13 60 62 50 62 60 40 False
49976 1 16 91 90 106 130 106 77 True 0 13 60 62 50 62 60 40 False
49977 12 1 60 40 60 95 60 55 False 0 13 60 62 50 62 60 40 False
49978 6 -1 55 40 40 65 45 35 False 0 13 60 62 50 62 60 40 False
49979 10 -1 50 70 160 70 160 90 True 0 13 60 62 50 62 60 40 False
49980 11 16 60 52 168 47 138 30 False 0 13 60 62 50 62 60 40 False
49981 16 11 70 110 180 60 60 50 False 0 13 60 62 50 62 60 40 False
49982 1 -1 55 70 45 70 50 60 False 0 13 60 62 50 62 60 40 False
49983 0 -1 60 62 80 63 80 60 False 0 13 60 62 50 62 60 40 False
49984 11 16 60 55 145 75 150 40 False 0 13 60 62 50 62 60 40 False
49985 4 9 65 136 94 54 96 135 False 0 13 60 62 50 62 60 40 False
49986 2 -1 55 65 35 60 30 85 False 0 13 60 62 50 62 60 40 False
49987 12 -1 64 165 75 93 83 75 False 0 13 60 62 50 62 60 40 False
49988 7 17 65 75 105 35 65 85 False 0 13 60 62 50 62 60 40 False
49989 12 17 150 80 44 90 54 80 False 0 13 60 62 50 62 60 40 False
49990 6 -1 85 115 80 105 80 50 False 0 13 60 62 50 62 60 40 False
49991 9 17 78 92 75 74 63 118 False 0 13 60 62 50 62 60 40 False
49992 9 17 78 92 75 74 63 118 False 0 13 60 62 50 62 60 40 False
49993 4 -1 120 120 120 120 120 120 True 0 13 60 62 50 62 60 40 False
49994 0 8 60 67 85 77 75 116 False 0 13 60 62 50 62 60 40 False
49995 3 -1 38 35 40 27 25 35 False 0 13 60 62 50 62 60 40 False
49996 0 17 55 45 50 45 65 80 False 0 13 60 62 50 62 60 40 False
49997 4 -1 110 160 110 80 110 100 True 0 13 60 62 50 62 60 40 False
49998 4 -1 104 91 63 91 73 68 False 0 13 60 62 50 62 60 40 False
49999 4 -1 55 70 55 40 55 85 False 0 13 60 62 50 62 60 40 False

50000 rows × 18 columns


In [210]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [211]:
N = len(X)
N


Out[211]:
50000

In [212]:
train_size = int(N * 0.9)
test_size = N - train_size
permutation = np.random.permutation(N)
train_set_index = permutation[:train_size]
test_set_index = permutation[train_size:]

print(train_set_index)
print(test_set_index)


[21675 20249 15069 ... 36014 19785 39193]
[42619  7774 43972 ... 46959 47569 25391]

In [213]:
X_train = X.iloc[train_set_index]
Y_train = Y.iloc[train_set_index]

X_test = X.iloc[test_set_index]
Y_test = Y.iloc[test_set_index]

In [231]:
n_estimators = [10, 25, 50, 100]
max_depths = [2, 4, 10]

def k_fold(X, Y, K):
    permutation = np.random.permutation(N)
    for k in range(K):
        X_test = X.iloc[permutation[k * test_size : (k + 1) * test_size]]
        Y_test = Y.iloc[permutation[k * test_size : (k + 1) * test_size]]
        
        X_train = X.iloc[permutation[:k*test_size].tolist() + permutation[(k + 1)*test_size:].tolist()]
        Y_train = Y.iloc[permutation[:k*test_size].tolist() + permutation[(k + 1)*test_size:].tolist()]
        yield(X_train, Y_train, X_test, Y_test)

best_acc = 0
best_n_est = 0
best_max_depth = 0
for n_estimator in n_estimators:
    for max_depth in max_depths:
        clf = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth, random_state=0)
        accuracies = []
        for (X_train, Y_train, X_test, Y_test) in k_fold(X, Y, 5):
            clf.fit(X_train, Y_train)
            accuracies.append((clf.predict(X_test) == Y_test).sum() / test_size)
  
        accuracy = np.mean(accuracies)
        print(n_estimator, max_depth, accuracy)
        if accuracy > best_acc:
            best_acc = accuracy
            best_n_est = n_estimator
            best_max_depth = max_depth
            
print('Best accuracy: ', best_acc)
print('Best number of estimators: ', best_n_est)
print('Best max depth: ', best_max_depth)


10 2 0.7933200000000001
10 4 0.85808
10 10 0.9268000000000001
25 2 0.8106
25 4 0.857
25 10 0.9296399999999998
50 2 0.80632
50 4 0.86784
50 10 0.93212
100 2 0.80808
100 4 0.87744
100 10 0.93596
Best accuracy:  0.93596
Best number of estimators:  100
Best max depth:  10

In [234]:
forest = RandomForestClassifier(n_estimators=best_n_est, max_depth=best_max_depth, random_state=0)
forest.fit(X_train, Y_train)


Out[234]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [244]:
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%s) (%f)" % (f + 1, indices[f], X.columns[indices[f]], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()


Feature ranking:
1. feature 7 (Speed_x) (0.367536)
2. feature 16 (Speed_y) (0.348282)
3. feature 12 (Attack_y) (0.041669)
4. feature 3 (Attack_x) (0.038579)
5. feature 5 (Sp. Atk_x) (0.029665)
6. feature 14 (Sp. Atk_y) (0.026784)
7. feature 2 (HP_x) (0.020175)
8. feature 11 (HP_y) (0.017509)
9. feature 6 (Sp. Def_x) (0.017483)
10. feature 15 (Sp. Def_y) (0.017186)
11. feature 4 (Defense_x) (0.013039)
12. feature 13 (Defense_y) (0.012665)
13. feature 10 (Class 2_y) (0.011253)
14. feature 1 (Class 2_x) (0.010887)
15. feature 9 (Class 1_y) (0.008595)
16. feature 0 (Class 1_x) (0.007540)
17. feature 8 (Legendary_x) (0.006054)
18. feature 17 (Legendary_y) (0.005099)

(5 points) Compute the winning ratio (number of wins divided by number of battles) for all Pokémon. Show the 10 Pokémon with the highest ratio and describe what they have in common with respect to their features. Discuss your results about feature importance from question 2.7 (regarding feature importance) in this context.l


In [274]:
top_df = c_df.groupby('Winner').size().reset_index(name='WinCount').sort_values(by='WinCount', ascending = False)

first_df = c_df.groupby('First_pokemon').size().reset_index(name='Battles').sort_values(by='Battles', ascending = False)
second_df = c_df.groupby('Second_pokemon').size().reset_index(name='Battles').sort_values(by='Battles', ascending = False)
merged = first_df.merge(second_df, left_on = 'First_pokemon', right_on='Second_pokemon')

merged['Battles'] = merged['Battles_x'] + merged['Battles_y']
merged = merged.drop(columns = ['Second_pokemon', 'Battles_x', "Battles_y"])

p_df_ext = p_df.merge(top_df, left_on='pid', right_on='Winner')
p_df_ext = p_df_ext.merge(merged, left_on='pid', right_on='First_pokemon')
p_df_ext = p_df_ext.drop(columns = ['First_pokemon', 'Winner'])

p_df_ext["WinninRatio"] = p_df_ext['WinCount'] / p_df_ext['Battles']

display(p_df_ext.head(5))


pid Name Class 1 Class 2 HP Attack Defense Sp. Atk Sp. Def Speed Legendary Attack/Defense WinCount Battles WinninRatio
0 1 Bulbasaur Grass Poison 45 49 49 65 65 45 False 1.000000 37 133 0.278195
1 2 Ivysaur Grass Poison 60 62 63 80 80 60 False 0.984127 46 121 0.380165
2 3 Venusaur Grass Poison 80 82 83 100 100 80 False 0.987952 89 132 0.674242
3 4 Mega Venusaur Grass Poison 80 100 123 122 120 80 False 0.813008 70 125 0.560000
4 5 Charmander Fire NaN 39 52 43 60 50 65 False 1.209302 55 112 0.491071

In [277]:
p_df_ext.sort_values(by = 'WinninRatio', ascending = False)[:10]


Out[277]:
pid Name Class 1 Class 2 HP Attack Defense Sp. Atk Sp. Def Speed Legendary Attack/Defense WinCount Battles WinninRatio
147 155 Mega Aerodactyl Rock Flying 80 135 85 70 95 150 False 1.588235 127 129 0.984496
499 513 Weavile Dark Ice 70 120 65 45 85 125 False 1.846154 116 119 0.974790
687 704 Tornadus Therian Forme Flying NaN 79 100 80 110 90 121 True 1.250000 121 125 0.968000
18 20 Mega Beedrill Bug Poison 65 150 40 15 80 145 False 3.750000 115 119 0.966387
146 154 Aerodactyl Rock Flying 80 105 65 60 75 130 False 1.615385 136 141 0.964539
464 477 Mega Lopunny Normal Fighting 65 136 94 54 96 135 False 1.446809 124 129 0.961240
710 727 Greninja Water Dark 72 95 67 103 71 122 False 1.417910 122 127 0.960630
700 717 Meloetta Pirouette Forme Normal Fighting 100 128 90 77 77 128 False 1.422222 118 123 0.959350
157 165 Mega Mewtwo Y Psychic NaN 106 150 70 194 120 140 True 2.142857 119 125 0.952000
338 350 Mega Sharpedo Water Dark 70 140 70 110 65 105 False 2.000000 114 120 0.950000

In [278]:
p_df_ext.describe()


Out[278]:
pid HP Attack Defense Sp. Atk Sp. Def Speed Attack/Defense WinCount Battles WinninRatio
count 783.000000 783.000000 783.000000 783.000000 783.000000 783.000000 783.000000 783.000000 783.000000 783.000000 783.000000
mean 402.873563 69.140485 79.007663 73.699872 72.839080 71.749681 68.443167 1.164354 63.856960 127.541507 0.501538
std 230.297452 25.348783 32.502566 30.879737 32.672868 27.248993 29.158076 0.552715 32.925941 11.397402 0.254993
min 1.000000 1.000000 5.000000 5.000000 10.000000 20.000000 5.000000 0.181818 3.000000 92.000000 0.021739
25% 204.500000 50.000000 55.000000 50.000000 50.000000 50.000000 45.000000 0.826267 36.000000 120.000000 0.284228
50% 403.000000 65.000000 75.000000 70.000000 65.000000 70.000000 65.000000 1.076923 62.000000 128.000000 0.491071
75% 601.500000 80.000000 100.000000 90.000000 95.000000 90.000000 90.000000 1.416026 91.000000 135.000000 0.717644
max 800.000000 255.000000 190.000000 230.000000 194.000000 200.000000 180.000000 9.000000 152.000000 164.000000 0.984496

In [293]:
wins = np.zeros(shape = (800, 800))

for row in c_df.iterrows():
    if row[1]['First_pokemon'] == row[1]['Winner']:
        wins[row[1]['First_pokemon'] - 1][row[1]['Second_pokemon'] - 1] += 1
    else:
        wins[row[1]['Second_pokemon'] - 1][row[1]['First_pokemon'] - 1] += 1

In [294]:
G = np.zeros(shape = (800, 800))

for i in range(800):
    for j in range(800):
        if wins[i][j] > wins[j][i]:
            G[i][j] = 1
        elif wins[i][j] > wins[j][i]:
            G[j][i] = 1

A = G + (G @ G)

In [310]:
scores = A.sum(axis = 1)
p_df[p_df['pid'].isin(np.argsort(scores)[-10:])]


Out[310]:
pid Name Class 1 Class 2 HP Attack Defense Sp. Atk Sp. Def Speed Legendary Attack/Defense
145 146 Vaporeon Water NaN 130 65 60 110 95 65 False 1.083333
152 153 Kabutops Rock Water 60 115 105 65 70 80 False 1.095238
153 154 Aerodactyl Rock Flying 80 105 65 60 75 130 False 1.615385
161 162 Dragonite Dragon Flying 91 134 95 100 100 80 False 1.410526
162 163 Mewtwo Psychic NaN 106 110 90 154 90 130 True 1.222222
247 248 Houndoom Dark Fire 75 90 50 110 80 95 False 1.800000
392 393 Absol Dark NaN 65 130 60 75 60 75 False 2.166667
425 426 Rayquaza Dragon Flying 105 150 90 150 90 95 True 1.666667
428 429 Deoxys Normal Forme Psychic NaN 50 150 50 150 50 150 True 3.000000
430 431 Deoxys Defense Forme Psychic NaN 50 70 160 70 160 90 True 0.437500

In [ ]: