In [1]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
# Use seaborn's 'notebook' plotting context with font sizes scaled by 1.5.
sns.set_context(context='notebook', font_scale=1.5)
1. (40 points) Read in the CSV file pokemon.csv
in the local directory (Source: Kaggle). Do the following:
- Drop the column `Type 2` without creating a copy of the data frame i.e. in-place (5 points)
- Sort the data frame in decreasing order of `Speed` in-place (5 points)
- Create a new column `Value` where `value = 3*HP + 2*Attack + 1*Defense` (5 points)
- Remove all rows that contain the string `Forme` in the `Name` column in-place (5 points)
- Find the mean and variance of the `Attack` and `Defense` attributes of all the `Type 1` AND `Generation` subgroups. For instance, one such group would be (Grass, 1). (10 points)

Note: If you change the data frame, print out the first 3 rows after each change with the `head` method.
In [4]:
# Your answer here
In [7]:
# Peek at the first three rows of the raw CSV without binding the frame to a name.
pd.read_csv(filepath_or_buffer='pokemon.csv').head(n=3)
Out[7]:
In [6]:
# Load the Pokemon table into `df1`; the later question 1 and 2 cells operate on this frame.
df1 = pd.read_csv(filepath_or_buffer='pokemon.csv')
df1.head(n=3)
Out[6]:
In [7]:
# (number of rows, number of columns) of the loaded frame.
df1.shape
Out[7]:
In [8]:
# Remove the '#' column in-place (no copy of the frame is made).
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because the column has already been dropped.
df1.drop(columns='#', inplace=True, errors='ignore')
df1.head(3)
Out[8]:
In [9]:
# Show three randomly chosen rows. A fixed random_state makes the displayed
# sample reproducible under Restart Kernel -> Run All.
df1.sample(3, random_state=0)
Out[9]:
In [10]:
# Reorder the rows in-place so that the highest Speed values come first.
df1.sort_values(by='Speed', ascending=False, inplace=True)
df1.head(n=3)
Out[10]:
In [11]:
# Add the derived column Value = 3*HP + 2*Attack + 1*Defense.
hp, attack, defense = df1['HP'], df1['Attack'], df1['Defense']
df1['Value'] = hp * 3 + attack * 2 + defense * 1
df1.head(n=3)
Out[11]:
In [12]:
# Drop, in-place, every row whose Name contains the substring 'Forme'.
is_forme = df1['Name'].str.contains('Forme')
rows = df1.index[is_forme]
df1.drop(index=rows, inplace=True)
df1.head(n=3)
Out[12]:
In [13]:
# Mean and variance of Attack and Defense for every (Type 1, Generation) subgroup.
group_keys = ['Type 1', 'Generation']
stat_names = ['mean', 'var']
grouped = df1.groupby(by=group_keys)
summary = grouped[['Attack', 'Defense']].agg(stat_names)
summary.head(n=10)
Out[13]:
2. (30 points) Using the same Pokemon data frame, do the following:
- Convert the data frame to one with only the following columns: `Name`, `Type 1`, `Generation`, `Feature`, `Score`, where `Name`, `Type 1`, `Generation` have the same meaning as in the original data frame, `Feature` is a column containing one of the following strings: `HP`, `Attack`, `Defense`, `Sp. Atk`, `Sp. Def`, `Speed`, and `Score` is the numerical value of the feature. This is known as going from wide-to-tall formats. In R, this operation can be done using the `gather` function from the `tidyr` package. (10 points)
- Using the `seaborn` package, create a grid of box plots where the x-axis shows the Features, the y-axis shows the 'Score', the rows are the `Type 1` values, and the columns are the `Generation` values. (10 points)
- Using `seaborn`, make a `cluster map` showing the mean values of `HP`, `Attack`, `Defense`, `Sp. Atk`, `Sp. Def` and `Speed` for each `Type 1` Pokemon. Rotate the `Type 1` labels so they are readable. (10 points)
In [14]:
# Your answer here
In [9]:
# Wide-to-tall reshape: one row per (Pokemon, Feature) pair, with the feature's
# numeric value stored in 'Score'.
id_columns = ['Name', 'Type 1', 'Generation']
feature_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
df1a = df1.melt(
    id_vars=id_columns,
    value_vars=feature_columns,
    var_name='Feature',
    value_name='Score',
)
df1a.head(n=3)
Out[9]:
In [16]:
# Grid of box plots: one row per 'Type 1', one column per 'Generation',
# Feature on the x-axis and Score on the y-axis.
# `sns.factorplot` was renamed to `sns.catplot` and later removed from seaborn,
# so `catplot` is used here; kind='box' is passed exactly as before.
# The trailing semicolon suppresses the FacetGrid repr in the cell output.
sns.catplot(x='Feature', y='Score', row='Type 1', col='Generation',
            kind='box', data=df1a);
In [17]:
# Mean of each base stat per 'Type 1', shown as a clustered heat map.
# The columns are selected with a *list* (double brackets): selecting several
# groupby columns with a bare tuple of keys is no longer accepted by pandas.
stat_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
df1b = df1.groupby('Type 1')[stat_columns].mean()
cg = sns.clustermap(df1b)
# 'Type 1' values are the heat-map rows, so their labels sit on the y-axis;
# rotation=0 lays them out horizontally so they are readable.
plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0);
3. (30 points) Read in the CSV file pokemonGO.csv
in the local directory (Source: Kaggle). Do the following:
- Merge the data frames from the `pokemon.csv` and `pokemonGO.csv` files. Drop any row that does not have `Name`, `Type 1` and `Type 2` values that are exactly the same in both data frames. (10 points)
In [10]:
# Your answer here
In [11]:
# Load the Pokemon GO table that is merged with `df1` in the next cell.
df2 = pd.read_csv(filepath_or_buffer='pokemonGO.csv')
df2.head(n=3)
Out[11]:
In [12]:
# Inner-join the two tables, keeping only rows whose Name, Type 1 and Type 2
# are identical in both frames. The keys are spelled out instead of relying on
# pd.merge's default of joining on every shared column name, so the join does
# not silently change if either frame gains another common column.
df = pd.merge(df1, df2, how='inner', on=['Name', 'Type 1', 'Type 2'])
df.head(3)
Out[12]:
In [13]:
import os
import urllib
In [14]:
# Download the image of every merged Pokemon whose Speed exceeds 120 into the
# current directory, named after the last path component of its URL.
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so it is imported explicitly before use.
import urllib.request

urls = df.loc[df['Speed'] > 120, 'Image URL']
for url in urls:
    name = os.path.split(url)[1]
    urllib.request.urlretrieve(url, name)
In [15]:
from IPython.display import display, Image
In [16]:
# Show each downloaded image (looked up by its local file name) at 100 px width.
for name in (os.path.split(link)[1] for link in urls):
    thumbnail = Image(name, width=100)
    display(thumbnail)
In [17]:
# `scipy.misc.imread` has been removed from SciPy; matplotlib (already a
# dependency of this notebook) provides an `imread` that returns the image as
# a NumPy array, which is all the cells below need.
from matplotlib.pyplot import imread
In [18]:
# Read every downloaded image file back in as an array, in the order of `urls`.
local_names = [os.path.split(link)[1] for link in urls]
imgs = [imread(path) for path in local_names]
In [19]:
# Show all loaded images side by side in a single row, without axis ticks.
# The number of panels follows len(imgs) instead of a hard-coded 4, so the
# cell neither raises IndexError with more images nor leaves empty ticked axes
# with fewer. squeeze=False keeps `axes` a 2-D array even for one panel, and
# max(..., 1) avoids asking matplotlib for a zero-column grid.
sns.set_style('white')
n_panels = max(len(imgs), 1)
fig, axes = plt.subplots(1, n_panels, squeeze=False)
for ax, img in zip(axes.ravel(), imgs):
    ax.imshow(img)
    ax.set_xticks([])
    ax.set_yticks([])
In [ ]: