In [1]:
# Select matplotlib's inline backend so figures render directly in the notebook.
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
# Use seaborn's 'notebook' plotting context with fonts scaled by 1.5.
sns.set_context(context='notebook', font_scale=1.5)
1. (40 points) Read in the CSV file pokemon.csv in the local directory (Source: Kaggle). Do the following:
- […] `Type 2` without creating a copy of the data frame, i.e. in-place (5 points)
- […] `Speed` in-place (5 points)
- […] `value = 3*HP + 2*Attack + 1*Defense` (5 points)
- […] `Forme` in the `Name` column in-place (5 points)
- […] `Attack` and `Defense` attributes of all the `Type 1` AND `Generation` subgroups. For instance, one such group would be (Grass, 1). (10 points)

Note: If you change the data frame, print out the first 3 rows after each change with the `head` method.
In [4]:
# Your answer here
In [7]:
# Quick look at the raw file: read it and keep only the first three rows.
pd.read_csv('pokemon.csv').iloc[:3]
Out[7]:
In [6]:
# Load the Pokemon table into `df1`, then preview its first three rows.
df1 = pd.read_csv(filepath_or_buffer='pokemon.csv')
df1.head(n=3)
Out[6]:
In [7]:
# (number of rows, number of columns) of `df1`.
len(df1.index), len(df1.columns)
Out[7]:
In [8]:
# Remove the '#' column from `df1` itself (no copy is made), then preview.
df1.drop(labels=['#'], axis='columns', inplace=True)
df1.head(n=3)
Out[8]:
In [9]:
# Inspect three randomly chosen rows. The fixed seed keeps the displayed
# sample identical every time the notebook is re-run from a fresh kernel.
df1.sample(n=3, random_state=0)
Out[9]:
In [10]:
# Reorder `df1` itself so that the highest 'Speed' values come first.
df1.sort_values(by='Speed', ascending=False, inplace=True)
df1.head(n=3)
Out[10]:
In [11]:
# 'Value' is a weighted sum of three stats: 3*HP + 2*Attack + 1*Defense.
stat_weights = {'HP': 3, 'Attack': 2, 'Defense': 1}
df1['Value'] = sum(weight * df1[stat] for stat, weight in stat_weights.items())
df1.head(n=3)
Out[11]:
In [12]:
# Flag names containing 'Forme', then delete those rows from `df1` itself.
is_forme = df1['Name'].str.contains('Forme')
df1.drop(labels=df1.loc[is_forme].index, axis='index', inplace=True)
df1.head(n=3)
Out[12]:
In [13]:
# Mean and variance of 'Attack' and 'Defense' for every
# ('Type 1', 'Generation') subgroup; show the first ten groups.
group_keys = ['Type 1', 'Generation']
stats = ['Attack', 'Defense']
grouped = df1.groupby(by=group_keys)
summary = grouped[stats].aggregate(['mean', 'var'])
summary.head(n=10)
Out[13]:
2. (30 points) Using the same Pokemon data frame, do the following:
- […] `Name`, `Type 1`, `Generation`, `Feature`, `Score`, where `Name`, `Type 1` and `Generation` have the same meaning as in the original data frame, `Feature` is a column containing one of the strings `HP`, `Attack`, `Defense`, `Sp. Atk`, `Sp. Def`, `Speed`, and `Score` is the numerical value of the feature. This is known as going from wide to tall format. In R, this operation can be done using the `gather` function from the `tidyr` package. (10 points)
- […] `seaborn` package, create a grid of box plots where the x-axis shows the `Feature` values, the y-axis shows the `Score`, the rows are the `Type 1` values, and the columns are the `Generation` values. (10 points)
- […] `seaborn`, make a cluster map showing the mean values of `HP`, `Attack`, `Defense`, `Sp. Atk`, `Sp. Def` and `Speed` for each `Type 1` Pokemon. Rotate the `Type 1` labels so they are readable. (10 points)
In [14]:
# Your answer here
In [9]:
# Reshape wide -> tall: one row per (Pokemon, stat), with the stat name in
# 'Feature' and its numeric value in 'Score'.
id_columns = ['Name', 'Type 1', 'Generation']
stat_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
df1a = df1.melt(
    id_vars=id_columns,
    value_vars=stat_columns,
    var_name='Feature',
    value_name='Score',
)
df1a.head(n=3)
Out[9]:
In [16]:
# Grid of box plots: one row per 'Type 1', one column per 'Generation',
# 'Feature' on the x-axis and 'Score' on the y-axis.
# `factorplot` was renamed to `catplot` and later removed from seaborn, so
# `catplot` is called directly here.
sns.catplot(x='Feature', y='Score', row='Type 1', col='Generation',
            kind='box', data=df1a)
pass  # end on a statement so the returned grid object is not echoed
In [17]:
# Mean of each base stat per 'Type 1', drawn as a clustered heat map.
# The columns are selected with a list: selecting several columns from a
# groupby with a bare tuple (['HP', 'Attack', ...]) is rejected by current
# pandas.
stat_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
df1b = df1.groupby('Type 1')[stat_columns].mean()
cg = sns.clustermap(df1b)
# Keep the row ('Type 1') tick labels horizontal so they are readable.
plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
pass  # end on a statement so the `setp` return value is not echoed
3. (30 points) Read in the CSV file pokemonGO.csv in the local directory (Source: Kaggle). Do the following:
- […] `pokemon.csv` and `pokemonGO.csv` files. Drop any row that does not have `Name`, `Type 1` and `Type 2` values that are exactly the same in both data frames. (10 points)
In [10]:
# Your answer here
In [11]:
# Load the Pokemon GO table into `df2`, then preview its first three rows.
df2 = pd.read_csv(filepath_or_buffer='pokemonGO.csv')
df2.head(n=3)
Out[11]:
In [12]:
# Inner-join the two tables, keeping only rows whose 'Name', 'Type 1' and
# 'Type 2' agree in both. The keys are spelled out instead of relying on
# pandas' default of joining on every column name the frames happen to share.
df = pd.merge(df1, df2, how='inner', on=['Name', 'Type 1', 'Type 2'])
df.head(3)
Out[12]:
In [13]:
import os
# Import the submodule explicitly: a bare `import urllib` does not load
# `urllib.request`, so `urllib.request.urlretrieve` would only work if some
# other library had already imported it.
import urllib.request
In [14]:
# Image URLs of every merged Pokemon with Speed above 120. Each image is
# downloaded into the working directory under the last component of its URL.
is_fast = df['Speed'] > 120
urls = df.loc[is_fast, 'Image URL']
for url in urls:
    name = os.path.basename(url)
    urllib.request.urlretrieve(url, name)
In [15]:
from IPython.display import display, Image
In [16]:
# Show each downloaded image (looked up by the file name taken from its URL)
# at a width of 100.
for name in map(os.path.basename, urls):
    display(Image(name, width=100))
In [17]:
# `scipy.misc.imread` was removed from SciPy, so take `imread` from
# matplotlib, which this notebook already depends on.
# NOTE(review): the array dtype/scale returned for some formats may differ
# from the old SciPy function; the arrays are only passed to `imshow` below.
from matplotlib.pyplot import imread
In [18]:
# Read every downloaded image file (named after the last component of its
# URL) into a list, in the same order as `urls`.
imgs = [imread(os.path.basename(url)) for url in urls]
In [19]:
# Show all loaded images side by side in one row, without axis ticks.
sns.set_style('white')
# Size the grid from the data instead of hard-coding four columns, so the
# cell still works when the Speed filter selects a different number of images.
n_imgs = len(imgs)
if n_imgs:  # plt.subplots cannot create a grid with zero columns
    # squeeze=False always returns a 2-D array of axes (even for one image);
    # flatten it so `axes` stays a 1-D sequence.
    fig, axes = plt.subplots(1, n_imgs, squeeze=False)
    axes = axes.ravel()
    for ax, img in zip(axes, imgs):
        ax.imshow(img)
        ax.set_xticks([])
        ax.set_yticks([])
In [ ]: