notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('max.columns', None)
from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from sklearn import metrics
from sklearn.metrics import roc_curve, f1_score, accuracy_score, precision_recall_curve, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

%matplotlib inline









    



D:\WorkArea\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)



In [2]:

    
df = pd.read_csv('Pokemon.csv', low_memory=False)



In [3]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 13 columns):
#             800 non-null int64
Name          800 non-null object
Type 1        800 non-null object
Type 2        414 non-null object
Total         800 non-null int64
HP            800 non-null int64
Attack        800 non-null int64
Defense       800 non-null int64
Sp. Atk       800 non-null int64
Sp. Def       800 non-null int64
Speed         800 non-null int64
Generation    800 non-null int64
Legendary     800 non-null bool
dtypes: bool(1), int64(9), object(3)
memory usage: 75.9+ KB



In [4]:

    
df.head()









    Out[4]:







  
    
      
      #
      Name
      Type 1
      Type 2
      Total
      HP
      Attack
      Defense
      Sp. Atk
      Sp. Def
      Speed
      Generation
      Legendary
    
  
  
    
      0
      1
      Bulbasaur
      Grass
      Poison
      318
      45
      49
      49
      65
      65
      45
      1
      False
    
    
      1
      2
      Ivysaur
      Grass
      Poison
      405
      60
      62
      63
      80
      80
      60
      1
      False
    
    
      2
      3
      Venusaur
      Grass
      Poison
      525
      80
      82
      83
      100
      100
      80
      1
      False
    
    
      3
      3
      VenusaurMega Venusaur
      Grass
      Poison
      625
      80
      100
      123
      122
      120
      80
      1
      False
    
    
      4
      4
      Charmander
      Fire
      NaN
      309
      39
      52
      43
      60
      50
      65
      1
      False



In [5]:

    
print(str(len(df[df['Legendary'] == True]) / len(df) * 100) + '%')



In [6]:

    
plt.title('Count Plot')
plt.xticks(rotation = 45)
sns.countplot(df['Type 1'])









    Out[6]:





<matplotlib.axes._subplots.AxesSubplot at 0x178c9d395f8>



In [7]:

    
plt.title('Count Plot')
plt.xticks(rotation = 45)
sns.countplot(df['Type 2'])









    Out[7]:





<matplotlib.axes._subplots.AxesSubplot at 0x178c9fb76d8>



In [8]:

    
sns.distplot(df['Total'])









    Out[8]:





<matplotlib.axes._subplots.AxesSubplot at 0x178ca042ac8>



In [9]:

    
df['Generation'].value_counts()









    Out[9]:





1    166
5    165
3    160
4    121
2    106
6     82
Name: Generation, dtype: int64



In [10]:

    
sns.pairplot(df[['HP','Attack','Defense','Sp. Atk','Sp. Def','Speed']])









    Out[10]:





<seaborn.axisgrid.PairGrid at 0x178ca4b1320>



In [11]:

    
corr = df.corr()



In [12]:

    
plt.figure(figsize=(10,8))
sns.heatmap(corr, cmap='coolwarm', annot=True)









    Out[12]:





<matplotlib.axes._subplots.AxesSubplot at 0x178cbe59cc0>



In [13]:

    
df.describe()



In [14]:

    
df[df['Name'].duplicated()] # no dupliactes









    Out[14]:







  
    
      
      #
      Name
      Type 1
      Type 2
      Total
      HP
      Attack
      Defense
      Sp. Atk
      Sp. Def
      Speed
      Generation
      Legendary



In [15]:

    
pd.crosstab(df['Type 1'] , df['Legendary'])



In [16]:

    
for i in df.columns:
    print(i, len(df[i].unique()))









    



# 721
Name 800
Type 1 18
Type 2 19
Total 200
HP 94
Attack 111
Defense 103
Sp. Atk 105
Sp. Def 92
Speed 108
Generation 6
Legendary 2



In [17]:

    
df['Legendary'] = df['Legendary'].apply(lambda x: 1 if x == True else 0)



In [18]:

    
dataset = df.iloc[:, 2:]



In [19]:

    
dataset.head()



In [20]:

    
dataset = pd.get_dummies(dataset, dummy_na=True,drop_first=True)
dataset['Target'] = dataset['Legendary']
dataset.drop(['Legendary', 'Total'], inplace=True, axis=1)



In [21]:

    
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]



In [22]:

    
y.head(2)









    Out[22]:





0    0
1    0
Name: Target, dtype: int64



In [23]:

    
X.head()









    Out[23]:







  
    
      
      HP
      Attack
      Defense
      Sp. Atk
      Sp. Def
      Speed
      Generation
      Type 1_Dark
      Type 1_Dragon
      Type 1_Electric
      Type 1_Fairy
      Type 1_Fighting
      Type 1_Fire
      Type 1_Flying
      Type 1_Ghost
      Type 1_Grass
      Type 1_Ground
      Type 1_Ice
      Type 1_Normal
      Type 1_Poison
      Type 1_Psychic
      Type 1_Rock
      Type 1_Steel
      Type 1_Water
      Type 1_nan
      Type 2_Dark
      Type 2_Dragon
      Type 2_Electric
      Type 2_Fairy
      Type 2_Fighting
      Type 2_Fire
      Type 2_Flying
      Type 2_Ghost
      Type 2_Grass
      Type 2_Ground
      Type 2_Ice
      Type 2_Normal
      Type 2_Poison
      Type 2_Psychic
      Type 2_Rock
      Type 2_Steel
      Type 2_Water
      Type 2_nan
    
  
  
    
      0
      45
      49
      49
      65
      65
      45
      1
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
    
    
      1
      60
      62
      63
      80
      80
      60
      1
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
    
    
      2
      80
      82
      83
      100
      100
      80
      1
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
    
    
      3
      80
      100
      123
      122
      120
      80
      1
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
    
    
      4
      39
      52
      43
      60
      50
      65
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1



In [24]:

    
X_train , X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)



In [25]:

    
X_train.shape









    Out[25]:





(640, 43)



In [26]:

    
X_test.shape









    Out[26]:





(160, 43)



In [27]:

    
y_train.shape









    Out[27]:





(640,)



In [28]:

    
y_test.shape









    Out[28]:





(160,)



In [29]:

    
clr = LogisticRegression()



In [30]:

    
clr.fit(X_train, y_train)









    Out[30]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [31]:

    
y_pred = clr.predict(X_test)



In [32]:

    
accuracy_score(y_test, y_pred)









    Out[32]:





0.9375



In [33]:

    
print(classification_report(y_test, y_pred))









    



             precision    recall  f1-score   support

          0       0.94      1.00      0.97       146
          1       1.00      0.29      0.44        14

avg / total       0.94      0.94      0.92       160



In [34]:

    
cm = confusion_matrix(y_test, y_pred)



In [35]:

    
cm









    Out[35]:





array([[146,   0],
       [ 10,   4]], dtype=int64)



In [36]:

    
probs = clr.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)



In [37]:

    
## SVC



In [38]:

    
svc = SVC(probability=True)



In [39]:

    
svc.fit(X_train, y_train)









    Out[39]:





SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)



In [49]:

    
svc_probs = svc.predict_proba(X_test)
svc_preds = svc_probs[:,1]
svc_fpr, svc_tpr, svc_threshold = metrics.roc_curve(y_test, svc_preds)
svc_roc_auc = metrics.auc(svc_fpr, svc_tpr)



In [50]:

    
svc_y_pred = svc.predict(X_test)



In [51]:

    
accuracy_score(y_test, svc_y_pred)









    Out[51]:





0.91249999999999998



In [52]:

    
tpr









    Out[52]:





array([ 0.07142857,  0.64285714,  0.64285714,  0.71428571,  0.71428571,
        0.78571429,  0.78571429,  0.85714286,  0.85714286,  0.92857143,
        0.92857143,  1.        ,  1.        ])



In [53]:

    
svc_tpr









    Out[53]:





array([ 0.        ,  0.        ,  0.14285714,  0.14285714,  0.21428571,
        0.21428571,  0.28571429,  0.28571429,  0.5       ,  0.5       ,
        0.78571429,  0.78571429,  0.78571429,  0.85714286,  0.85714286,
        0.92857143,  0.92857143,  1.        ,  1.        ])



In [54]:

    
cm









    Out[54]:





array([[146,   0],
       [ 10,   4]], dtype=int64)



In [55]:

    
svc_cm = confusion_matrix(y_test, svc_y_pred)



In [56]:

    
svc_cm









    Out[56]:





array([[146,   0],
       [ 14,   0]], dtype=int64)



In [57]:

    
plt.title('Receiver Operating Characteristic')
plt.plot(svc_fpr, svc_tpr, 'b', label = 'SVC AUC = %0.2f' % svc_roc_auc)
plt.plot(fpr, tpr, 'b', label = 'LR AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')









    Out[57]:





<matplotlib.text.Text at 0x178cd4d7748>

	#	Total	HP	Attack	Defense	Sp. Atk	Sp. Def	Speed	Generation
count	800.000000	800.00000	800.000000	800.000000	800.000000	800.000000	800.000000	800.000000	800.00000
mean	362.813750	435.10250	69.258750	79.001250	73.842500	72.820000	71.902500	68.277500	3.32375
std	208.343798	119.96304	25.534669	32.457366	31.183501	32.722294	27.828916	29.060474	1.66129
min	1.000000	180.00000	1.000000	5.000000	5.000000	10.000000	20.000000	5.000000	1.00000
25%	184.750000	330.00000	50.000000	55.000000	50.000000	49.750000	50.000000	45.000000	2.00000
50%	364.500000	450.00000	65.000000	75.000000	70.000000	65.000000	70.000000	65.000000	3.00000
75%	539.250000	515.00000	80.000000	100.000000	90.000000	95.000000	90.000000	90.000000	5.00000
max	721.000000	780.00000	255.000000	190.000000	230.000000	194.000000	230.000000	180.000000	6.00000

	#	Name	Type 1	Type 2	Total	HP	Attack	Defense	Sp. Atk	Sp. Def	Speed	Generation	Legendary
0	1	Bulbasaur	Grass	Poison	318	45	49	49	65	65	45	1	False
1	2	Ivysaur	Grass	Poison	405	60	62	63	80	80	60	1	False
2	3	Venusaur	Grass	Poison	525	80	82	83	100	100	80	1	False
3	3	VenusaurMega Venusaur	Grass	Poison	625	80	100	123	122	120	80	1	False
4	4	Charmander	Fire	NaN	309	39	52	43	60	50	65	1	False

Legendary	False	True
Type 1
Bug	69	0
Dark	29	2
Dragon	20	12
Electric	40	4
Fairy	16	1
Fighting	27	0
Fire	47	5
Flying	2	2
Ghost	30	2
Grass	67	3
Ground	28	4
Ice	22	2
Normal	96	2
Poison	28	0
Psychic	43	14
Rock	40	4
Steel	23	4
Water	108	4

	HP	Attack	Defense	Sp. Atk	Sp. Def	Speed	Generation	Type 1_Fire	Type 1_Grass	Type 2_Poison	Type 2_nan
0	45	49	49	65	65	45	1	0	1	1	0
1	60	62	63	80	80	60	1	0	1	1	0
2	80	82	83	100	100	80	1	0	1	1	0
3	80	100	123	122	120	80	1	0	1	1	0
4	39	52	43	60	50	65	1	1	0	0	1

	HP	Attack	Defense	Sp. Atk	Sp. Def	Speed	Generation	Type 1_Fire	Type 1_Grass	Type 2_Poison	Type 2_nan
0	45	49	49	65	65	45	1	0	1	1	0
1	60	62	63	80	80	60	1	0	1	1	0
2	80	82	83	100	100	80	1	0	1	1	0
3	80	100	123	122	120	80	1	0	1	1	0
4	39	52	43	60	50	65	1	1	0	0	1

	HP	Attack	Defense	Sp. Atk	Sp. Def	Speed	Generation	Type 1_Fire	Type 1_Grass	Type 2_Poison	Type 2_nan
0	45	49	49	65	65	45	1	0	1	1	0
1	60	62	63	80	80	60	1	0	1	1	0
2	80	82	83	100	100	80	1	0	1	1	0
3	80	100	123	122	120	80	1	0	1	1	0
4	39	52	43	60	50	65	1	1	0	0	1