In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('max.columns', None)
from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from sklearn import metrics
from sklearn.metrics import roc_curve, f1_score, accuracy_score, precision_recall_curve, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
%matplotlib inline
In [2]:
df = pd.read_csv('Pokemon.csv', low_memory=False)
In [3]:
# Column dtypes, non-null counts, and memory usage.
df.info()
In [4]:
# First five rows as a quick sanity check of the parsed data.
df.head()
Out[4]:
In [5]:
print(str(len(df[df['Legendary'] == True]) / len(df) * 100) + '%')
In [6]:
# Distribution of primary types. Seaborn ≥0.12 deprecates passing a Series
# positionally; the column is named via x= with data= instead.
plt.title('Count Plot')
plt.xticks(rotation=45)
sns.countplot(x='Type 1', data=df)
Out[6]:
In [7]:
# Distribution of secondary types (same deprecated-positional fix as above
# for the seaborn ≥0.12 keyword-only API).
plt.title('Count Plot')
plt.xticks(rotation=45)
sns.countplot(x='Type 2', data=df)
Out[7]:
In [8]:
sns.distplot(df['Total'])
Out[8]:
In [9]:
# How many Pokémon each generation contributes.
df['Generation'].value_counts()
Out[9]:
In [10]:
# Pairwise scatter plots of the six base stats to eyeball relationships.
sns.pairplot(df[['HP','Attack','Defense','Sp. Atk','Sp. Def','Speed']])
Out[10]:
In [11]:
corr = df.corr()
In [12]:
# Correlation heatmap; annot=True prints the coefficient inside each cell.
plt.figure(figsize=(10,8))
sns.heatmap(corr, cmap='coolwarm', annot=True)
Out[12]:
In [13]:
# Summary statistics (count/mean/std/quartiles) for the numeric columns.
df.describe()
Out[13]:
In [14]:
df[df['Name'].duplicated()] # no duplicates — every Pokémon name is unique
Out[14]:
In [15]:
# Legendary counts broken down by primary type.
pd.crosstab(df['Type 1'] , df['Legendary'])
Out[15]:
In [16]:
# Cardinality of every column. The loop body was flush-left in the exported
# source, which is a SyntaxError — it must be indented under the for.
# nunique(dropna=False) matches len(unique()), which counts NaN as a value
# (relevant for 'Type 2', which has missing entries).
for col in df.columns:
    print(col, df[col].nunique(dropna=False))
In [17]:
df['Legendary'] = df['Legendary'].apply(lambda x: 1 if x == True else 0)
In [18]:
# Drop the first two columns (presumably the '#' id and 'Name' — neither
# carries predictive signal) and keep the rest for modelling.
dataset = df.iloc[:, 2:]
In [19]:
dataset.head()
Out[19]:
In [20]:
# One-hot encode the categorical columns. dummy_na=True adds an indicator
# for missing values (Type 2 has NaNs); drop_first=True drops one level per
# column to avoid the dummy-variable trap.
dataset = pd.get_dummies(dataset, dummy_na=True,drop_first=True)
# Copy the label to the end as 'Target', then drop the original label and
# 'Total' (presumably redundant with the individual stat columns — verify).
dataset['Target'] = dataset['Legendary']
# Reassignment instead of inplace=True: inplace mutation is a hidden-state
# hazard on notebook re-runs and has no performance benefit.
dataset = dataset.drop(columns=['Legendary', 'Total'])
In [21]:
# Select features/label by name rather than by position — iloc[:, :-1]
# silently breaks if the column order ever changes. 'Target' was appended
# last, so this yields the identical X and y.
X = dataset.drop(columns=['Target'])
y = dataset['Target']
In [22]:
# Peek at the label series.
y.head(2)
Out[22]:
In [23]:
# Peek at the encoded feature matrix.
X.head()
In [24]:
X_train , X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
In [25]:
# 80% of the rows land in the training features.
X_train.shape
Out[25]:
In [26]:
# 20% of the rows are held out for testing.
X_test.shape
Out[26]:
In [27]:
y_train.shape
Out[27]:
In [28]:
y_test.shape
In [29]:
clr = LogisticRegression()
In [30]:
# Fit the logistic regression on the training split.
clr.fit(X_train, y_train)
Out[30]:
In [31]:
# Hard class predictions (0/1) on the held-out set.
y_pred = clr.predict(X_test)
In [32]:
# NOTE(review): accuracy is a weak metric here — the classes are heavily
# imbalanced, so always predicting 0 already scores very highly.
accuracy_score(y_test, y_pred)
Out[32]:
In [33]:
# Per-class precision/recall/F1 — more informative than raw accuracy.
print(classification_report(y_test, y_pred))
In [34]:
# Confusion matrix: rows = true classes, columns = predicted classes.
cm = confusion_matrix(y_test, y_pred)
In [35]:
cm
Out[35]:
In [36]:
# Probability of the positive class (legendary) for each test row.
probs = clr.predict_proba(X_test)
preds = probs[:,1]
# roc_curve is already imported by name at the top of the notebook; calling
# metrics.roc_curve for the same function was inconsistent.
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)
In [37]:
## SVC
In [38]:
# probability=True enables predict_proba, needed for the ROC curve below.
# NOTE(review): SVC is distance-based, and the features are unscaled
# (raw stats next to 0/1 dummies) — a StandardScaler pipeline would likely
# help; confirm before trusting the comparison.
svc = SVC(probability=True)
In [39]:
svc.fit(X_train, y_train)
Out[39]:
In [49]:
# SVC probability of the positive class and its ROC curve.
svc_probs = svc.predict_proba(X_test)
svc_preds = svc_probs[:,1]
# roc_curve is imported by name at the top — use it directly for consistency.
svc_fpr, svc_tpr, svc_threshold = roc_curve(y_test, svc_preds)
svc_roc_auc = metrics.auc(svc_fpr, svc_tpr)
In [50]:
# Hard SVC predictions on the held-out set.
svc_y_pred = svc.predict(X_test)
In [51]:
# NOTE(review): accuracy is inflated by the class imbalance — most rows are
# non-legendary, so compare precision/recall as well.
accuracy_score(y_test, svc_y_pred)
Out[51]:
In [52]:
# True-positive rates along the logistic-regression ROC curve.
tpr
Out[52]:
In [53]:
# True-positive rates along the SVC ROC curve.
svc_tpr
Out[53]:
In [54]:
# Logistic-regression confusion matrix, shown again for comparison.
cm
In [55]:
# SVC confusion matrix (rows = true classes, columns = predicted classes).
svc_cm = confusion_matrix(y_test, svc_y_pred)
In [56]:
svc_cm
Out[56]:
In [57]:
# Overlay the two ROC curves. Bug fix: both curves were drawn with the same
# colour code 'b', making them indistinguishable despite the legend — the
# logistic-regression curve is now green.
plt.title('Receiver Operating Characteristic')
plt.plot(svc_fpr, svc_tpr, 'b', label = 'SVC AUC = %0.2f' % svc_roc_auc)
plt.plot(fpr, tpr, 'g', label = 'LR AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal = no-skill classifier reference line.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
Out[57]: