In [6]:
import sqlite3
import numpy as np
import pandas as pd
import scipy.stats as stats
import scipy as sci
import matplotlib.pyplot as plt
import sklearn as skl
import statsmodels.api as sma
import statsmodels as sm
import statsmodels.formula.api as smf
from sklearn import decomposition
import statsmodels.stats.multicomp as comp
import seaborn as sns
%matplotlib inline
In [2]:
# Read sqlite query results into a pandas DataFrame.
con = sqlite3.connect("./data/alola_db.db")
try:
    # coerce_float=False: leave non-string, non-numeric values as returned by the driver.
    mons = pd.read_sql_query("SELECT * from POKEDEX", con, coerce_float=False)
finally:
    # Always release the database handle, even when the query raises.
    con.close()
# Column names exactly as returned by the query.
columnLabels = list(mons.columns)
In [ ]:
# Keep only the first row seen for each DEXID (the surviving rows keep their
# original index labels, so the index may contain gaps afterwards).
mons = mons.drop_duplicates(subset="DEXID")
In [3]:
# Per-stat average; presumably STAT_TOTAL is the sum of the six base stats
# (HP, ATTACK, DEFENSE, SPECIAL_ATTACK, SPECIAL_DEFENSE, SPEED) -- TODO confirm.
mons["AVERAGE_STAT"] = mons["STAT_TOTAL"] / 6

# Right-inclusive DEXID bin edges:
# (0, 151] -> gen 1, (151, 251] -> gen 2, (251, 386] -> gen 3, (386, 493] -> gen 4,
# (493, 649] -> gen 5, (649, 721] -> gen 6, (721, 805] -> gen 7.
GEN_DEX_EDGES = [0, 151, 251, 386, 493, 649, 721, 805]

# Vectorized replacement for the row-by-row iterrows() loop.
# pd.cut(labels=False) yields the 0-based bin position and NaN for any DEXID
# outside every bin; shift to 1-based and map the misses to generation 0.
gen_codes = pd.cut(pd.to_numeric(mons["DEXID"]), bins=GEN_DEX_EDGES, labels=False)
gens = (gen_codes + 1).fillna(0).astype("int64")

mons["GEN"] = gens
# Written with the DataFrame index as the first CSV column (to_csv default).
mons.to_csv("./data/pokemon_preUSUM_data.csv")
In [4]:
# One sub-frame of `mons` per generation, keyed by generation number 1-7.
gen = {number: mons.loc[mons["GEN"] == number] for number in range(1, 8)}
In [7]:
# Overlay one kernel-density curve of STAT_TOTAL per generation.
plt.figure(100)
colors = sns.color_palette("colorblind", 7)
for i in range(1, 8):
    # kdeplot draws the KDE-only curve that the deprecated
    # distplot(hist=False, kde=True) used to produce; NaNs are dropped first,
    # as distplot did internally.
    sns.kdeplot(
        mons.loc[mons["GEN"] == i, "STAT_TOTAL"].dropna(),
        color=colors[i - 1],
        label=f"Gen {i}",
    )
plt.title("STAT_TOTAL density by generation")
plt.legend()
plt.show()
In [8]:
# AVERAGE_STAT column of each generation's sub-frame, keyed by generation number.
stat_averages_by_gen = {i: gen[i].AVERAGE_STAT for i in range(1, 8)}
testable_data = list(stat_averages_by_gen.values())
# One 1-D array per generation, kept as a plain list. The generations need not
# contain the same number of rows, and np.array() on ragged nested lists raises
# ValueError on NumPy >= 1.24 (older versions built an object array and warned).
# The list still unpacks into stats.kruskal(*data).
data = [np.asarray(samples) for samples in testable_data]
In [9]:
# Mean AVERAGE_STAT of each generation, keyed by generation number.
averages = {
    generation: samples.mean()
    for generation, samples in stat_averages_by_gen.items()
}
averages
Out[9]:
In [10]:
# Kruskal-Wallis H-test across the per-generation AVERAGE_STAT samples held in `data`.
stats.kruskal(*data)
Out[10]:
In [12]:
# Convert the frame to a NumPy record array so columns can be read by field name.
recarray = mons.to_records()
In [13]:
# Tukey HSD pairwise comparison of AVERAGE_STAT, grouped by GEN.
# NOTE(review): this uses every row of `mons`, so any rows assigned GEN == 0
# form their own group here, unlike the Kruskal-Wallis input which only
# covers generations 1-7 -- confirm this is intended.
test = comp.pairwise_tukeyhsd(recarray["AVERAGE_STAT"], recarray["GEN"])
In [14]:
# Display the pairwise comparison table.
test.summary()
Out[14]:
In [15]:
# Seed NumPy's global RNG so later stochastic steps are repeatable on a fresh run.
np.random.seed(525_600)

# Columns fed to PCA / k-means.
# NOTE(review): GEN is included as a feature alongside the six base stats --
# confirm that is intended rather than using GEN only as a label.
feature_columns = [
    'HP',
    'ATTACK',
    'DEFENSE',
    'SPECIAL_ATTACK',
    'SPECIAL_DEFENSE',
    'SPEED',
    'GEN',
]
stats_gens = mons[feature_columns]
# Plain 2-D ndarray copy of the selected columns, one row per row of `mons`.
X = np.c_[stats_gens]
In [16]:
# Fit a PCA with default settings on the feature matrix X.
pca = decomposition.PCA()
pca.fit(X)
Out[16]:
In [17]:
# Variance explained by each fitted principal component.
pca.explained_variance_
Out[17]:
In [18]:
# Restrict the existing estimator to three components; this only takes effect on the next fit.
pca.n_components = 3
In [19]:
# Refit with the updated n_components and project X onto the retained components.
X_reduced = pca.fit_transform(X)
In [20]:
# Sanity check on the projection; expected to be (number of rows in X, 3).
X_reduced.shape
Out[20]:
In [21]:
# Show the estimator's current parameter settings.
pca.get_params()
Out[21]:
In [22]:
# K-means estimator with six clusters.
# NOTE(review): no random_state is passed, so the initialisation presumably
# depends on the NumPy global seed set via np.random.seed earlier -- confirm.
from sklearn import cluster
k_means = cluster.KMeans(n_clusters = 6)
In [23]:
# Cluster the full feature matrix X (not the PCA-reduced X_reduced).
k_means.fit(X)
Out[23]:
In [24]:
# k_means.labels_ is positional: entry k belongs to row k of X, i.e. the k-th
# row of `mons`. Build the Series on mons.index so the column assignment
# (which aligns on index labels) puts each label on its own row; a bare
# pd.Series(labels_) carries a fresh 0..n-1 index and would misalign / leave
# NaNs whenever mons.index has gaps (e.g. after drop_duplicates).
mons["KMEANS_LABEL"] = pd.Series(k_means.labels_, index=mons.index)
In [25]:
# Subset of columns used by the plots below.
plot_columns = ["GEN", "STAT_TOTAL", "KMEANS_LABEL"]
plotData = mons.loc[:, plot_columns]
In [31]:
# Overlay a density-normalised histogram plus KDE of STAT_TOTAL per generation.
colors = sns.color_palette("colorblind", 7)
for i in range(1, 8):
    # histplot with these arguments is the documented replacement for the
    # deprecated distplot default (histogram + KDE on a density scale);
    # NaNs are dropped first, as distplot did internally.
    sns.histplot(
        plotData.loc[plotData["GEN"] == i, "STAT_TOTAL"].dropna(),
        kde=True,
        stat="density",
        kde_kws=dict(cut=3),
        color=colors[i - 1],
    )
In [32]:
# Distribution of STAT_TOTAL within each k-means cluster.
plt.figure(925)
sns.boxplot(
    data=plotData,
    x="KMEANS_LABEL",
    y="STAT_TOTAL",
)
plt.show()
In [28]:
# Pairwise scatter matrix of the plotting columns, coloured by generation.
# pairplot is a figure-level function that creates its own figure and axes
# grid, so no plt.figure() call precedes it (one would only leave an empty
# extra figure behind).
sns.pairplot(plotData, kind="scatter", hue="GEN", palette=colors)
plt.show()
In [ ]:
# Persist the plotting subset (GEN, STAT_TOTAL, KMEANS_LABEL) to CSV; the index is written as the first column.
plotData.to_csv("./data/kmeans.csv")
In [ ]:
In [ ]: