Reduce battle data to two dimensions and label the battle outcome (win or lose)
In [383]:
# Imports
import numpy as np
import matplotlib as mpl
import matplotlib.cm as cm
import pandas as pd
import sklearn
# Other Imports
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
# Inline Plotting
import IPython.html.widgets
%matplotlib inline
In [384]:
# Read the battles CSV into a pandas DataFrame and print its column summary
battles_csv = 'data/battles.csv'
df = pd.read_csv(battles_csv)
df.info()
In [385]:
# Columns that are not used as features in this analysis
unused_columns = [
    'name', 'year', 'battle_number',
    'attacker_2', 'attacker_3', 'attacker_4',
    'defender_2', 'defender_3', 'defender_4',
    'note',
]
df = df.drop(unused_columns, axis=1)

# Keep only battles where both army sizes and the outcome are recorded
required_columns = ['defender_size', 'attacker_size', 'attacker_outcome']
df = df.dropna(subset=required_columns, axis=0)
In [386]:
# Reorder columns so the outcome is last.
# Columns are selected by name rather than with the `.ix` indexer (removed in
# pandas 1.0) and hard-coded positions, so the result does not depend on where
# `attacker_outcome` currently sits and re-running the cell is harmless.
outcome_column = 'attacker_outcome'
feature_columns = [c for c in df.columns if c != outcome_column]
df = df[feature_columns + [outcome_column]]
In [387]:
# Label-encode every text (object-dtype) column, keeping each fitted encoder
# so the integer codes can be mapped back to the original labels later
column_encoders = {}
text_columns = [name for name in df.columns if df[name].dtype == object]
for name in text_columns:
    encoder = preprocessing.LabelEncoder()
    df[name] = encoder.fit_transform(list(df[name]))
    column_encoders[name] = encoder
df.info()
In [388]:
# Split data into X (features) and Y (outcome) arrays; the outcome is taken
# from the last column and every other column is a feature.
# `.iloc` is the positional indexer; the `.ix` indexer used previously was
# removed in pandas 1.0.
y = df.iloc[:, -1].values
x = df.iloc[:, :-1].values
# Standardize features by removing the mean and scaling to unit variance
standard_scaler = StandardScaler()
x_std = standard_scaler.fit_transform(x)
In [389]:
# t-distributed Stochastic Neighbor Embedding (t-SNE)
# Compress the standardized feature matrix to two dimensions.
# scikit-learn requires perplexity < n_samples; the default of 30 is kept
# whenever there are enough rows and is capped at n_samples - 1 otherwise.
n_samples = x_std.shape[0]
tsne_perplexity = min(30.0, float(n_samples - 1))
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=0)
# NOTE: despite its name, x_1d holds two columns (one per t-SNE component).
# Every coordinate is multiplied by 10000, which rescales both axes uniformly
# and leaves the relative layout of the points unchanged.
x_1d = tsne.fit_transform(x_std)*10000
In [390]:
# Scatter plot of the 2-D embedding, one series per outcome class
plt.figure()
# Marker and colour for each of the 2 outcomes, indexed by the class's
# position in np.unique(y)
markers = ('s', 'd')
color_map = {0: 'red', 1: 'blue'}
outcome_encoder = column_encoders['attacker_outcome']
# Fill scatter plot
for idx, cl in enumerate(np.unique(y)):
    in_class = (y == cl)
    # inverse_transform expects a 1-D array-like of codes, so wrap the single
    # code in a list and unpack the single decoded label.
    outcome_label = outcome_encoder.inverse_transform([cl])[0]
    plt.scatter(x=x_1d[in_class, 0], y=x_1d[in_class, 1],
                c=color_map[idx], marker=markers[idx],
                label=outcome_label)
# Labels and legend
plt.xlabel('X in t-SNE')
plt.ylabel('Y in t-SNE')
plt.legend(loc='upper left', frameon=True)
plt.title('t-SNE visualization of data')
plt.show()
In this two-dimensional view, battles with the same outcome appear to have similar parameters. However, our sample size is small and the text columns were label-encoded rather than expanded into dummy variables, so confidence in this observation is low.
In [ ]: