In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
# x/y coordinates of 12 synthetic sample points; every clustering step
# below (distance matrix, dendrogram, heatmaps) is built from these.
x = [1.88505201,0.75858685,0.53086046,2.10121118,2.90456146,2.82199243,1.21688824,2.08582494,2.80032271,2.8871096,1.89067363,1.05548585]
y = [1.83256566,0.84474922,0.9779429,1.81776092,0.91189043,0.90186282,1.19189881,1.8977981,1.09191789,1.02681764,2.48316704,1.01289176]
In [2]:
import matplotlib.pyplot as plt

# Scatter the sample points and tag each marker with its index so the
# dendrogram leaves can later be matched back to this plot.
# NOTE: the loop body had lost its indentation in the flattened source;
# restored here. Drawing via `ax` (not the pyplot state machine) keeps
# the figure explicit.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
ax.scatter(x, y)
numberOfPoints = len(x)
labels = []
for pt in range(numberOfPoints):
    # small offset so the label does not sit on top of the marker
    ax.annotate(pt, xy=(x[pt] + 0.05, y[pt] + 0.05))
    labels.append('Point ' + str(pt))
In [3]:
# In Python 3, `print` is a function and zip() returns a lazy iterator,
# so the original Python-2 `print zipped` is a syntax error and would in
# any case only show `<zip object ...>`. Materialize the pairs first.
zipped = list(zip(x, y))
print(zipped)
In [4]:
# Assemble the point coordinates into a labelled DataFrame
# (one row per point, indexed 'Point 0' .. 'Point 11').
coords = list(zip(x, y))
df = pd.DataFrame(coords, columns=['x', 'y'], index=labels)
df
Out[4]:
In [5]:
# compute the pairwise Euclidean distance matrix between all points
from scipy.spatial.distance import pdist, squareform

# pdist returns the condensed (1-D, upper-triangle) distances;
# squareform expands them to the full symmetric matrix for display.
condensed = pdist(df, metric='euclidean')
distMatrix = pd.DataFrame(squareform(condensed))
distMatrix
Out[5]:
In [6]:
# perform clustering and plot the dendrogram
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform

# linkage() interprets a 2-D input as raw observation vectors, so passing
# the square distance matrix would cluster the *rows of the distance
# matrix* (scipy also emits a ClusterWarning about this). Condense the
# square matrix back to the 1-D form linkage actually expects.
R = dendrogram(linkage(squareform(distMatrix), method='average'))
In [7]:
# Heatmap of the raw (not yet cluster-ordered) coordinate table.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
heat = ax.matshow(df, interpolation='nearest', cmap='hot_r')
fig.colorbar(heat)
# nudge the tick positions so row labels line up with the cell centres
tick_positions = np.arange(-.55, 12, 1)
plt.yticks(tick_positions)
# matshow reserves a leading tick, hence the '' placeholder label
ax.set_xticklabels([''] + list(df.columns))
ax.set_yticklabels([''] + list(df.index))
plt.show()
In [8]:
# reorder rows with respect to the clustering
# `.ix` was removed in pandas 1.0; R['leaves'] holds integer row
# positions from the dendrogram, so positional `.iloc` is the correct
# replacement.
df_rowclust = df.iloc[R['leaves']]
# plot the heatmap with rows in dendrogram-leaf order
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
cax = ax.matshow(df_rowclust, interpolation='nearest', cmap='hot_r')
ticks = np.arange(-.55, 12, 1)
plt.yticks(ticks)
fig.colorbar(cax)
# matshow reserves a leading tick, hence the '' placeholder label
ax.set_xticklabels([''] + list(df_rowclust.columns))
ax.set_yticklabels([''] + list(df_rowclust.index))
plt.show()
In [9]:
# plot the dendrogram alongside the cluster-ordered heatmap
from scipy.spatial.distance import squareform

fig = plt.figure(figsize=(8, 8))
axd = fig.add_axes([0.1, 0.125, 0.2, 0.6])
# linkage() expects condensed distances; the square matrix would be
# treated as raw observations, so condense it first (same fix as the
# earlier dendrogram cell).
row_dendr = dendrogram(linkage(squareform(distMatrix), method='average'),
                       orientation='right')
axd.set_xticks([])
axd.set_yticks([])
# Remove axes spines from dendrogram
# (loop body had lost its indentation in the flattened source)
for spine in axd.spines.values():
    spine.set_visible(False)
# Plot heatmap
axm = fig.add_axes([0.005, 0.1, 0.6, 0.6])  # x-pos, y-pos, width, height
cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r', origin='lower')
fig.colorbar(cax)
ticks = np.arange(-.55, 12, 1)
plt.yticks(ticks)
axm.set_xticklabels([''] + list(df_rowclust.columns))
axm.set_yticklabels([''] + list(df_rowclust.index))
plt.show()
In [10]:
import seaborn as sns
# clustermap clusters BOTH rows and columns (here with average linkage)
# and renders the reordered heatmap plus marginal dendrograms in one call
# -- the one-liner equivalent of the manual figure built above.
sns.clustermap(df,method='average')
Out[10]:
In [11]:
import pandas as pd
data = pd.read_csv('BasketBallStats.csv', index_col=0)
# player names in the index carry stray whitespace -- trim it
data.index = data.index.str.strip()
# label source:https://en.wikipedia.org/wiki/Basketball_statistics
labels = [
    'Games', 'Minutes', 'Points',
    'Field goals made', 'Field goal attempts', 'Field goal percentage',
    'Free throws made', 'Free throws attempts', 'Free throws percentage',
    'Three-pointers made', 'Three-point attempt', 'Three-point percentage',
    'Offensive rebounds', 'Defensive rebounds', 'Total rebounds',
    'Assists', 'Steals', 'Blocks', 'Turnover', 'Personal foul',
]
data.columns = labels
It is important to standardize the data when features are on very different scales (e.g., the values of "Games" and "Minutes" are much larger than those of the percentage columns).
One common way to standardize is to subtract each feature's mean and divide by its standard deviation; the resulting value is called a z-score.
In [12]:
# z-score standardization: centre each column on its mean and scale to
# unit standard deviation. The original `data_normalized = data` line
# was a misleading no-op: it aliased (did not copy) the frame and was
# immediately rebound by the next assignment, so it is dropped.
data_normalized = (data - data.mean()) / data.std()
In [13]:
# Your code goes here
In [ ]:
# Your code goes here