In [1]:
using DataFrames
# Read the NBA 2013 sample CSV that sits under the installed Plots package's
# examples/meetup directory into a DataFrame.
# NOTE(review): `readtable` and `Pkg.dir` come from older DataFrames/Julia
# releases — confirm the targeted versions before upgrading this notebook.
df = joinpath(Pkg.dir("Plots"), "examples", "meetup", "nba_2013.csv") |> readtable
# dimensions of the loaded table, shown as the cell output
size(df)
Out[1]:
In [2]:
# Select row 1 across all columns; as the last expression in the cell,
# the notebook displays the result.
df[1,:]
Out[2]:
In [3]:
# Collect the table's column (header) names into `nms`.
nms = names(df)
Out[3]:
In [16]:
# OnlineStats supplies online (streaming) implementations of many algorithms
using OnlineStats
# the dataframe columns that feed the analysis
cnames = [:x3p, :stl, :trb, :age, :pts]
# copy those columns into a dense Float64 matrix (one matrix column per name)
M = Matrix{Float64}(df[:, cnames])
# fit the covariance of the columns of M; per the original author's note this
# uses EqualWeighting by default, so it should match a classic covariance calculation
C = CovarianceMatrix(M)
Out[16]:
In [17]:
using Plots
# select Gadfly as the plotting backend used by Plots
gadfly()
# correlation scatter plot of the columns of M, driven by the correlation
# matrix derived from the fitted covariance C and labelled with the column names
corrplot(M, cor(C); labels = cnames)
Out[17]:
In [18]:
# Fit a k-means model with 5 clusters; nothing is plotted in this cell — the
# resulting cluster assignments are only used later as the scatter-plot grouping.
using Clustering
# M is transposed before the call — presumably because `kmeans` treats each
# column as one observation; confirm against the Clustering documentation.
kmmodel = kmeans(M', 5)
Out[18]:
In [26]:
using MultivariateStats
# build a PCA model from the fitted covariance, keeping at most 2 output dimensions
pcamodel = pca(C; maxoutdim = 2)
# M is transposed on the way in and the projection transposed back, so `xy`
# is laid out with the principal components as its columns
xy = transform(pcamodel, M')'
# first and second principal-component coordinates
x = xy[:, 1]
y = xy[:, 2];
# scatter the projected points, grouped by the k-means cluster assignments
scatter(x, y; group = assignments(kmmodel), marker = :auto)
Out[26]:
In [ ]: