In [1]:
using DataFrames

# load the table
df = readtable(joinpath(Pkg.dir("Plots"), "examples", "meetup", "nba_2013.csv"))
size(df)


Out[1]:
(481,31)

In [2]:
# show the first row
df[1,:]


Out[2]:
playerposagebref_team_idggsmpfgfgafg_x3px3pax3p_x2px2pax2p_efg_ftftaft_orbdrbtrbaststlblktovpfptsseasonseason_end
1Quincy AcySF23TOT630847661410.4684150.266666666666667621260.4920634920634920.48235530.6672144216282326301221712013-20142013

In [3]:
# get the header names
nms = names(df)


Out[3]:
31-element Array{Symbol,1}:
 :player      
 :pos         
 :age         
 :bref_team_id
 :g           
 :gs          
 :mp          
 :fg          
 :fga         
 :fg_         
 :x3p         
 :x3pa        
 :x3p_        
 ⋮            
 :ft_         
 :orb         
 :drb         
 :trb         
 :ast         
 :stl         
 :blk         
 :tov         
 :pf          
 :pts         
 :season      
 :season_end  

In [16]:
# many algorithms with online (streaming) implementations
using OnlineStats

# create a matrix from the columns of the dataframe
cnames = [:x3p,:stl,:trb,:age,:pts]
M = Matrix{Float64}(df[cnames])

# compute the covariance (uses EqualWeighting by default... should match classic covariance calculations)
C = CovarianceMatrix(M)


Out[16]:
OnlineStat: OnlineStats.CovarianceMatrix{OnlineStats.EqualWeighting}
 *        μ:  [39.61330561330561,39.28066528066528,218.62785862785864,26.50935550935551,516.5821205821206]
 *        Σ:  5x5 Array{Float64,2}:
  2586.3      1047.38      1863.08     10.8057   15678.2      
  1047.38     1209.9       4028.25     -4.13492  13048.6      
  1863.08     4028.25     40142.7     -11.3142   68420.4      
    10.8057     -4.13492    -11.3142   17.6254     -23.5221   
 15678.2     13048.6      68420.4     -23.5221       2.21297e5
 *     nobs:  481

In [17]:
# choose the package Gadfly as the backend for Plots
using Plots
gadfly()

# do a correlation scatter plot
corrplot(M, cor(C), labels = cnames)


[Plots.jl] Switched to backend: gadfly
Out[17]:


In [18]:
# plot players by cluster
using Clustering
kmmodel = kmeans(M', 5)


Out[18]:
Clustering.KmeansResult{Float64}(5x5 Array{Float64,2}:
  128.583    9.04124   81.55     32.6316   36.8947
   92.0556  10.6495    69.075    65.5526   41.3308
  400.528   57.5825   289.275   643.316   240.466 
   26.0     26.3814    26.9      25.7895   26.8045
 1567.75    99.5412   883.688  1092.58    454.985 ,[2,5,5,1,5,2,1,5,3,5  …  5,2,5,2,5,3,3,1,5,5],[30391.9,45882.0,14602.5,78974.7,19160.4,5248.01,150702.0,29600.3,42014.3,5721.47  …  10229.1,1858.62,6363.53,4165.91,16355.7,8667.77,86349.9,35528.0,15286.6,6772.79],[36,194,80,38,133],[36.0,194.0,80.0,38.0,133.0],1.5403008988576995e7,6,true)

In [26]:
# reduce to 2 dimensions
using MultivariateStats
pcamodel = pca(C, maxoutdim=2)
xy = transform(pcamodel, M')'
x, y = xy[:,1], xy[:,2];

# plot a scatter plot, grouping by the clusters obtained in kmeans
scatter(x, y, group = assignments(kmmodel), marker=:auto)


Out[26]:

In [ ]: