The prostate cancer data set is the main worked example for regression techniques in the text The Elements of Statistical Learning (ESL). My goal here is to reproduce a few of its results and to try out the different regression techniques.
The data source is http://statweb.stanford.edu/~tibs/ElemStatLearn/data.html
The ESL text is located at http://statweb.stanford.edu/~tibs/ElemStatLearn/printings/ESLII_print10.pdf
In [ ]:
import pandas as pd
import numpy as np
from pandas.tools.plotting import scatter_matrix
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
For prettier graphs (according to my aesthetic preferences), I do...
In [ ]:
# Plot styling: start from the ggplot style sheet, then set the default font
# family to a monospace typeface.
plt.style.use('ggplot')  # style sheets need matplotlib v1.4
mpl.rc('font', family='Ubuntu Mono')
# IPython magic (not plain Python): draw figures inline in the notebook.
%matplotlib inline
In [ ]:
# Load the tab-separated prostate data directly from the ESL web site.
# pandas fetches the URL itself, so no separate urllib2 download step is used;
# the first column of the file becomes the row index.
dataURL = 'http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/prostate.data'
df = pd.read_csv(
    dataURL,
    index_col=0,
    sep='\t',
)
In [ ]:
# Append a constant column of ones named 'Intercept'.
df['Intercept'] = 1
# Preview the first five rows (the default for head) to check the result.
df.head(n=5)
Next we make scatter plot matrices, which are a nice and informative data visualization. This can be implemented in various ways:
1. scatter_matrix method from pandas.tools.plotting
2. pairplot method from seaborn

The first step is to prepare a pandas DataFrame containing the data features we want to plot, in the order we want them plotted.
In [ ]:
# Columns shown in the scatter-plot matrices, in plotting order.
# Smaller selections kept around for quick experiments:
#   ['lpsa','lcavol','lweight','age','lbph']
#   ['age','lbph']
featuresToPlot = [
    'lpsa', 'lcavol', 'lweight', 'age', 'lbph',
    'svi', 'lcp', 'gleason', 'pgg45',
]
dfToPlot = df.loc[:, featuresToPlot]
Here we implement #1, using scatter_matrix from pandas.tools.plotting.
In [ ]:
# Option #1: pandas scatter-plot matrix, with a kernel density estimate on the
# diagonal.  Uses the scatter_matrix name imported at the top of the notebook.
scatter_matrix(
    dfToPlot,
    alpha=0.7,
    figsize=(9, 9),
    diagonal='kde',
)
# Write the figure to disk before displaying it.
plt.savefig('prostate_scatterplot_matrix_pandas.svg')
plt.show()
Next we implement #2, using seaborn's pairplot method
In [ ]:
# Option #2: seaborn pair plot of the same columns.
sns.set()  # apply seaborn's default aesthetic settings
sns.pairplot(data=dfToPlot, size=1.5)
# Write the figure to disk before displaying it.
plt.savefig('prostate_scatterplot_matrix_seaborn.svg')
plt.show()
In [ ]:
# Custom pair grid over every earlier feature except 'svi'.
# Other selections kept around for quick experiments:
#   ['lpsa','lcavol','lweight','age','lbph','svi','lcp','gleason','pgg45']
#   ['lbph','svi']
featuresToPlot = [
    'lpsa', 'lcavol', 'lweight', 'age',
    'lbph', 'lcp', 'gleason', 'pgg45',
]
dfToPlot = df.loc[:, featuresToPlot]
sns.set()
# Grid layout: scatter plots above the diagonal, kdeplot below it, and a
# thicker-lined kdeplot on the diagonal (each diagonal panel has its own y axis).
g = sns.PairGrid(data=dfToPlot, size=7, diag_sharey=False)
g.map_upper(plt.scatter)
g.map_lower(sns.kdeplot)
g.map_diag(sns.kdeplot, lw=3)
# NOTE(review): the lbph/svi PairGrid cell writes to this same file name.
plt.savefig('test5.svg')
plt.show()
In [ ]:
# Custom pair grid restricted to the two columns 'lbph' and 'svi'.
# Other selections kept around for quick experiments:
#   ['lpsa','lcavol','lweight','age','lbph','svi','lcp','gleason','pgg45']
#   ['lpsa','lcavol','lweight','age','lbph','lcp','gleason','pgg45']
featuresToPlot = ['lbph', 'svi']
dfToPlot = df.loc[:, featuresToPlot]
sns.set()
# Grid layout: scatter plots above the diagonal, kdeplot below it, and a
# thicker-lined kdeplot on the diagonal (each diagonal panel has its own y axis).
g = sns.PairGrid(data=dfToPlot, size=7, diag_sharey=False)
g.map_upper(plt.scatter)
g.map_lower(sns.kdeplot)
g.map_diag(sns.kdeplot, lw=3)
# NOTE(review): the larger PairGrid cell writes to this same file name.
plt.savefig('test5.svg')
plt.show()
In [ ]:
# Sanity check on a hand-made 2x2 symmetric matrix whose off-diagonal entries
# (0.999) are almost as large as the diagonal: show its type, shape and
# eigenvalues.
cov_test = np.array([
    [1, 0.999],
    [0.999, 1],
])
# Single-argument print(...) behaves the same under Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(type(cov_test))
print(cov_test.shape)
print(np.linalg.eigvals(cov_test))
In [ ]:
# Covariance matrix of 'svi' and 'lbph', computed two ways, together with the
# eigenvalues of each result.
# Transposing a single column (a Series) changes nothing, so the columns are
# used directly.
data_svi = df['svi']
data_lbph = df['lbph']

# Way 1: pass the two columns as separate variables.
c1 = np.cov(m=data_svi, y=data_lbph)
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(c1)
print(np.linalg.eigvals(c1))

# Way 2: pass one array; the transpose puts each variable in its own row,
# which is the layout np.cov expects by default.
c2 = np.cov(m=df[['svi', 'lbph']].transpose())
print(c2)
print(np.linalg.eigvals(c2))
In [112]:
# Debugging cell: inspect the shapes and types of the 'svi' / 'lbph' data
# before passing them to a kernel density estimate.
sns.set()
x = df['svi']
y = df['lbph']
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(x.values.shape)
# sm.nonparametric.KDEMultivariate?
# kde = sm.nonparametric.KDEMultivariate([x, y], "cc")
# print(kde)
data = df[['lbph','svi']]
# From here on x is the first column of `data` ('lbph') as a plain array.
x = data.iloc[:, 0].values
print(x.shape)
# Report whether `data` is a DataFrame with more than one dimension.
if isinstance(data, pd.DataFrame) and np.ndim(data) > 1:
    print(True)
# sns.kdeplot(data)
# plt.show()
In [101]:
import statsmodels.api as sm
In [ ]:
# Display just the 'svi' and 'lbph' columns.
df.loc[:, ['svi', 'lbph']]
In [ ]:
# Count the 'lbph' values after grouping them into two bins.
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(df['lbph'].value_counts(bins=2, dropna=False))
In [ ]: