In [11]:
import pandas as pd

# Read the hosted CSV over HTTP straight into a DataFrame; every later cell
# works from `data_df`.
data_df = pd.read_csv(
    filepath_or_buffer='https://query.data.world/s/qffqz4g7rjqvfmvuyhlob4obdzdr4q',
)
In [12]:
import numpy as np
# Seed NumPy's global random generator so that anything drawing from it
# gives the same numbers on a fresh "Restart & Run All".
np.random.seed(12345)
# IPython line magic: render matplotlib figures inline in the cell output.
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
# Use matplotlib's 'ggplot' style sheet for every figure created below.
plt.style.use('ggplot')
In [13]:
# Show the first five rows as a quick sanity check of the loaded table.
data_df.head(n=5)
Out[13]:
In [14]:
# Distribution of the regression target, drawn as a histogram.
ax = data_df['target_deathrate'].plot.hist(title='death rate')
In [15]:
# Pairwise Pearson correlations between the numeric columns.
# Numeric columns are selected explicitly: older pandas dropped non-numeric
# columns from .corr() silently, but pandas >= 2.0 raises on them instead.
corrs = data_df.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title('Pairwise correlation of numeric columns')
ax = sns.heatmap(corrs, ax=ax, linewidths=1)
In [16]:
# Correlation of every numeric feature with the target.
# Non-numeric columns are filtered out first because correlating them with the
# numeric target fails on pandas >= 2.0 (older versions skipped them silently).
corrs = (
    data_df.drop('target_deathrate', axis=1)
    .select_dtypes(include='number')
    .corrwith(data_df.target_deathrate)
)

fig, ax = plt.subplots(figsize=(15, 10))
corrs.plot(kind='bar', ax=ax, rot=30, title='Correlation with target_deathrate')
Out[16]:
In [17]:
# The four features whose correlation with the target is largest in magnitude
# (sign ignored); ties keep the first occurrence.
corrs.abs().nlargest(n=4, keep='first')
Out[17]:
In [18]:
# Names of the four features most strongly (in absolute value) correlated
# with the target.
top_cols = corrs.abs().nlargest(4).index

# One scatter plot with a fitted regression line per top feature.
# x and y are passed by keyword: seaborn >= 0.12 made them keyword-only, and
# the first positional slot is `data`, so the positional form raises TypeError.
for colname in top_cols:
    sns.lmplot(x=colname, y='target_deathrate', data=data_df)
In [19]:
from sklearn.decomposition import PCA

# Fit a 10-component PCA on the feature table with the target and five other
# columns removed, then chart the share of variance each component explains.
# NOTE(review): the features are passed to PCA without standardisation, so
# large-scale columns may dominate the components - confirm this is intended.
pca = PCA(n_components=10, random_state=12345).fit(
    data_df.drop(
        columns=[
            'target_deathrate',
            'binnedinc',
            'geography',
            'pctsomecol18_24',
            'pctemployed16_over',
            'pctprivatecoveragealone',
        ]
    )
)
ratios = pca.explained_variance_ratio_
ax = pd.Series(ratios).plot.bar()
In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline

# Two-step estimator: project onto the first 2 principal components, then fit
# an ordinary least-squares regression on them. Wrapping both in a Pipeline
# means the PCA is re-fitted on the training part of each fold only.
pipeline = Pipeline([
    ('pca', PCA(n_components=2, random_state=12345)),
    ('model', LinearRegression())
])

# Feature matrix: everything except the target and five excluded columns.
X = data_df.drop(['target_deathrate', 'binnedinc',
                  'geography', 'pctsomecol18_24', 'pctemployed16_over',
                  'pctprivatecoveragealone'],
                 axis=1).values
y = data_df.target_deathrate.values

# shuffle=True is required for random_state to have any effect; scikit-learn
# >= 0.24 raises ValueError if random_state is set while shuffle is False.
cv = KFold(n_splits=10, shuffle=True, random_state=12345)

# One R^2 score per fold.
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='r2')
In [21]:
# Wrap the per-fold scores in a named Series and draw one bar per fold.
scores = pd.Series(scores, name='cross validation scores')
ax = scores.plot.bar()

# Horizontal reference lines: blue at the mean score, black at the value of
# the standard deviation of the scores.
for level, colour in ((scores.mean(), 'blue'), (scores.std(), 'black')):
    line = ax.axhline(level, color=colour)
In [22]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline

# Same pipeline shape as the linear model, with a decision tree as the final
# estimator. The tree gets an explicit random_state so that re-running this
# cell gives the same scores without relying on the global NumPy RNG state.
pipeline = Pipeline([
    ('pca', PCA(n_components=2, random_state=12345)),
    ('model', DecisionTreeRegressor(random_state=12345))
])

# Feature matrix: everything except the target and five excluded columns.
X = data_df.drop(['target_deathrate', 'binnedinc',
                  'geography', 'pctsomecol18_24', 'pctemployed16_over',
                  'pctprivatecoveragealone'],
                 axis=1).values
y = data_df.target_deathrate.values

# shuffle=True is required for random_state to have any effect; scikit-learn
# >= 0.24 raises ValueError if random_state is set while shuffle is False.
cv = KFold(n_splits=10, shuffle=True, random_state=12345)

# One R^2 score per fold.
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='r2')
In [23]:
# Wrap the decision-tree fold scores in a named Series and draw one bar per fold.
scores = pd.Series(scores, name='cross validation scores (decision tree)')
ax = scores.plot.bar()

# Horizontal reference lines: blue at the mean score, black at the value of
# the standard deviation of the scores.
for level, colour in ((scores.mean(), 'blue'), (scores.std(), 'black')):
    line = ax.axhline(level, color=colour)
In [ ]: