In [18]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from scipy import stats, optimize
from sklearn.preprocessing import Imputer, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.linear_model import Ridge, LassoLars, BayesianRidge, ARDRegression, Lars
from sklearn.linear_model import RANSACRegressor, ElasticNet
from sklearn.linear_model import PassiveAggressiveRegressor, Perceptron
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.svm import LinearSVR
from sklearn.base import clone
from itertools import combinations
from sklearn.metrics import explained_variance_score, r2_score, median_absolute_error, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
# Report the versions of the core libraries so results can be tied to an environment.
for template, module in (
    ('The scikit-learn version is {}.', sklearn),
    ('The pandas version is {}.', pd),
    ('The numpy version is {}.', np),
):
    print(template.format(module.__version__))
In [2]:
# Columns treated as prediction targets; they are filtered out when the
# predictor list is built so that no target is ever used as an input.
goal_features = [
    'murders', 'murdPerPop',
    'rapes', 'rapesPerPop',
    'robberies', 'robbbPerPop',
    'assaults', 'assaultPerPop',
    'burglaries', 'burglPerPop',
    'larcenies', 'larcPerPop',
    'autoTheft', 'autoTheftPerPop',
    'arsons', 'arsonsPerPop',
    'violentPerPop',
    'nonViolPerPop',
]

# Columns that are likewise excluded from the predictor list.
non_predictive_features = [
    'communityname',
    'state',
    'countyCode',
    'communityCode',
    'fold',
]
In [3]:
# Load the crime dataset. The file marks missing values with '?'; declaring
# that at parse time (instead of replacing the marker afterwards) lets pandas
# infer numeric dtypes for the affected columns rather than leaving them as
# strings, and avoids the `np.NAN` alias that NumPy 2.0 removed.
df = pd.read_csv('../datasets/UnnormalizedCrimeData.csv', na_values='?')

# Candidate predictors: every column that is neither a target nor one of the
# explicitly excluded non-predictive columns.
excluded_columns = set(goal_features) | set(non_predictive_features)
features = [column for column in df.columns if column not in excluded_columns]
len(features)
Out[3]:
In [4]:
def drop_rows_with_null_goal_feature(old_df, feature):
    """Return a copy of ``old_df`` without the rows whose ``feature`` is missing.

    Parameters
    ----------
    old_df : pandas.DataFrame
        Frame to filter; it is not modified.
    feature : str
        Name of the column that must be non-null in every returned row.

    Returns
    -------
    pandas.DataFrame
        An independent frame holding only the rows where ``feature`` is
        present. Because it is an explicit copy, callers can assign columns
        on it without pandas raising ``SettingWithCopyWarning``.

    Raises
    ------
    KeyError
        If ``feature`` is not a column of ``old_df`` (raised by ``dropna``).
    """
    return old_df.dropna(subset=[feature]).copy()
In [5]:
goal_feature = 'murders'

# Keep only rows with a known target and make sure the target is numeric.
# `assign` builds a new frame, so nothing is written into a filtered frame.
goal_df = drop_rows_with_null_goal_feature(df, goal_feature)
goal_df = goal_df.assign(**{goal_feature: pd.to_numeric(goal_df[goal_feature])})
print(goal_df[goal_feature].describe())

# Box plot of the target's distribution.
fig, ax = plt.subplots(figsize=(9, 6))
bp = ax.boxplot(goal_df[goal_feature])
ax.set_xticklabels([goal_feature])
ax.set_ylabel(goal_feature)
ax.set_title('Distribution of {0}'.format(goal_feature))
plt.show()
In [6]:
goal_feature = 'rapes'

# Keep only rows with a known target and make sure the target is numeric.
# `assign` builds a new frame, so nothing is written into a filtered frame
# (the pattern that triggers pandas' SettingWithCopyWarning).
goal_df = drop_rows_with_null_goal_feature(df, goal_feature)
goal_df = goal_df.assign(**{goal_feature: pd.to_numeric(goal_df[goal_feature])})
print(goal_df[goal_feature].describe())

# Box plot of the target's distribution.
fig, ax = plt.subplots(figsize=(9, 6))
bp = ax.boxplot(goal_df[goal_feature])
ax.set_xticklabels([goal_feature])
ax.set_ylabel(goal_feature)
ax.set_title('Distribution of {0}'.format(goal_feature))
plt.show()
In [7]:
goal_feature = 'robberies'

# Keep only rows with a known target and make sure the target is numeric.
# `assign` builds a new frame, so nothing is written into a filtered frame
# (the pattern that triggers pandas' SettingWithCopyWarning).
goal_df = drop_rows_with_null_goal_feature(df, goal_feature)
goal_df = goal_df.assign(**{goal_feature: pd.to_numeric(goal_df[goal_feature])})
print(goal_df[goal_feature].describe())

# Box plot of the target's distribution.
fig, ax = plt.subplots(figsize=(9, 6))
bp = ax.boxplot(goal_df[goal_feature])
ax.set_xticklabels([goal_feature])
ax.set_ylabel(goal_feature)
ax.set_title('Distribution of {0}'.format(goal_feature))
plt.show()
In [8]:
goal_feature = 'assaults'

# Keep only rows with a known target and make sure the target is numeric.
# `assign` builds a new frame, so nothing is written into a filtered frame
# (the pattern that triggers pandas' SettingWithCopyWarning).
goal_df = drop_rows_with_null_goal_feature(df, goal_feature)
goal_df = goal_df.assign(**{goal_feature: pd.to_numeric(goal_df[goal_feature])})
print(goal_df[goal_feature].describe())

# Box plot of the target's distribution.
fig, ax = plt.subplots(figsize=(9, 6))
bp = ax.boxplot(goal_df[goal_feature])
ax.set_xticklabels([goal_feature])
ax.set_ylabel(goal_feature)
ax.set_title('Distribution of {0}'.format(goal_feature))
plt.show()
In [9]:
goal_feature = 'burglaries'

# Keep only rows with a known target and make sure the target is numeric.
# `assign` builds a new frame, so nothing is written into a filtered frame
# (the pattern that triggers pandas' SettingWithCopyWarning).
goal_df = drop_rows_with_null_goal_feature(df, goal_feature)
goal_df = goal_df.assign(**{goal_feature: pd.to_numeric(goal_df[goal_feature])})
print(goal_df[goal_feature].describe())

# Box plot of the target's distribution.
fig, ax = plt.subplots(figsize=(9, 6))
bp = ax.boxplot(goal_df[goal_feature])
ax.set_xticklabels([goal_feature])
ax.set_ylabel(goal_feature)
ax.set_title('Distribution of {0}'.format(goal_feature))
plt.show()
In [10]:
# --- Settings for this experiment -------------------------------------------
goal_feature = 'murders'
k_best = 96              # number of predictors kept by univariate selection
outlier_quantile = 0.98  # rows whose target exceeds this quantile are dropped
test_fraction = 0.3
random_seed = 0          # fixed so the split (and the scores) are reproducible

# Imputation is a pipeline step so that column means are learned only from the
# rows the model is being fitted on; fitting the imputer on the full table
# before splitting would leak test/validation information into training.
# NOTE(review): `Imputer` was removed in scikit-learn 0.22; on newer versions
# the replacement is `sklearn.impute.SimpleImputer`.
clf = Pipeline([
    ('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)),
    ('feature_selection', SelectKBest(k=k_best, score_func=f_regression)),
    ('regression', Ridge()),
])

# Rows with a known, numeric target. `assign` builds a new frame instead of
# writing into a filtered one.
goal_df = drop_rows_with_null_goal_feature(df, goal_feature)
goal_df = goal_df.assign(**{goal_feature: pd.to_numeric(goal_df[goal_feature])})

# Trim the extreme upper tail of the target.
target_cutoff = goal_df[goal_feature].quantile(outlier_quantile)
goal_df = goal_df[goal_df[goal_feature] <= target_cutoff]
print(len(goal_df))

df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    goal_df[features], goal_df[goal_feature],
    test_size=test_fraction, random_state=random_seed)

# The 'neg_mean_squared_error' scorer returns the *negated* MSE (so that
# greater is better); flip the sign back so the printed value is a real MSE.
mse_cv = -cross_val_score(estimator=clf, X=df_X_train, y=df_y_train, scoring='neg_mean_squared_error')
r2_cv = cross_val_score(estimator=clf, X=df_X_train, y=df_y_train, scoring='r2')
print("Cross Validation Score MSE and R_2 are {0} and {1}".format(mse_cv.mean(), r2_cv.mean()))

clf.fit(df_X_train, df_y_train)

# Predict once per split and reuse the result for both metrics.
train_predictions = clf.predict(df_X_train)
mse_train = mean_squared_error(df_y_train, train_predictions)
r2_train = r2_score(df_y_train, train_predictions)
print("Training MSE error & R_2 SCore are {0} and {1} ".format(mse_train, r2_train))

test_predictions = clf.predict(df_X_test)
mse = mean_squared_error(df_y_test, test_predictions)
r2_sc = r2_score(df_y_test, test_predictions)
print("Test MSE error & R_2 SCore are {0} and {1} ".format(mse, r2_sc))
In [14]:
# --- Settings for this experiment -------------------------------------------
goal_feature = 'rapes'
k_best = 100             # number of predictors kept by univariate selection
outlier_quantile = 0.98  # rows whose target exceeds this quantile are dropped
test_fraction = 0.3
random_seed = 0          # fixed so the split, the model and the scores are reproducible

# Imputation is a pipeline step so that column means are learned only from the
# rows the model is being fitted on; fitting the imputer on the full table
# before splitting would leak test/validation information into training.
# NOTE(review): `Imputer` was removed in scikit-learn 0.22; on newer versions
# the replacement is `sklearn.impute.SimpleImputer`.
clf = Pipeline([
    ('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)),
    ('feature_selection', SelectKBest(k=k_best, score_func=f_regression)),
    ('regression', GradientBoostingRegressor(random_state=random_seed)),
])

# Rows with a known, numeric target. `assign` builds a new frame instead of
# writing into a filtered one.
goal_df = drop_rows_with_null_goal_feature(df, goal_feature)
goal_df = goal_df.assign(**{goal_feature: pd.to_numeric(goal_df[goal_feature])})

# Trim the extreme upper tail of the *target being modelled*. The previous
# version filtered on the `murders` column here, which looks like a copy-paste
# slip from the murders experiment.
# NOTE(review): if trimming by `murders` was intentional, restore it explicitly.
target_cutoff = goal_df[goal_feature].quantile(outlier_quantile)
goal_df = goal_df[goal_df[goal_feature] <= target_cutoff]
print(len(goal_df))

df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    goal_df[features], goal_df[goal_feature],
    test_size=test_fraction, random_state=random_seed)

# The 'neg_mean_squared_error' scorer returns the *negated* MSE (so that
# greater is better); flip the sign back so the printed value is a real MSE.
mse_cv = -cross_val_score(estimator=clf, X=df_X_train, y=df_y_train, scoring='neg_mean_squared_error')
r2_cv = cross_val_score(estimator=clf, X=df_X_train, y=df_y_train, scoring='r2')
print("Cross Validation Score MSE and R_2 are {0} and {1}".format(mse_cv.mean(), r2_cv.mean()))

clf.fit(df_X_train, df_y_train)

# Predict once per split and reuse the result for both metrics.
train_predictions = clf.predict(df_X_train)
mse_train = mean_squared_error(df_y_train, train_predictions)
r2_train = r2_score(df_y_train, train_predictions)
print("Training MSE error & R_2 SCore are {0} and {1} ".format(mse_train, r2_train))

test_predictions = clf.predict(df_X_test)
mse = mean_squared_error(df_y_test, test_predictions)
r2_sc = r2_score(df_y_test, test_predictions)
print("Test MSE error & R_2 SCore are {0} and {1} ".format(mse, r2_sc))
In [28]:
# --- Settings for this experiment -------------------------------------------
goal_feature = 'assaults'
k_best = 116         # number of predictors kept by univariate selection
test_fraction = 0.2
random_seed = 0      # fixed so the split (and the scores) are reproducible

# Imputation is a pipeline step so that column means are learned only from the
# rows the model is being fitted on; fitting the imputer on the full table
# before splitting would leak test/validation information into training.
# NOTE(review): `Imputer` was removed in scikit-learn 0.22; on newer versions
# the replacement is `sklearn.impute.SimpleImputer`.
clf = Pipeline([
    ('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)),
    ('feature_selection', SelectKBest(k=k_best, score_func=f_regression)),
    ('regression', LinearRegression()),
])

# Rows with a known, numeric target (no outlier trimming in this experiment).
# `assign` builds a new frame instead of writing into a filtered one.
goal_df = drop_rows_with_null_goal_feature(df, goal_feature)
goal_df = goal_df.assign(**{goal_feature: pd.to_numeric(goal_df[goal_feature])})
print(len(goal_df))

df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    goal_df[features], goal_df[goal_feature],
    test_size=test_fraction, random_state=random_seed)

# The 'neg_mean_squared_error' scorer returns the *negated* MSE (so that
# greater is better); flip the sign back so the printed value is a real MSE.
mse_cv = -cross_val_score(estimator=clf, X=df_X_train, y=df_y_train, scoring='neg_mean_squared_error')
r2_cv = cross_val_score(estimator=clf, X=df_X_train, y=df_y_train, scoring='r2')
print("Cross Validation Score MSE and R_2 are {0} and {1}".format(mse_cv.mean(), r2_cv.mean()))

clf.fit(df_X_train, df_y_train)

# Predict once per split and reuse the result for both metrics.
train_predictions = clf.predict(df_X_train)
mse_train = mean_squared_error(df_y_train, train_predictions)
r2_train = r2_score(df_y_train, train_predictions)

# Side-by-side preview of a few training targets and fitted values, instead of
# dumping the complete series and prediction array.
print(pd.DataFrame({'actual': df_y_train.values, 'predicted': train_predictions}).head())
print("Training MSE error & R_2 SCore are {0} and {1} ".format(mse_train, r2_train))

test_predictions = clf.predict(df_X_test)
mse = mean_squared_error(df_y_test, test_predictions)
r2_sc = r2_score(df_y_test, test_predictions)
print("Test MSE error & R_2 SCore are {0} and {1} ".format(mse, r2_sc))
In [ ]:
In [ ]:
# Display the held-out predictions of whichever pipeline was fitted last;
# both `clf` and `df_X_test` are reassigned by every modelling cell above.
held_out_predictions = clf.predict(df_X_test)
held_out_predictions
In [ ]: