In [1]:
%pylab inline
Communities in the US. The data combines socio-economic data from the 1990 Census, law enforcement data from the 1990 Law Enforcement Management and Administrative Statistics survey, and crime data from the 1995 FBI UCR.
This dataset consists of 2215 instances of crime data that have been reported from across all the states in the United States of America
Total number of features = 147
5 - Non-predictive features
-- communityname: Community name - not predictive - for information only (string)
-- state: US state (by 2 letter postal abbreviation)(nominal)
-- countyCode: numeric code for county - not predictive, and many missing values (numeric)
-- communityCode: numeric code for community - not predictive and many missing values (numeric)
-- fold: fold number for non-random 10 fold cross validation, potentially useful for debugging, paired tests - not predictive (numeric - integer)
124 - Predictive features : More details on these can be found here
18 potential goal features which are listed below
-- murders: number of murders in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted)
-- murdPerPop: number of murders per 100K population (numeric - decimal) potential GOAL attribute (to be predicted)
-- rapes: number of rapes in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted)
-- rapesPerPop: number of rapes per 100K population (numeric - decimal) potential GOAL attribute (to be predicted)
-- robberies: number of robberies in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted)
-- robbbPerPop: number of robberies per 100K population (numeric - decimal) potential GOAL attribute (to be predicted)
-- assaults: number of assaults in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted)
-- assaultPerPop: number of assaults per 100K population (numeric - decimal) potential GOAL attribute (to be predicted)
-- burglaries: number of burglaries in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted)
-- burglPerPop: number of burglaries per 100K population (numeric - decimal) potential GOAL attribute (to be predicted)
-- larcenies: number of larcenies in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted)
-- larcPerPop: number of larcenies per 100K population (numeric - decimal) potential GOAL attribute (to be predicted)
-- autoTheft: number of auto thefts in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted)
-- autoTheftPerPop: number of auto thefts per 100K population (numeric - decimal) potential GOAL attribute (to be predicted)
-- arsons: number of arsons in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted)
-- arsonsPerPop: number of arsons per 100K population (numeric - decimal) potential GOAL attribute (to be predicted)
-- ViolentCrimesPerPop: total number of violent crimes per 100K population (numeric - decimal) GOAL attribute (to be predicted)
-- nonViolPerPop: total number of non-violent crimes per 100K population (numeric - decimal) potential GOAL attribute (to be predicted)
In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from scipy import stats, optimize
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from itertools import combinations
from sklearn.metrics import explained_variance_score, r2_score, median_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
# Report the library versions in use, one line per library.
version_lines = (
    'The scikit-learn version is {}.'.format(sklearn.__version__),
    'The pandas version is {}.'.format(pd.__version__),
    'The numpy version is {}.'.format(np.__version__),
)
for line in version_lines:
    print(line)
In [3]:
# Candidate target columns: a raw count and a per-population rate for each
# crime type, followed by the two aggregate per-population rates.
goal_features = [
    'murders', 'murdPerPop',
    'rapes', 'rapesPerPop',
    'robberies', 'robbbPerPop',
    'assaults', 'assaultPerPop',
    'burglaries', 'burglPerPop',
    'larcenies', 'larcPerPop',
    'autoTheft', 'autoTheftPerPop',
    'arsons', 'arsonsPerPop',
    'violentPerPop',
    'nonViolPerPop',
]

# Columns excluded from the predictor set (see the `features` list built when
# the data is loaded).
non_predictive_features = [
    'communityname',
    'state',
    'countyCode',
    'communityCode',
    'fold',
]
In [4]:
# Load the raw crime data. '?' is treated as a missing-value marker at parse
# time so that affected columns are read as numeric with NaN, instead of being
# left as strings and patched with a separate replace() afterwards.
df = pd.read_csv('../datasets/UnnormalizedCrimeData.csv', na_values='?')

# Predictor columns: every column that is neither a goal nor a non-predictive one.
excluded_columns = set(goal_features) | set(non_predictive_features)
features = [x for x in df.columns if x not in excluded_columns]
len(features)
Out[4]:
In [ ]:
# Number of missing values in each column of the full frame.
pd.isnull(df).sum()
In [ ]:
# Number of missing values in each candidate goal column.
goal_columns = df[goal_features]
pd.isnull(goal_columns).sum()
In [5]:
def drop_rows_with_null_goal_feature(old_df, feature):
    """Return a copy of ``old_df`` without the rows where ``feature`` is null.

    Only the single column named by ``feature`` is inspected; nulls in other
    columns are left in place. ``old_df`` itself is not modified.
    """
    return old_df.dropna(subset=[feature])
In [6]:
# Complete-case frame: keep only the rows with no missing value in any column.
# NOTE(review): this also drops rows whose only gaps are in columns that are not
# used as predictors - confirm that is intended.
missing_smaples_df = df.dropna(axis=0, how='any')
Now that we have dropped all the rows with missing values, let us calculate the cross-validated scores for all the goals
In [7]:
# Plain linear regression, used for the scores computed on the complete-case
# (rows-with-missing-values-dropped) data in the cells that follow.
estimator = LinearRegression()
In [8]:
#estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, missing_smaples_df[features], missing_smaples_df['murders']).mean()
print("Score after dropping all the rows with missing values for murders = %.2f" % score)
In [9]:
#estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, missing_smaples_df[features], missing_smaples_df['murdPerPop']).mean()
print("Score after dropping all the rows with missing values for murdPerPop = %.2f" % score)
In [10]:
#estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, missing_smaples_df[features], missing_smaples_df['rapes']).mean()
print("Score after dropping all the rows with missing values for rapes = %.2f" % score)
In [11]:
#estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, missing_smaples_df[features], missing_smaples_df['rapesPerPop']).mean()
print("Score after dropping all the rows with missing values for rapesPerPop = %.2f" % score)
In [12]:
# Rebind `estimator` to a three-stage pipeline: mean-impute missing values
# column-wise, standardise the features, then fit a linear regression.
estimator = Pipeline(steps=[
    ("imputer", Imputer(missing_values='NaN', strategy="mean", axis=0)),
    ("scaler", StandardScaler()),
    ("linearRegression", LinearRegression()),
])
In [13]:
# Keep every row that has a 'murders' value; gaps in the predictors are left in
# place for the current `estimator` to handle.
murders_df = df.dropna(subset=['murders'])
fold_scores = cross_val_score(estimator, X=murders_df[features], y=murders_df['murders'])
score = fold_scores.mean()
print("Score after imputation of the missing values = %.2f" % score)
In [14]:
# Keep every row that has a 'murdPerPop' value; gaps in the predictors are left
# in place for the current `estimator` to handle.
murders_perpop_df = df.dropna(subset=['murdPerPop'])
fold_scores = cross_val_score(
    estimator,
    X=murders_perpop_df[features],
    y=murders_perpop_df['murdPerPop'],
)
score = fold_scores.mean()
print("Score after imputation of the missing values = %.2f" % score)
In [15]:
# Keep every row that has a 'rapes' value; gaps in the predictors are left in
# place for the current `estimator` to handle.
rapes_df = df.dropna(subset=['rapes'])
fold_scores = cross_val_score(estimator, X=rapes_df[features], y=rapes_df['rapes'])
score = fold_scores.mean()
print("Score after imputation of the missing values = %.2f" % score)
In [16]:
# Keep every row that has a 'rapesPerPop' value; gaps in the predictors are left
# in place for the current `estimator` to handle.
rapes_perpop_df = df.dropna(subset=['rapesPerPop'])
fold_scores = cross_val_score(
    estimator,
    X=rapes_perpop_df[features],
    y=rapes_perpop_df['rapesPerPop'],
)
score = fold_scores.mean()
print("Score after imputation of the missing values = %.2f" % score)
In [42]:
def plot_RFECV_scores(estimator, list_of_goal_features, df, score):
    """Run RFECV for each goal column and plot score vs. number of features.

    For every goal in ``list_of_goal_features`` this:
      1. drops the rows of ``df`` whose goal value is missing,
      2. mean-imputes the remaining gaps in the predictor columns,
      3. fits an ``RFECV`` selector (step=1) wrapped around ``estimator``,
      4. prints the optimal feature count and the names of the kept features,
      5. plots the cross-validation score against the number of features.

    The predictor columns come from the notebook-level ``features`` list.

    Parameters
    ----------
    estimator : estimator object handed to ``RFECV``.
    list_of_goal_features : iterable of str
        Goal column names in ``df``; one fit and one figure per entry.
    df : pandas.DataFrame
        Frame holding both the predictor and the goal columns.
    score : str
        Passed to ``RFECV(scoring=...)`` and appended to the y-axis label.

    Returns
    -------
    None. Results are printed and plotted.
    """
    rfecv = RFECV(estimator=estimator, step=1, scoring=score)
    for goal in list_of_goal_features:
        filtered_df = drop_rows_with_null_goal_feature(df, goal)
        predictors = filtered_df[features]

        imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imputed_data = imr.fit_transform(predictors)

        # Imputer(axis=0) discards columns that were entirely missing at fit time
        # (their statistic is NaN), so keep the column names aligned with the
        # columns that actually reach RFECV. No-op when nothing was discarded.
        surviving_columns = predictors.columns[~np.isnan(imr.statistics_)]

        rfecv.fit(imputed_data, filtered_df[goal])
        print("Optimal number of features : %d" % rfecv.n_features_)

        final_features = [
            name for name, keep in zip(surviving_columns, rfecv.support_) if keep
        ]
        # print() call form: valid on both Python 2 and Python 3.
        print(final_features)

        # Plot number of features VS. cross-validation scores
        plt.figure()
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation " + score)
        plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
        plt.title(goal + ": Optimal number of features : %d" % rfecv.n_features_)
        plt.show()
In [43]:
# Feature selection for the five count-based goals, scored by R^2.
final_goals = ['murders', 'rapes', 'robberies', 'assaults', 'burglaries']
estimator = LinearRegression()
plot_RFECV_scores(
    estimator=estimator,
    list_of_goal_features=final_goals,
    df=df,
    score='r2',
)
In [44]:
# Feature selection for the five count-based goals, scored by negated mean
# squared error.
final_goals = ['murders', 'rapes', 'robberies', 'assaults', 'burglaries']
estimator = LinearRegression()
plot_RFECV_scores(
    estimator=estimator,
    list_of_goal_features=final_goals,
    df=df,
    score='neg_mean_squared_error',
)
In [ ]: