In [79]:
import sklearn
import pandas
import numpy as np
from sklearn import linear_model
from sklearn import svm

# One fixed seed for every estimator below that draws random numbers, so that
# a fresh "Restart & Run All" reproduces the same fitted models and errors.
RANDOM_SEED = 0

# Candidate regressors. Hyper-parameters are the library defaults unless
# spelled out explicitly.
reg = linear_model.LinearRegression()
reg1 = linear_model.BayesianRidge()
reg2 = linear_model.RANSACRegressor(random_state=RANDOM_SEED)
reg3 = linear_model.LassoLars(alpha=.1)
reg4 = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])
reg5 = linear_model.ElasticNetCV()
reg6 = linear_model.SGDRegressor(random_state=RANDOM_SEED)
reg7 = linear_model.PassiveAggressiveRegressor(random_state=RANDOM_SEED)
# reg8 is instantiated but is not listed in `models`, so it is never fitted.
reg8 = linear_model.TheilSenRegressor(random_state=RANDOM_SEED)
reg9 = linear_model.HuberRegressor()
clf = svm.SVR(C=1.0, epsilon=0.2)

# Order matters: per-model results computed later are indexed by position
# in this list.
models = [clf, reg, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg9]
In [80]:
# '?' is the placeholder this notebook treats as a missing value. Declaring it
# to the parser turns those cells into NaN at load time, so pandas can infer
# numeric dtypes for the affected columns instead of keeping them as strings.
dataframe = pandas.read_csv('../datasets/UCIrvineCrimeData.csv', na_values='?')
In [81]:
# Map the '?' placeholder to NaN. `np.nan` is the spelling that exists in
# every NumPy release.
dataframe = dataframe.replace('?', np.nan)

# A column with more than this many missing entries is dropped entirely.
MAX_MISSING_PER_COLUMN = 100

# dict1: column name -> number of missing entries in that column.
dict1 = dataframe.isnull().sum().to_dict()
# non_zero: names of the columns that exceed the threshold.
non_zero = [name for name in dataframe.columns
            if dict1[name] > MAX_MISSING_PER_COLUMN]

# Sparse columns go as a whole; for the columns that remain, only the rows
# that still contain a gap are discarded (no reason to lose a whole column
# over a few missing cells).
dataframe = dataframe.drop(non_zero, axis=1).dropna()

# Columns never used as predictors; 'ViolentCrimesPerPop' is the target.
NON_FEATURE_COLUMNS = ['fold', 'state', 'community', 'communityname', 'county',
                       'ViolentCrimesPerPop']
cols = [name for name in dataframe.columns if name not in NON_FEATURE_COLUMNS]
In [82]:
# Feature matrix and target as explicit float arrays. A column that was parsed
# as strings would otherwise make `.values` an object array; casting here
# surfaces any non-numeric cell immediately.
X = dataframe[list(cols)].values.astype(float)
Y = dataframe['ViolentCrimesPerPop'].values.astype(float)

# Unshuffled hold-out split: rows before `edge_val` are the leading two
# thirds of the data, rows from `edge_val` onward are the rest.
total_val = len(Y)
percent = 2.0 / 3
edge_val = int(total_val * percent)

# Parenthesised form prints the same text on Python 2 and Python 3.
print(Y)
In [83]:
# Fit every candidate on the rows that precede the split point.
X_fit = X[:edge_val]
Y_fit = Y[:edge_val]
for model in models:
    model.fit(X_fit, Y_fit)
In [85]:
# Predict the rows from the split point onward with each model; results are
# stored in the same order as `models`.
y_predict = []
for model in models:
    y_predict.append(model.predict(X[edge_val:]))
In [86]:
# Mean squared error of each model on the held-out rows (edge_val .. total_val).
# error[k] belongs to models[k].
y_held_out = np.asarray(Y[edge_val:total_val], dtype=float)
error = [
    # ravel() guards against a (n, 1)-shaped prediction broadcasting against
    # the 1-D target; float() keeps the printed list free of NumPy wrappers.
    float(np.mean((y_held_out - np.asarray(pred, dtype=float).ravel()) ** 2))
    for pred in y_predict
]

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print("Error of Estimates")
print(error)
In [ ]:
In [ ]:
In [ ]: