In [79]:
import sklearn
import pandas
import numpy as np
from sklearn import linear_model
from sklearn import svm

# One fixed seed for every estimator that accepts `random_state`, so that a
# fresh "Restart & Run All" reproduces the same fitted models and errors.
RANDOM_STATE = 0

# Candidate regressors compared in this notebook.
reg = linear_model.LinearRegression()
reg1 = linear_model.BayesianRidge()
reg2 = linear_model.RANSACRegressor(random_state=RANDOM_STATE)
reg3 = linear_model.LassoLars(alpha=.1)
reg4 = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])
reg5 = linear_model.ElasticNetCV()
reg6 = linear_model.SGDRegressor(random_state=RANDOM_STATE)
reg7 = linear_model.PassiveAggressiveRegressor(random_state=RANDOM_STATE)
# NOTE(review): reg8 is constructed but is not part of `models` below --
# confirm whether leaving TheilSenRegressor out of the comparison is intended.
reg8 = linear_model.TheilSenRegressor(random_state=RANDOM_STATE)
reg9 = linear_model.HuberRegressor()
clf = svm.SVR(C=1.0, epsilon=0.2)

# The order of this list is the order of the error values printed at the end
# of the notebook (SVR first, then the linear models).
models = [clf, reg, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg9]

In [80]:
# Load the raw crime data. The file marks missing entries with '?'; declaring
# that as an NA value lets pandas parse the affected numeric columns as floats
# instead of leaving them as strings (object dtype).
dataframe = pandas.read_csv('../datasets/UCIrvineCrimeData.csv', na_values='?')

Cleaning the data


In [81]:
# Treat the dataset's '?' placeholder as a missing value.
# (`np.nan` is the canonical spelling; the `np.NAN` alias was removed in NumPy 2.0.)
dataframe = dataframe.replace('?', np.nan)

# Columns with more than this many missing entries are dropped entirely.
MAX_MISSING_PER_COLUMN = 100
missing_per_column = dataframe.isnull().sum()
sparse_columns = list(missing_per_column[missing_per_column > MAX_MISSING_PER_COLUMN].index)
dataframe = dataframe.drop(sparse_columns, axis=1)

# The remaining columns have only a few gaps, so drop the affected rows
# rather than throwing away whole columns.
dataframe = dataframe.dropna()

# Feature columns: everything except the identifier-like columns and the
# regression target ('ViolentCrimesPerPop').
excluded_columns = ['fold', 'state', 'community', 'communityname', 'county',
                    'ViolentCrimesPerPop']
cols = [name for name in dataframe.columns.values if name not in excluded_columns]

Getting Training Data and Training Labels. (Training = 2/3, Test=1/3)


In [82]:
# Feature matrix. Cast explicitly to float: columns that originally contained
# '?' may still carry object dtype even after the missing rows were dropped,
# and a non-numeric column should fail here rather than inside an estimator.
X = dataframe[list(cols)].values.astype(float)

# Sequential split: the first 2/3 of the rows are used for training and the
# remaining 1/3 for testing; `edge_val` is the index of the first test row.
total_val = len(dataframe)
percent = 2 / float(3)
edge_val = int(total_val * percent)

# Regression target as a float array.
Y = np.asarray(dataframe['ViolentCrimesPerPop'].values, dtype=float)
print(Y)


[ 0.2   0.67  0.43 ...,  0.23  0.19  0.48]

Performing Regression


In [83]:
# Fit every candidate regressor on the training portion (rows before edge_val).
train_features = X[:edge_val]
train_targets = Y[:edge_val]
for model in models:
    model.fit(train_features, train_targets)

Predicting on Test Values


In [85]:
# Predict the held-out rows (from edge_val onwards) with each fitted model;
# y_predict[k] holds the predictions of models[k].
test_features = X[edge_val:]
y_predict = []
for model in models:
    y_predict.append(model.predict(test_features))

Calculating Error of Estimate (mean squared error on the test set)


In [86]:
# Mean squared error of each model on the test rows; error[k] belongs to
# models[k]. Computed with NumPy instead of a per-sample Python loop.
test_targets = np.asarray(Y[edge_val:], dtype=float)
error = []
for predictions in y_predict:
    # ravel() guards against a column-shaped prediction array broadcasting
    # against the 1-D targets.
    residuals = test_targets - np.ravel(np.asarray(predictions, dtype=float))
    # Store plain Python floats so the printed list stays readable.
    error.append(float(np.mean(residuals ** 2)))

print("Error of Estimates")
print(error)


Error of Estimates
[0.019638199850900737, 0.017404206681288809, 0.016869644773174153, 0.14612669450461907, 0.046194355413773855, 0.016994298317673882, 0.016962308052769402, 0.017372763574437011, 0.018487657969123761, 0.017071350383533343]

In [ ]:


In [ ]:


In [ ]: