In [79]:

    
import sklearn
import pandas
import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg1 = linear_model.BayesianRidge()
reg2 = linear_model.RANSACRegressor()
reg3 = linear_model.LassoLars(alpha=.1)
reg4 = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])
reg5 = linear_model.ElasticNetCV()
reg6 = linear_model.SGDRegressor()
reg7 = linear_model.PassiveAggressiveRegressor()
reg8 = linear_model.TheilSenRegressor()
reg9 = linear_model.HuberRegressor()
from sklearn import svm
clf = svm.SVR(C=1.0, epsilon=0.2)
models = [clf, reg, reg1,reg2,reg3,reg4,reg5,reg6,reg7,reg9]



In [80]:

    
dataframe = pandas.read_csv('../datasets/UCIrvineCrimeData.csv')

Cleaning the data



In [81]:

    
dataframe = dataframe.replace('?',np.NAN)
dict1 = dataframe.isnull().sum().to_dict()
non_zero = []
for a in dict1.keys():
	if dict1[a] > 100:
# 		print a
# 		print dict1[a]
		non_zero.append(a)

# print non_zero
for elem in non_zero:
	del dataframe[elem]

# Perhaps its better to remove this row.
# No reason in removing whole column.
dataframe= dataframe.dropna()
cols  = list(dataframe.columns.values)

Performing Regression



In [82]:

    
cols = [ x for x in cols if x not in ['fold', 'state', 'community', 'communityname', 'county'
                                               ,'ViolentCrimesPerPop']]
# cols = ['numbUrban', 'NumInShelters']
print len(cols)
# print cols[0]
cols1 = cols
all_errors = []

 
X = dataframe[list(cols1)].values
total_val = len(dataframe['ViolentCrimesPerPop'].values)
percent = 2/float(3)
edge_val = int(total_val*percent)
Y = np.asarray(dataframe['ViolentCrimesPerPop'].values)
for model in models:
    model.fit(X[:edge_val], Y[:edge_val])
y_predict = [model.predict(X[edge_val:]) for model in models]
error = [0] * len(models)
for i in xrange(edge_val, total_val):
    for k in xrange(len(models)):
        print k
        error[k] += (float(Y[i]) - y_predict[k][i-edge_val])**2

#     print "Error of Estimates"
error = [er/float(total_val- edge_val) for er in error]
all_errors.append(error)

#     print error

print "Im done"



In [83]:

    
# arr = np.array(all_errors)
# print np.min(arr, axis=0)
print error
best_f= np.argmin(arr, axis=0)
print "SVM ranked"
print arr[:,0].argsort()
print "Bayesian ranked"
print arr[:,2].argsort()


print best_f

bf = [cols[b1] for b1 in best_f]
print bf

# print np.min(arr,axis=1)
models_chosen = np.argmin(arr,axis=1)
# print models_chosen
fm = {}
for best in models_chosen:
    if best in fm:
        fm[best] +=1
    else:
        fm[best] = 1
print fm









    



[0.019638199850900737, 0.017404206681288809, 0.016869644773174153, 0.23166476735433161, 0.046194355413773855, 0.016994298317673882, 0.016962308052769402, 0.017570772386302138, 0.022245577713124888, 0.017071350383533343]
SVM ranked
[44 50 43 46 45 49  3 27 28 15 17 40 41  2 67 38 68 71 77 32 89 30 90 19 74
  0 69 73 10 29 12 99 62 51 39 22 72 70 33 78 20  5 86 37 61 13 63  8 60 31
 55 54  7 65 36 53 52 93 82 26 57 11 58 56 66 59 79 25 91 23 83 64 85 80 84
 21 96 42 24 81 14  6 97  1 16 34 48 92 76  9 18 94 47  4 95 75 87 88 35 98]
Bayesian ranked
[44 50 43 45 46  3 17  2 40 41 15 38 28 67 32 49 77 30 27 73 19 69 74 12 68
 71 99 29  0 89 10 70 20 90 62 78 37 33 72  5 39 31 86 51 13 55 61 36 54 22
 63 82 60 26 85 83 84 58 59 66 21 53 57 56 79 52 93 80 24  7 96 81 97 91 14
 65 48  8 16 25 92 42 11  9  6 23 35 64 87  4  1 34 47 95 98 76 75 18 88 94]
[44 44 44 50  0 44 44 50 49 50 44]
['PctKids2Par', 'PctKids2Par', 'PctKids2Par', 'PctIlleg', 'population', 'PctKids2Par', 'PctKids2Par', 'PctIlleg', 'NumIlleg', 'PctIlleg', 'PctKids2Par']
{0: 19, 1: 13, 2: 22, 4: 5, 5: 17, 6: 14, 7: 3, 9: 3, 10: 4}



In [53]:

    
cols[44]









    Out[53]:





'PctKids2Par'



In [27]:

    
cols[50]









    Out[27]:





'PctIlleg'



In [ ]: