# Establishing a Baseline for the Problem

## Using a variety of non-linear regression algorithms

``````

In :

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

import pprint
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score
from sklearn import metrics

from sklearn.svm import SVR

%matplotlib inline

``````
``````

In :

# importing the dataset we prepared and saved using Baseline 1 Notebook

``````
``````

Out:

text-align: right;
}

text-align: left;
}

.dataframe tbody tr th {
vertical-align: top;
}

Unnamed: 0
State_Name
ind_district
Crop_Year
Season
Crop
Area
Production
phosphorus
X1
X2
X3
X4

0
15
anantapur
1999
kharif
Rice
37991.0
105082.0
0.0
96800.0
75400.0
643.720
881.473

1
16
anantapur
2000
kharif
Rice
39905.0
117680.0
0.0
105082.0
96800.0
767.351
643.720

2
17
anantapur
2001
kharif
Rice
32878.0
95609.0
0.0
117680.0
105082.0
579.338
767.351

3
18
anantapur
2002
kharif
Rice
29066.0
66329.0
0.0
95609.0
117680.0
540.070
579.338

4
21
anantapur
2005
kharif
Rice
25008.0
69972.0
0.0
85051.0
44891.0
819.700
564.500

``````
``````

In :

# Discard the stray "Unnamed: 0" column and multiply phosphorus by 10,
# expressed as one chained transformation.
# NOTE(review): the reason for the x10 factor is not visible here — confirm the intended unit.
ricep = (
    ricep
    .drop(columns=["Unnamed: 0"])
    .assign(phosphorus=lambda frame: frame["phosphorus"] * 10)
)

``````
``````

In :

# Derived column "value": Production divided by Area, row by row.
ricep["value"] = ricep["Production"].div(ricep["Area"])

``````
``````

In :

# Predictors: the four X* columns plus phosphorus.
X = ricep[["X1", "X2", "X3", "X4", "phosphorus"]]
# Target: the "value" column scaled by 1000, kept as a one-column DataFrame.
y = ricep[["value"]].mul(1000)

``````
``````

In :

# Z-score normalisation: for every predictor add a "<name>_zscore" column holding
# (value - column mean) / column population standard deviation (ddof=0).
# Alternative to try: sklearn's built-in scaling instead of this manual version.

# X was selected out of ricep; take an explicit copy so the new columns are
# written to an independent frame (this is what avoids SettingWithCopyWarning).
X = X.copy()

# Skip columns that are already z-scores so that re-running this cell does not
# create "<name>_zscore_zscore" columns.
raw_cols = [col for col in X.columns if not col.endswith("_zscore")]
for col in raw_cols:
    col_zscore = col + '_zscore'
    X[col_zscore] = (X[col] - X[col].mean()) / X[col].std(ddof=0)

``````
``````

/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

``````
``````

In :

# Model inputs: only the z-scored versions of the five predictors.
X_ = X[
    [
        "X1_zscore",
        "X2_zscore",
        "X3_zscore",
        "X4_zscore",
        "phosphorus_zscore",
    ]
]

``````
``````

Out:

text-align: right;
}

text-align: left;
}

.dataframe tbody tr th {
vertical-align: top;
}

X1_zscore
X2_zscore
X3_zscore
X4_zscore
phosphorus_zscore

0
-0.285176
-0.374714
-0.457800
0.021735
-0.837691

1
-0.247120
-0.276111
-0.198113
-0.496827
-0.837691

2
-0.189232
-0.237950
-0.593035
-0.227176
-0.837691

3
-0.290648
-0.179903
-0.675518
-0.637250
-0.837691

4
-0.339162
-0.515288
-0.088153
-0.669613
-0.837691

``````
``````

In :

# Hold out 20% of the rows; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_,
    y,
    test_size=0.2,
    random_state=1,
)

``````

### First, checking the average RMSE for Linear Regression

``````

In :

# 5-fold cross-validated RMSE for ordinary least squares on the z-scored predictors.
clf = LinearRegression()
# The scorer returns the negated MSE of each fold; flip the sign and take the
# square root of the whole array at once to obtain per-fold RMSE.
scores = np.sqrt(-cross_val_score(clf, X_, y, cv=5, scoring='neg_mean_squared_error'))

avg_rmse = scores.mean()
print(scores)
print("\n\nAvg RMSE is ", avg_rmse)

``````
``````

[ 1030.92314374  1109.37929379   972.36266895  1487.52744177   491.48595541]

Avg RMSE is  1018.33570073

/usr/local/lib/python3.6/site-packages/scipy/linalg/basic.py:1018: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
warnings.warn(mesg, RuntimeWarning)

``````
``````

In [ ]:

``````

# Epsilon-Support Vector Regression (SVR)

### RBF Kernel

``````

In :

# Per-fold RMSE from 5-fold CV of an RBF-kernel SVR on all five z-scored predictors.
# NOTE(review): C and gamma look hand-tuned — the search that produced them is not shown here.
clf = SVR(kernel='rbf', C=500000.0, gamma=0.0008, epsilon=0.1)
# SVR wants a 1-D target, hence the ravel; the scorer yields negated MSE per fold,
# which is converted to RMSE in one vectorised step.
scores = np.sqrt(
    -cross_val_score(clf, X_, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')
)

``````
``````

In :

# Report the per-fold RMSE values followed by their mean.
avg_rmse = scores.mean()
print(scores)
print("\n\nAvg RMSE is ", avg_rmse)

``````
``````

[  904.09013921   940.99998887   981.97853142  1616.00179024   568.93419484]

Avg RMSE is  1002.40092892

``````
``````

In :

# Feature subset without soil data: only the four original z-scored columns.
X_old = X[
    [
        "X1_zscore",
        "X2_zscore",
        "X3_zscore",
        "X4_zscore",
    ]
]

``````
``````

In :

# Per-fold and average RMSE from 5-fold CV of an RBF-kernel SVR on X_old.
clf = SVR(kernel='rbf', C=1000.0, gamma=0.027, epsilon=0.1)
# Negated per-fold MSE from the scorer -> RMSE, computed on the whole array at once.
scores = np.sqrt(
    -cross_val_score(clf, X_old, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')
)

avg_rmse = scores.mean()
print(scores)
print("\n\nAvg RMSE is ", avg_rmse)

``````
``````

[  903.93008696   753.88394413   765.69751566  1574.251674     636.95214188]

Avg RMSE is  926.943072526

``````

### Let's check the importance of the rain data

``````

In :

# Feature subset without rain data: only the first two z-scored columns.
X_nr = X[
    [
        "X1_zscore",
        "X2_zscore",
    ]
]

``````
``````

In :

# Per-fold and average RMSE from 5-fold CV of an RBF-kernel SVR on X_nr.
clf = SVR(kernel='rbf', C=1000.0, gamma=0.027, epsilon=0.1)
# Negated per-fold MSE from the scorer -> RMSE, computed on the whole array at once.
scores = np.sqrt(
    -cross_val_score(clf, X_nr, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')
)

avg_rmse = scores.mean()
print(scores)
print("\n\nAvg RMSE is ", avg_rmse)

``````
``````

[ 1039.57563055   863.77364865   944.40471     1492.31174906   672.96822263]

Avg RMSE is  1002.60679218

``````

## Let's try SVR with other kernels ...

#### Degree 3 Polynomial

``````

In :

# Per-fold and average RMSE from 5-fold CV of a degree-3 polynomial-kernel SVR on X_old.
clf = SVR(kernel='poly', degree=3, coef0=2, gamma='auto')
# Negated per-fold MSE from the scorer -> RMSE, computed on the whole array at once.
scores = np.sqrt(
    -cross_val_score(clf, X_old, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')
)

avg_rmse = scores.mean()
print(scores)
print("\n\nAvg RMSE is ", avg_rmse)

``````
``````

[  906.20976415   837.77643762  1049.76326739  1568.88777167   504.49443066]

Avg RMSE is  973.426334297

``````

### Polynomial Kernel also does better than Linear Regression

#### Degree 4 Polynomial

``````

In :

# Per-fold and average RMSE from 5-fold CV of a degree-4 polynomial-kernel SVR on X_old.
clf = SVR(kernel='poly', degree=4, coef0=2, gamma='auto')
# Negated per-fold MSE from the scorer -> RMSE, computed on the whole array at once.
scores = np.sqrt(
    -cross_val_score(clf, X_old, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')
)

avg_rmse = scores.mean()
print(scores)
print("\n\nAvg RMSE is ", avg_rmse)

``````
``````

[  907.10874357   787.20784909   848.64917648  1570.06140194   557.83575489]

Avg RMSE is  934.172585194

``````
``````

In [ ]:

``````