Establishing a Baseline for the Problem

Using a variety of non-linear regression algorithms

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

import pprint
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score
from sklearn import metrics

from sklearn.svm import SVR

%matplotlib inline

# Load the rice dataset that was prepared and saved by the "Baseline 1" notebook.
ricep = pd.read_csv(
    filepath_or_buffer="/Users/macbook/Documents/BTP/Notebook/BTP/ricep.csv"
)

Unnamed: 0 State_Name ind_district Crop_Year Season Crop Area Production phosphorus X1 X2 X3 X4
0 15 Andhra Pradesh anantapur 1999 kharif Rice 37991.0 105082.0 0.0 96800.0 75400.0 643.720 881.473
1 16 Andhra Pradesh anantapur 2000 kharif Rice 39905.0 117680.0 0.0 105082.0 96800.0 767.351 643.720
2 17 Andhra Pradesh anantapur 2001 kharif Rice 32878.0 95609.0 0.0 117680.0 105082.0 579.338 767.351
3 18 Andhra Pradesh anantapur 2002 kharif Rice 29066.0 66329.0 0.0 95609.0 117680.0 540.070 579.338
4 21 Andhra Pradesh anantapur 2005 kharif Rice 25008.0 69972.0 0.0 85051.0 44891.0 819.700 564.500

# Drop the leftover "Unnamed: 0" column that came along with the saved CSV.
ricep = ricep.drop(["Unnamed: 0"], axis=1)
ricep["phosphorus"] = ricep["phosphorus"] * 10

# Regression target: Production per unit Area.
ricep["value"] = ricep["Production"] / ricep["Area"]

# Take an explicit copy: the z-score columns are added to X below, and assigning
# new columns to a bare column selection of `ricep` triggers pandas'
# SettingWithCopyWarning (ambiguous whether a view or a copy is being modified).
X = ricep[["X1", "X2", "X3", "X4", "phosphorus"]].copy()
y = ricep[["value"]] * 1000  # stays a one-column DataFrame

# Z-score normalization (population std, ddof=0).
# Alternative to try: let scikit-learn normalize internally via the estimator's
# `normalize` flag -- NOTE(review): confirm the installed version still has it.
cols = list(X.columns)
for col in cols:
    col_zscore = col + '_zscore'
    X[col_zscore] = (X[col] - X[col].mean()) / X[col].std(ddof=0)

# Model inputs: only the standardized columns, in the original feature order.
# X itself keeps both raw and standardized columns; later cells select from it.
X_ = X[[name + "_zscore" for name in cols]]

X1_zscore X2_zscore X3_zscore X4_zscore phosphorus_zscore
0 -0.285176 -0.374714 -0.457800 0.021735 -0.837691
1 -0.247120 -0.276111 -0.198113 -0.496827 -0.837691
2 -0.189232 -0.237950 -0.593035 -0.227176 -0.837691
3 -0.290648 -0.179903 -0.675518 -0.637250 -0.837691
4 -0.339162 -0.515288 -0.088153 -0.669613 -0.837691

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
# NOTE(review): the cross-validation cells visible in this part of the notebook
# score X_ / y directly rather than these splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_,
    y,
    random_state=1,
    test_size=0.2,
)

First, check the average RMSE for Linear Regression

# 5-fold cross-validation of plain linear regression on the standardized features.
clf = LinearRegression()
scores = cross_val_score(clf, X_, y, cv=5, scoring='neg_mean_squared_error')

# The scorer returns negated MSE (one value per fold); flip the sign and take the
# root to get per-fold RMSE. Done on the whole array at once so the conversion
# cannot fall out of step with the `cv` fold count.
scores = np.sqrt(-scores)

avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)

[ 1030.92314374  1109.37929379   972.36266895  1487.52744177   491.48595541]

Avg RMSE is  1018.33570073
Epsilon-Support Vector Regression (SVR)

RBF Kernel

# 5-fold CV of an RBF-kernel SVR on all standardized features, to get the avg RMSE.
clf = SVR(C=500000.0, epsilon=0.1, kernel='rbf', gamma=0.0008)
# SVR expects a 1-D target, hence the ravel of the one-column DataFrame.
scores = cross_val_score(clf, X_, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')

# The scorer returns negated MSE (one value per fold); flip the sign and take the
# root to get per-fold RMSE. Done on the whole array at once so the conversion
# cannot fall out of step with the `cv` fold count.
scores = np.sqrt(-scores)

avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)

[  904.09013921   940.99998887   981.97853142  1616.00179024   568.93419484]

Avg RMSE is  1002.40092892

# Just the 4 original features (no soil data, i.e. without phosphorus).
X_old = X[["X1_zscore", "X2_zscore", "X3_zscore", "X4_zscore"]]

# 5-fold CV of an RBF-kernel SVR, to get the avg RMSE.
clf = SVR(C=1000.0, epsilon=0.1, kernel='rbf', gamma=0.027)
# SVR expects a 1-D target, hence the ravel of the one-column DataFrame.
scores = cross_val_score(clf, X_old, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')

# The scorer returns negated MSE (one value per fold); flip the sign and take the
# root to get per-fold RMSE. Done on the whole array at once so the conversion
# cannot fall out of step with the `cv` fold count.
scores = np.sqrt(-scores)

avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)

[  903.93008696   753.88394413   765.69751566  1574.251674     636.95214188]

Avg RMSE is  926.943072526

SVR (RBF kernel, 4 features, no soil data) : 927

LR (all 5 features) : 1018

SVR (RBF kernel) works better than Linear Regression.

Also, the soil feature (phosphorus content), for now, does more harm than good

Let's check the importance of the rain data

# Just 2 features (no rain data, i.e. without X3 and X4).
X_nr = X[["X1_zscore", "X2_zscore"]]

# 5-fold CV of an RBF-kernel SVR, to get the avg RMSE.
clf = SVR(C=1000.0, epsilon=0.1, kernel='rbf', gamma=0.027)
# SVR expects a 1-D target, hence the ravel of the one-column DataFrame.
scores = cross_val_score(clf, X_nr, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')

# The scorer returns negated MSE (one value per fold); flip the sign and take the
# root to get per-fold RMSE. Done on the whole array at once so the conversion
# cannot fall out of step with the `cv` fold count.
scores = np.sqrt(-scores)

avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)

[ 1039.57563055   863.77364865   944.40471     1492.31174906   672.96822263]

Avg RMSE is  1002.60679218

The rain data does help us (without it, the average RMSE rises from about 927 to about 1003)

Let's try SVR with other kernels ...

Degree 3 Polynomial

# 5-fold CV of a degree-3 polynomial-kernel SVR on the 4 features in X_old.
clf = SVR(kernel='poly', gamma='auto', degree=3, coef0=2)
# SVR expects a 1-D target, hence the ravel of the one-column DataFrame.
scores = cross_val_score(clf, X_old, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')

# The scorer returns negated MSE (one value per fold); flip the sign and take the
# root to get per-fold RMSE. Done on the whole array at once so the conversion
# cannot fall out of step with the `cv` fold count.
scores = np.sqrt(-scores)

avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)

[  906.20976415   837.77643762  1049.76326739  1568.88777167   504.49443066]

Avg RMSE is  973.426334297

Polynomial Kernel also does better than Linear Regression

Degree 4 Polynomial

# 5-fold CV of a degree-4 polynomial-kernel SVR on the 4 features in X_old.
clf = SVR(kernel='poly', gamma='auto', degree=4, coef0=2)
# SVR expects a 1-D target, hence the ravel of the one-column DataFrame.
scores = cross_val_score(clf, X_old, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')

# The scorer returns negated MSE (one value per fold); flip the sign and take the
# root to get per-fold RMSE. Done on the whole array at once so the conversion
# cannot fall out of step with the `cv` fold count.
scores = np.sqrt(-scores)

avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)

[  907.10874357   787.20784909   848.64917648  1570.06140194   557.83575489]

Avg RMSE is  934.172585194

