In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import pprint
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.svm import SVR
%matplotlib inline
In [2]:
# Load the dataset that was prepared and saved by the Baseline 1 notebook.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact file exists.
RICEP_CSV = "/Users/macbook/Documents/BTP/Notebook/BTP/ricep.csv"
ricep = pd.read_csv(RICEP_CSV)
ricep.head()  # preview the first rows
Out[2]:
In [3]:
# Remove the "Unnamed: 0" column (presumably a stray index column written by an
# earlier to_csv -- TODO confirm) and rescale phosphorus by a factor of 10
# (reason for the factor is not shown here -- TODO confirm units).
# NOTE(review): re-running this cell on the already-processed frame fails,
# because the dropped column no longer exists.
ricep = ricep.drop(columns=["Unnamed: 0"])
ricep["phosphorus"] = 10 * ricep["phosphorus"]
In [4]:
# Derive the regression target column: Production divided by Area, row by row
# (presumably yield per unit area -- TODO confirm units).
ricep["value"] = ricep["Production"].div(ricep["Area"])
In [5]:
# Feature matrix: the four X1..X4 predictors plus phosphorus.
# .copy() gives X its own data, so adding columns to X in a later cell does not
# raise pandas' SettingWithCopyWarning (X would otherwise be a slice of ricep).
X = ricep[["X1", "X2", "X3", "X4", "phosphorus"]].copy()
# Target kept as a one-column DataFrame, scaled by 1000
# (presumably a unit conversion -- TODO confirm).
y = ricep[["value"]] * 1000
In [6]:
# Z-score normalisation: for every raw feature add a "<name>_zscore" column
# holding (x - mean) / std, using the population standard deviation (ddof=0).
# Original author's note: alternatively, try scikit-learn's own normalisation.
# NOTE(review): mean and std are computed over the whole dataset, i.e. before
# any cross-validation split is made.
#
# Only raw (non-"_zscore") columns are normalised and X is rebuilt as an
# independent copy, so the cell can be re-run without stacking
# "_zscore_zscore" columns and without SettingWithCopyWarning.
cols = [c for c in X.columns if not c.endswith('_zscore')]
X = X[cols].copy()
for col in cols:
    col_zscore = col + '_zscore'
    X[col_zscore] = (X[col] - X[col].mean()) / X[col].std(ddof=0)
In [7]:
# Keep only the standardised columns as the model inputs.
zscore_cols = [name + "_zscore" for name in ("X1", "X2", "X3", "X4", "phosphorus")]
X_ = X[zscore_cols]
X_.head()  # preview the normalised features
Out[7]:
In [8]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the
# split is repeatable.
# NOTE(review): the cells visible in this section evaluate with cross_val_score
# on the full X_/y and do not reference these splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_,
    y,
    test_size=0.2,
    random_state=1,
)
In [9]:
# 5-fold cross-validated RMSE for ordinary least squares on all five
# z-scored features.
clf = LinearRegression()
# The scorer returns the negated MSE of each fold. y is flattened to 1-D so
# this cell calls cross_val_score the same way as the SVR cells.
neg_mse = cross_val_score(clf, X_, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')
# Vectorised RMSE: works for any number of folds, so the fold count is stated
# only once (in cv=...).
scores = np.sqrt(-neg_mse)
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [ ]:
In [10]:
# 5-fold CV for an RBF-kernel SVR on all five z-scored features; the per-fold
# RMSE values are left in `scores`.
clf = SVR(C=500000.0, epsilon=0.1, kernel='rbf', gamma=0.0008)
# The scorer returns the negated MSE of each fold.
neg_mse = cross_val_score(clf, X_, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')
# Vectorised RMSE: works for any number of folds, so the fold count is stated
# only once (in cv=...).
scores = np.sqrt(-neg_mse)
In [11]:
# Show the values currently held in `scores`, then their mean.
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [12]:
# Feature subset without the soil variable: only the four original z-scored
# features (phosphorus_zscore is left out).
X_old = X.loc[:, ["X1_zscore", "X2_zscore", "X3_zscore", "X4_zscore"]]
In [13]:
# 5-fold cross-validated RMSE for an RBF-kernel SVR on the four original
# features (X_old).
clf = SVR(C=1000.0, epsilon=0.1, kernel='rbf', gamma=0.027)
# The scorer returns the negated MSE of each fold.
neg_mse = cross_val_score(clf, X_old, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')
# Vectorised RMSE: works for any number of folds, so the fold count is stated
# only once (in cv=...).
scores = np.sqrt(-neg_mse)
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [14]:
# Two-feature subset: only X1 and X2 (per the original note, this is the
# variant without the rain data).
X_nr = X.loc[:, ["X1_zscore", "X2_zscore"]]
In [15]:
# 5-fold cross-validated RMSE for an RBF-kernel SVR on the two-feature
# subset (X_nr).
clf = SVR(C=1000.0, epsilon=0.1, kernel='rbf', gamma=0.027)
# The scorer returns the negated MSE of each fold.
neg_mse = cross_val_score(clf, X_nr, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')
# Vectorised RMSE: works for any number of folds, so the fold count is stated
# only once (in cv=...).
scores = np.sqrt(-neg_mse)
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [16]:
# 5-fold cross-validated RMSE for a degree-3 polynomial-kernel SVR
# (coef0=2, gamma='auto') on the four original features (X_old).
clf = SVR(kernel='poly', gamma='auto', degree=3, coef0=2)
# The scorer returns the negated MSE of each fold.
neg_mse = cross_val_score(clf, X_old, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')
# Vectorised RMSE: works for any number of folds, so the fold count is stated
# only once (in cv=...).
scores = np.sqrt(-neg_mse)
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [17]:
# 5-fold cross-validated RMSE for a degree-4 polynomial-kernel SVR
# (coef0=2, gamma='auto') on the four original features (X_old).
clf = SVR(kernel='poly', gamma='auto', degree=4, coef0=2)
# The scorer returns the negated MSE of each fold.
neg_mse = cross_val_score(clf, X_old, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')
# Vectorised RMSE: works for any number of folds, so the fold count is stated
# only once (in cv=...).
scores = np.sqrt(-neg_mse)
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [ ]: