In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import pprint
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.svm import SVR
%matplotlib inline
In [2]:
# Load the dataset prepared and saved by the Data Cleaning notebook.
# NOTE(review): hardcoded absolute path — breaks on any other machine; prefer a configurable DATA_DIR.
rice = pd.read_csv("/Users/macbook/Documents/BTP/Notebook/BTP/Satellite/Rice_Ready.csv")
rice = rice.drop(columns=["Unnamed: 0"])  # drop the CSV's stray index column
rice.head()
Out[2]:
In [3]:
# All column names of the cleaned frame.
colss = rice.columns.tolist()
In [4]:
# Display the number of columns (sanity check on the loaded frame).
len(colss)
Out[4]:
In [5]:
# Keep columns 8..225 as candidate features.
# NOTE(review): magic slice bounds — presumably skips leading ID/metadata columns; confirm against the CSV layout.
select = colss[8:226]
In [6]:
# Feature matrix and target.
X = rice[select]
# Target scaled by 1000 (presumably a unit conversion, e.g. thousand tonnes -> tonnes — TODO confirm).
y = rice["Value"]*1000
In [7]:
# Summary statistics of the raw (un-normalised) features.
X.describe()
Out[7]:
In [8]:
# Z-Score Normalization: append a "<col>_zscore" column for every feature.
# Copy first: X is a slice of `rice`, and writing new columns into a slice
# triggers pandas' SettingWithCopyWarning and may silently fail to write.
X = X.copy()
for col in list(X.columns):  # snapshot the names so the new _zscore columns aren't re-iterated
    X[col + '_zscore'] = (X[col] - X[col].mean()) / X[col].std(ddof=0)
In [9]:
# Column list now holds both the raw features and their _zscore copies.
cols = list(X.columns.values)
len(cols)  # expect twice the original feature count (one _zscore column added per feature)
Out[9]:
In [32]:
# Contains all the features (Last 2 years Crop Production and the Satellite Data)
# cols[218:436] are the z-scored copies appended by the normalisation cell above.
select = cols[218:436]
X_all = X[select]
# Just the last 2 years Crop Production
# (first two z-scored columns — presumably the lagged production features; TODO confirm ordering)
select1 = cols[218:220]
X_crp = X[select1]
# Just the Satellite Data
select2 = cols[220:436]
X_sat = X[select2]
In [11]:
# Linear-regression baseline: 5-fold CV RMSE on the full feature set.
clf = LinearRegression()
scores = cross_val_score(clf, X_all, y, cv=5, scoring='neg_mean_squared_error')
scores = np.sqrt(-scores)  # per-fold neg-MSE -> per-fold RMSE
print(scores)
avg_rmse = scores.mean()
print("\nAvg RMSE is "+str(avg_rmse))
In [12]:
# 5-fold CV average RMSE: RBF SVR on all features (C=8000, gamma=0.001).
clf = SVR(C=8000.0, epsilon=0.1, kernel='rbf', gamma=0.001)
scores = np.sqrt(-cross_val_score(clf, X_all, y.values.ravel(), cv=5, scoring='neg_mean_squared_error'))
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [13]:
# 5-fold CV average RMSE: RBF SVR on all features (C=2000, gamma=0.008).
clf = SVR(C=2000.0, epsilon=0.1, kernel='rbf', gamma=0.008)
scores = np.sqrt(-cross_val_score(clf, X_all, y.values.ravel(), cv=5, scoring='neg_mean_squared_error'))
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [14]:
# 5-fold CV average RMSE: RBF SVR on all features (very large C, tiny gamma).
clf = SVR(C=1000000.0, epsilon=0.1, kernel='rbf', gamma=0.000001)
scores = np.sqrt(-cross_val_score(clf, X_all, y.values.ravel(), cv=5, scoring='neg_mean_squared_error'))
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [15]:
# Linear regression on the crop-production-only features, 5-fold CV RMSE.
clf = LinearRegression()
scores = np.sqrt(-cross_val_score(clf, X_crp, y, cv=5, scoring='neg_mean_squared_error'))
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [16]:
# 5-fold CV average RMSE: RBF SVR on crop-production features (C=500000, gamma=0.0008).
clf = SVR(C=500000.0, epsilon=0.1, kernel='rbf', gamma=0.0008)
scores = np.sqrt(-cross_val_score(clf, X_crp, y.values.ravel(), cv=5, scoring='neg_mean_squared_error'))
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [17]:
# 5-fold CV average RMSE: RBF SVR on crop-production features (C=8000, gamma=0.01).
clf = SVR(C=8000.0, epsilon=0.1, kernel='rbf', gamma=0.01)
scores = np.sqrt(-cross_val_score(clf, X_crp, y.values.ravel(), cv=5, scoring='neg_mean_squared_error'))
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [ ]:
In [ ]:
In [18]:
# Linear regression on satellite-only features, 5-fold CV RMSE.
clf = LinearRegression()
scores = np.sqrt(-cross_val_score(clf, X_sat, y, cv=5, scoring='neg_mean_squared_error'))
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [19]:
# 5-fold CV average RMSE: RBF SVR on satellite-only features (C=1000, gamma=0.01).
clf = SVR(C=1000.0, epsilon=0.1, kernel='rbf', gamma=0.01)
scores = np.sqrt(-cross_val_score(clf, X_sat, y.values.ravel(), cv=5, scoring='neg_mean_squared_error'))
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [20]:
from sklearn.decomposition import PCA
In [33]:
def fPCA(x, features=None, target=None):
    """Project features onto `x` principal components, then print the
    5-fold CV RMSE of an RBF SVR (C=2000, gamma=0.008) on the projection.

    Parameters
    ----------
    x : int
        Number of PCA components to keep.
    features : DataFrame, optional
        Feature matrix; defaults to the notebook-global ``X_all``
        (kept for backward compatibility with existing calls).
    target : Series, optional
        Target values; defaults to the notebook-global ``y``.
    """
    if features is None:
        features = X_all
    if target is None:
        target = y
    pca = PCA(n_components=x)
    pcax = pca.fit_transform(features)
    # 5-fold CV, to calculate avg RMSE
    clf = SVR(C=2000.0, epsilon=0.1, kernel='rbf', gamma=0.008)
    scores = cross_val_score(clf, pcax, target.values.ravel(), cv=5,
                             scoring='neg_mean_squared_error')
    scores = np.sqrt(-scores)  # per-fold neg-MSE -> RMSE
    print(scores)
    avg_rmse = scores.mean()
    print("\n\nAvg RMSE is ", avg_rmse)
In [34]:
# Evaluate the SVR pipeline with 42 principal components.
fPCA(42)
In [35]:
import xgboost as xgb
In [36]:
# prepare dict of params for xgboost to run with
xgb_params = {
'n_trees': 500,            # NOTE(review): not a core xgb.cv/train parameter — likely ignored by XGBoost
'eta': 0.005,              # learning rate
'max_depth': 4,
'subsample': 0.95,
'objective': 'reg:linear', # NOTE(review): renamed to 'reg:squarederror' in XGBoost >= 0.90 (deprecated alias)
'eval_metric': 'rmse',
'base_score': np.mean(y), # base prediction = mean(target)
'silent': 1                # NOTE(review): deprecated in newer XGBoost in favour of 'verbosity'
}
In [63]:
# form DMatrices for Xgboost training
# NOTE(review): this DMatrix holds the FULL dataset — any rows later used as a
# "test" set were seen during training (see the leakage note two cells down).
dtrain = xgb.DMatrix(X_all, y)
# xgboost, cross-validation
cv_result = xgb.cv(xgb_params,
dtrain,
num_boost_round=900, # increase to have better results (~700)
early_stopping_rounds=50,
verbose_eval=50,
show_stdv=False
)
# Rows in the CV result = boosting rounds kept after early stopping.
num_boost_rounds = len(cv_result)
print(num_boost_rounds)
In [64]:
# train model
# NOTE(review): trained on dtrain built from ALL rows, not a train split.
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
In [65]:
from sklearn.metrics import r2_score
# In-sample R^2: the model is evaluated on its own training data, so this is optimistic.
print(r2_score(dtrain.get_label(), model.predict(dtrain)))
In [66]:
# NOTE(review): data leakage — `model` above was trained on ALL of X_all, so the
# X_test rows drawn here were part of its training data; this RMSE is optimistically biased.
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=1)
dtest = xgb.DMatrix(X_test)
y_predict = model.predict(dtest)
rmse = sqrt(mean_squared_error(y_predict, y_test))
print(rmse)
In [115]:
# Fresh 80/20 split (fixed seed) for a leakage-free XGBoost evaluation.
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=1936)
In [116]:
# DMatrix built from the TRAINING split only (no leakage into the test rows).
dtrain = xgb.DMatrix(X_train, y_train)
# xgboost, cross-validation
cv_result = xgb.cv(xgb_params,
dtrain,
num_boost_round=700, # increase to have better results (~700)
early_stopping_rounds=50,
verbose_eval=50,
show_stdv=False
)
# Rows in the CV result = boosting rounds kept after early stopping.
num_boost_rounds = len(cv_result)
print(num_boost_rounds)
In [117]:
# train model on the training split, using the round count chosen by CV
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
# Training-set R^2 (in-sample fit quality).
print(r2_score(dtrain.get_label(), model.predict(dtrain)))
In [118]:
# Held-out RMSE on the 20% test split.
dtest = xgb.DMatrix(X_test)
y_predict = model.predict(dtest)
rmse = sqrt(mean_squared_error(y_test, y_predict))  # MSE is symmetric in its arguments
print(rmse)
In [120]:
# Average held-out RMSE over the six random splits tried above.
fold_rmses = [481.67, 542.952, 577.985, 544.2265, 589.76, 494.5]
avg_rmse = sum(fold_rmses) / len(fold_rmses)
avg_rmse
Out[120]:
In [129]:
# Side-by-side table of predictions vs. actuals, rounded to 1 decimal place.
yt = y_test.to_numpy()  # Series.as_matrix() was removed in pandas 1.0
p = pd.DataFrame()
p["y_predicted"] = y_predict
p["y_test"] = yt
p["y_predicted"] = p["y_predicted"].round(decimals=1)
p["y_test"] = p["y_test"].round(decimals=1)
print(p)  # was Python-2 `print p`, a SyntaxError on the Python 3 kernel used elsewhere in this notebook
In [130]:
# 80/20 split using ONLY the crop-production features.
X_train, X_test, y_train, y_test = train_test_split(X_crp, y, test_size=0.2, random_state=1)
In [131]:
# DMatrix over the crop-production training split only.
dtrain = xgb.DMatrix(X_train, y_train)
# xgboost, cross-validation
cv_result = xgb.cv(xgb_params,
dtrain,
num_boost_round=700, # increase to have better results (~700)
early_stopping_rounds=50,
verbose_eval=50,
show_stdv=False
)
# Rows in the CV result = boosting rounds kept after early stopping.
num_boost_rounds = len(cv_result)
print(num_boost_rounds)
In [132]:
# train model on the crop-production split
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
# Training-set R^2 (in-sample fit quality).
print(r2_score(dtrain.get_label(), model.predict(dtrain)))
In [133]:
# Held-out RMSE for the crop-production-only XGBoost model.
dtest = xgb.DMatrix(X_test)
y_predict2 = model.predict(dtest)
rmse = sqrt(mean_squared_error(y_test, y_predict2))
print(rmse)
In [ ]:
In [55]:
# Single fixed-seed train/test evaluation of the RBF SVR on crop-production features.
X_train, X_test, y_train, y_test = train_test_split(X_crp, y, test_size=0.2, random_state=1)
clf = SVR(C=8000.0, epsilon=0.1, kernel='rbf', gamma=0.01)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
mse = mean_squared_error(y_test, y_predict)
rmse = sqrt(mse)
print(rmse)
In [56]:
# Summary statistics of predictions vs. actuals (rounded to 1 decimal place).
yt = y_test.to_numpy()  # Series.as_matrix() was removed in pandas 1.0
p = pd.DataFrame()
p["y_predicted"] = y_predict
p["y_test"] = yt
p["y_predicted"] = p["y_predicted"].round(decimals=1)
p["y_test"] = p["y_test"].round(decimals=1)
p.describe()
Out[56]:
In [150]:
# print (p)
In [179]:
# Polynomial-kernel SVR (degree 4, coef0=6) on all features: 5-fold CV RMSE.
clf = SVR(kernel='poly', gamma='auto', degree=4, coef0=6)
scores = np.sqrt(-cross_val_score(clf, X_all, y.values.ravel(), cv=5, scoring='neg_mean_squared_error'))
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [192]:
# Polynomial-kernel SVR (degree 3, coef0=6) on crop-production features: 5-fold CV RMSE.
clf = SVR(kernel='poly', gamma='auto', degree=3, coef0=6)
scores = np.sqrt(-cross_val_score(clf, X_crp, y.values.ravel(), cv=5, scoring='neg_mean_squared_error'))
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ", avg_rmse)
In [218]:
from sklearn import linear_model
# Ridge regression with built-in CV over a hand-picked alpha grid (all features).
reg = linear_model.RidgeCV(alphas=[1,2,3,4,5,6,7,7.1,7.2,7.3,8,9,100,120,125,130,140,150])
reg.fit(X_all, y)
# Best regularisation strength selected by CV.
reg.alpha_
Out[218]:
In [225]:
# Ridge with the CV-selected alpha (130), evaluated on a fixed 80/20 split of all features.
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=50)
reg = linear_model.Ridge(alpha=130)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(rmse)
Avg RMSE is 700
In [232]:
from sklearn import linear_model
# Ridge CV over a wider alpha grid, crop-production features only.
reg = linear_model.RidgeCV(alphas=[0.1,0.9,1,1.5,2,3,4,5,6,7,7.1,7.2,7.3,8,9,100,120,125,130,140,150])
reg.fit(X_crp, y)
# Best regularisation strength selected by CV.
reg.alpha_
Out[232]:
In [233]:
X_train, X_test, y_train, y_test = train_test_split(X_crp, y, test_size=0.2, random_state=50)
reg = linear_model.Ridge(alpha = 1)
reg.fit (X_train, y_train)
y_pred = reg.predict(X_test)
rmse = sqrt(mean_squared_error(y_pred, y_test))
print(rmse)
In [ ]:
In [ ]:
In [258]:
# data to plot
n_groups = 4
rmse_crp = (800, 750, 750, 750)
rmse_sat = (700, 830, 830, 540)
# create plot
# fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.35
opacity = 0.8
plt.figure(figsize=(15,10))
rects1 = plt.bar(index, rmse_crp, bar_width,
alpha=opacity,
color='b',
label='Without Satellite Data')
rects2 = plt.bar(index + bar_width, rmse_sat, bar_width,
alpha=opacity,
color='g',
label='With Satellite Data')
plt.xlabel('ML Algorithms', fontsize=35)
plt.ylabel('RMSE', fontsize=35)
# plt.title('Effect of Satellite Data', fontsize=45)
plt.xticks(index + bar_width/2, ('Linear Regression \n (Ridge, Lasso)', 'SVR (Gaussian Kernel)', 'SVR (Polynomial)', 'XGBoost'), fontsize=17)
plt.legend(fontsize=15)
# plt.tight_layout()
plt.show()
In [ ]:
In [259]:
# Fit PCA with 42 components on the full feature set and project it.
pca = PCA(n_components=42)
pcax = pca.fit_transform(X_all)
In [268]:
# 80/20 split of the PCA-projected features.
# random_state added so the split — and the RMSE computed below — is reproducible on re-run
# (the original call had no seed, giving a different split every execution).
X_train, X_test, y_train, y_test = train_test_split(pcax, y, test_size=0.2, random_state=1)
In [269]:
# DMatrix over the PCA-projected training split.
dtrain = xgb.DMatrix(X_train, y_train)
# xgboost, cross-validation
cv_result = xgb.cv(xgb_params,
dtrain,
num_boost_round=700, # increase to have better results (~700)
early_stopping_rounds=50,
verbose_eval=50,
show_stdv=False
)
# Rows in the CV result = boosting rounds kept after early stopping.
num_boost_rounds = len(cv_result)
print(num_boost_rounds)
In [270]:
# train model on the PCA-projected training split
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
# Training-set R^2 (in-sample fit quality).
print(r2_score(dtrain.get_label(), model.predict(dtrain)))
In [271]:
# Held-out RMSE for the PCA + XGBoost pipeline.
dtest = xgb.DMatrix(X_test)
y_predict = model.predict(dtest)
rmse = sqrt(mean_squared_error(y_test, y_predict))
print(rmse)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [44]:
# # y_pred = model.predict(dtest)
# # 5 Fold CV, to calculate avg RMSE
# # clf = SVR(C=1000.0, epsilon=0.1, kernel='rbf', gamma=0.01)
# scores = cross_val_score(model, X_all, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')
# for i in range(0,5):
# scores[i] = sqrt(-1*scores[i])
# print(scores)
# avg_rmse = scores.mean()
# print("\n\nAvg RMSE is ",scores.mean())
In [ ]: