Importing necessary libraries
In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import pprint
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn import metrics
%matplotlib inline
Importing the required datasets
In [2]:
rice = pd.read_csv("/Users/macbook/Documents/BTP/Notebook/Rice.csv")
rice.head()
Out[2]:
In [3]:
rice_haryana = rice[rice["State_Name"]=="Haryana"]
rice_haryana.head()
Out[3]:
In [4]:
rainfall = pd.read_csv("/Users/macbook/Documents/BTP/Notebook/rainfall.csv")
rainfall.head()
Out[4]:
In [5]:
rain_haryana = rainfall[rainfall["State"]=="Haryana"]
print(rain_haryana.head())
print(rain_haryana.describe())
In [6]:
X_hr = pd.read_csv("/Users/macbook/Documents/BTP/Notebook/haryana.csv")
X_hr.head()
Out[6]:
Cleaning and preparing the datasets
In [7]:
X_finite = X_hr[np.isfinite(X_hr["X1"])]
X_finite = X_finite[np.isfinite(X_finite["X2"])]
X_finite = X_finite[np.isfinite(X_finite["X3"])]
X_finite = X_finite[np.isfinite(X_finite["X4"])]
X_finite = X_finite[np.isfinite(X_finite["Y"])]
X_finite.head()
Out[7]:
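The chained isfinite filters above can be written as a single dropna call; a minimal equivalent sketch (note that dropna removes NaN only, while np.isfinite would also drop infinities):

X_finite = X_hr.dropna(subset=["X1", "X2", "X3", "X4", "Y"])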
In [8]:
Xn = X_finite
Xn.describe()
Out[8]:
In [9]:
y = Xn["Y"]
X = Xn[["X1", "X2", "X3", "X4"]]
In [10]:
plt.figure(figsize=(9, 5))
plt.hist(y, bins=30)
plt.xlabel('Production Value',fontsize=15)
plt.ylabel('Occurrences',fontsize=15)
plt.title('Distribution of the Rice Production Values',fontsize=18)
Out[10]:
In [11]:
Xplot = Xn[["X1", "X2", "X3", "X4","Y"]]
var_name = "X1"
plt.figure(figsize=(10,6))
sns.regplot(x=var_name, y='Y', data=Xplot, scatter_kws={'alpha':0.6, 's':20})
plt.xlabel(var_name + " (Crop Produce of Last Year)", fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.title("Distribution of y variable with feature "+var_name, fontsize=18)
plt.show()
In [12]:
var_name = "X2"
plt.figure(figsize=(10,6))
sns.regplot(x=var_name, y='Y', data=Xplot, scatter_kws={'alpha':0.6, 's':20})
plt.xlabel(var_name + " (Crop Produce of Last to Last Year)", fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.title("Distribution of y variable with feature "+var_name, fontsize=18)
plt.show()
In [13]:
var_name = "X3"
plt.figure(figsize=(10,6))
sns.regplot(x=var_name, y='Y', data=Xplot, scatter_kws={'alpha':0.6, 's':20})
plt.xlabel(var_name + " (Rainfall of Present Year)", fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.title("Distribution of y variable with feature "+var_name, fontsize=18)
plt.show()
In [14]:
var_name = "X4"
plt.figure(figsize=(10,6))
sns.regplot(x=var_name, y='Y', data=Xplot, scatter_kws={'alpha':0.6, 's':20})
plt.xlabel(var_name + " (Rainfall of Last Year)", fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.title("Distribution of y variable with feature "+var_name, fontsize=18)
plt.show()
In [15]:
# Z-Score Normalization
cols = list(X.columns)
for col in cols:
    col_zscore = col + '_zscore'
    X[col_zscore] = (X[col] - X[col].mean())/X[col].std(ddof=0)
X = X[["X1_zscore", "X2_zscore", "X3_zscore", "X4_zscore"]]
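The same z-score transform is available as scikit-learn's StandardScaler, which likewise divides by the population standard deviation (ddof=0); a minimal sketch using the unscaled frame Xn from above:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# fit_transform returns a NumPy array; rewrap it to keep column labels and index
X_scaled = pd.DataFrame(scaler.fit_transform(Xn[["X1", "X2", "X3", "X4"]]),
                        columns=["X1_zscore", "X2_zscore", "X3_zscore", "X4_zscore"],
                        index=Xn.index)

In a stricter setup the scaler would be fit on the training split only and reused on the test split, to keep test-set statistics out of the features.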
In [16]:
X.head()
Out[16]:
Randomly splitting the dataset into training and testing sets
In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
First baseline using Linear Regression
In [18]:
alg = LinearRegression()
alg.fit(X_train, y_train)
Out[18]:
In [19]:
coef = alg.coef_
coef = coef.round(decimals=2)
np.set_printoptions(suppress=True)  # suppress scientific notation when printing arrays
print("The coefficients for the linear regression model learnt are\n")
print(coef)
print()
In [20]:
y_predict = alg.predict(X_test)
rmse = sqrt(mean_squared_error(y_predict, y_test))
print(rmse)
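scikit-learn 0.22+ can return the RMSE directly via a squared=False flag (superseded by metrics.root_mean_squared_error in 1.4+); an equivalent sketch:

rmse = mean_squared_error(y_test, y_predict, squared=False)
print(rmse)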
In [21]:
clf = LinearRegression()
scores = cross_val_score(clf, X, y, cv=5, scoring='neg_mean_squared_error')
In [22]:
for i in range(0,5):
    scores[i] = sqrt(-1*scores[i])
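The loop can be collapsed into one vectorized call, and scikit-learn 0.22+ also offers a root-mean-squared-error scorer that skips the conversion entirely. A sketch:

scores = np.sqrt(-scores)  # vectorized equivalent of the loop above
# or, with scikit-learn >= 0.22:
# scores = -cross_val_score(clf, X, y, cv=5, scoring='neg_root_mean_squared_error')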
In [23]:
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ",scores.mean())
In [24]:
# print(type(y_test))
# print(type(y_predict))
yt = y_test.to_numpy()  # .as_matrix() was removed in pandas 1.0
print(type(yt))
In [25]:
p = pd.DataFrame()
p["y_predicted"] = y_predict/1000
p["y_test"] = yt/1000
p["y_predicted"] = p["y_predicted"].round(decimals=1)
# p["y_test"] = p["y_test"].round(decimals=1)
p.describe()
Out[25]:
In [26]:
print(p)
In [27]:
rmse/1000
Out[27]:
In [28]:
rain1 = rainfall
rain2 = pd.read_csv("/Users/macbook/Documents/BTP/Notebook/rainfall_distt_2004-10_nax.csv")
In [29]:
print(rice.describe())
print(rain1.describe())
print(rain2.describe())
In [30]:
a = np.full((rice.shape[0],1), np.nan)
rice = rice.assign(X1 = a)
rice = rice.assign(X2 = a)
rice = rice.assign(X3 = a)
rice = rice.assign(X4 = a)
rice.head()
Out[30]:
Constructing the features X1 and X2: the production of the previous two years.
In [31]:
l = rice.shape[0]
for row in range(0,l):
    # column 1 = district, column 6 = production; X1 and X2 sit at positions 8 and 9
    if row-1<0 or rice.iloc[row,1] != rice.iloc[row-1,1]:
        continue
    else:
        rice.iloc[row,8] = rice.iloc[row-1,6]
    if row-2<0 or rice.iloc[row,1] != rice.iloc[row-2,1]:
        continue
    else:
        rice.iloc[row,9] = rice.iloc[row-2,6]
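A vectorized alternative for these lag features is groupby().shift(); a sketch, assuming each district's rows are in consecutive-year order (the same assumption the loop makes) and that positions 1 and 6 correspond to the 'ind_district' and 'Production' columns used elsewhere in the notebook:

grouped = rice.groupby('ind_district')['Production']
rice['X1'] = grouped.shift(1)  # production of the previous year
rice['X2'] = grouped.shift(2)  # production of two years back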
In [32]:
rice.head()
Out[32]:
In [33]:
def func(s):
    x = s.strip()
    return x.lower()
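pandas can apply the same cleanup through its vectorized string methods, avoiding the Python-level apply(func) calls below; an equivalent one-liner:

rice['ind_district'] = rice['ind_district'].str.strip().str.lower()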
In [34]:
rice['ind_district'] = rice['ind_district'].apply(func)
rice['Season'] = rice['Season'].apply(func)
rain1['ind_district'] = rain1['ind_district'].apply(func)
rain2['ind_district'] = rain2['ind_district'].apply(func)
rice.head()
Out[34]:
In [35]:
rain1.head()
Out[35]:
In [36]:
# the lookup time could be reduced by filtering on only one variable, at least in some cases
rice = rice[rice['Season'] == 'kharif']
l = rice.shape[0]
for row in range(0,l):
    dt = rice.iloc[row,1]
    yr = rice.iloc[row,2]
    if yr<=2002:
        # rainfall for the same year
        r = rain1.loc[(rain1.ind_district == dt) & (rain1.Year == yr)]
        if r.shape[0] == 1:
            rice.iloc[row,10] = r.iloc[0,3]
        # rainfall for the previous year
        r = rain1.loc[(rain1.ind_district == dt) & (rain1.Year == yr-1)]
        if r.shape[0] == 1:
            rice.iloc[row,11] = r.iloc[0,3]
    if yr>2004:
        # rainfall for the same year
        r = rain2.loc[(rain2.ind_district == dt) & (rain2.Year == yr)]
        if r.shape[0] == 1:
            rice.iloc[row,10] = r.iloc[0,3]
        # rainfall for the previous year
        r = rain2.loc[(rain2.ind_district == dt) & (rain2.Year == yr-1)]
        if r.shape[0] == 1:
            rice.iloc[row,11] = r.iloc[0,3]
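Row-by-row lookups like this scale poorly; a left merge can attach both rainfall columns in one pass. A sketch under explicit assumptions: the rainfall value sits in a column here called 'Rainfall' (position 3 above; the files' real header may differ), the year column is named 'Year' in all three frames, and rain1/rain2 have at most one row per (district, year), mirroring the r.shape[0] == 1 guard:

rain_all = pd.concat([rain1, rain2], ignore_index=True)
same_yr = rain_all[['ind_district', 'Year', 'Rainfall']].rename(columns={'Rainfall': 'X3'})
prev_yr = same_yr.rename(columns={'X3': 'X4'}).assign(Year=lambda d: d['Year'] + 1)
rice = (rice.drop(columns=['X3', 'X4'])
            .merge(same_yr, on=['ind_district', 'Year'], how='left')
            .merge(prev_yr, on=['ind_district', 'Year'], how='left'))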
In [37]:
# X1 = prod-1
# X2 = prod-2
# X3 = rain
# X4 = rain-1
rice.describe()
Out[37]:
In [38]:
ricex = rice[np.isfinite(rice["Production"])]
ricex = ricex[np.isfinite(ricex["X1"])]
ricex = ricex[np.isfinite(ricex["X2"])]
ricex = ricex[np.isfinite(ricex["X3"])]
ricex = ricex[np.isfinite(ricex["X4"])]
ricex.head()
Out[38]:
In [46]:
X = ricex[["X1","X2","X3","X4"]]
y = ricex[["Production"]]
ricex.describe()
Out[46]:
In [40]:
plt.figure(figsize=(30, 10))
plt.hist(y, bins=250)
plt.xlabel('Production Value',fontsize=25)
plt.ylabel('Occurrences',fontsize=25)
plt.title('Distribution of the Rice Production Values',fontsize=30)
Out[40]:
In [41]:
Xplot = ricex[["X1", "X2", "X3", "X4","Production"]]
var_name = "X1"
plt.figure(figsize=(10,6))
sns.regplot(x=var_name, y='Production', data=Xplot, scatter_kws={'alpha':0.6, 's':20})
plt.xlabel(var_name + " (Crop Produce of Last Year)", fontsize=15)
plt.ylabel('Production', fontsize=15)
plt.title("Distribution of y variable with feature "+var_name, fontsize=18)
plt.show()
In [42]:
Xplot = ricex[["X1", "X2", "X3", "X4","Production"]]
var_name = "X2"
plt.figure(figsize=(10,6))
sns.regplot(x=var_name, y='Production', data=Xplot, scatter_kws={'alpha':0.6, 's':20})
plt.xlabel(var_name + " (Crop Produce of Last to Last Year)", fontsize=15)
plt.ylabel('Production', fontsize=15)
plt.title("Distribution of y variable with feature "+var_name, fontsize=18)
plt.show()
In [43]:
Xplot = ricex[["X1", "X2", "X3", "X4","Production"]]
var_name = "X3"
plt.figure(figsize=(10,6))
sns.regplot(x=var_name, y='Production', data=Xplot, scatter_kws={'alpha':0.6, 's':20})
plt.xlabel(var_name + " (Rainfall of Present Year)", fontsize=15)
plt.ylabel('Production', fontsize=15)
plt.title("Distribution of y variable with feature "+var_name, fontsize=18)
plt.show()
In [44]:
Xplot = ricex[["X1", "X2", "X3", "X4","Production"]]
var_name = "X4"
plt.figure(figsize=(10,6))
sns.regplot(x=var_name, y='Production', data=Xplot, scatter_kws={'alpha':0.6, 's':20})
plt.xlabel(var_name + " (Rainfall of Last Year)", fontsize=15)
plt.ylabel('Production', fontsize=15)
plt.title("Distribution of y variable with feature "+var_name, fontsize=18)
plt.show()
In [51]:
# Z-Score Normalization
cols = list(X.columns)
for col in cols:
    col_zscore = col + '_zscore'
    X[col_zscore] = (X[col] - X[col].mean())/X[col].std(ddof=0)
In [52]:
X = X[["X1_zscore", "X2_zscore", "X3_zscore", "X4_zscore"]]
In [53]:
X.head()
Out[53]:
In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
In [55]:
alg = LinearRegression()
alg.fit(X_train, y_train)
Out[55]:
In [56]:
coef = alg.coef_
intercept = alg.intercept_
In [57]:
coef = coef.round(decimals=2)
pp = pprint.PrettyPrinter()
pp.pprint(coef)
pp.pprint(intercept)
In [58]:
y_predict = alg.predict(X_test)
In [59]:
yp = y_predict
yt = y_test.to_numpy()
type(y_predict)
Out[59]:
In [60]:
rmse = sqrt(mean_squared_error(y_predict, y_test))
print(rmse)
In [61]:
clf = LinearRegression()
scores = cross_val_score(clf, X, y, cv=5, scoring='neg_mean_squared_error')
In [62]:
for i in range(0,5):
    scores[i] = sqrt(-1*scores[i])
In [63]:
print(scores)
avg_rmse = scores.mean()
print("\n\nAvg RMSE is ",scores.mean())
In [64]:
yt = yt/1000
yp = yp/1000
yt = yt.round(decimals=1)
yp = yp.round(decimals=1)
In [65]:
yo = np.concatenate((yp,yt),axis=1)
In [66]:
p = pd.DataFrame(data=yo,columns=['Predicted','Actual'])
p.describe()
Out[66]:
In [67]:
p
Out[67]:
In [68]:
rmse/1000
Out[68]:
Adding soil phosphorus as a feature. This feature is missing for some states in India, so somewhat fewer samples are available.
In [69]:
# loading the dataset
ricep = pd.read_csv("/Users/macbook/Documents/BTP/Notebook/rice with soil(P).csv")
# Removing the rows with missing value of phosphorus
ricep = ricep[np.isfinite(ricep["phosphorus"])]
In [70]:
# Adding columns for the other 4 features
a = np.full((ricep.shape[0],1), np.nan)
ricep = ricep.assign(X1 = a)
ricep = ricep.assign(X2 = a)
ricep = ricep.assign(X3 = a)
ricep = ricep.assign(X4 = a)
ricep.head()
Out[70]:
In [71]:
# Constructing features X1 and X2
l = ricep.shape[0]
for row in range(0,l):
    if row-1<0 or ricep.iloc[row,1] != ricep.iloc[row-1,1]:
        continue
    else:
        ricep.iloc[row,8] = ricep.iloc[row-1,6]
    if row-2<0 or ricep.iloc[row,1] != ricep.iloc[row-2,1]:
        continue
    else:
        ricep.iloc[row,9] = ricep.iloc[row-2,6]
In [72]:
# Making the strings in this dataset consistent with the other datasets
ricep['ind_district'] = ricep['ind_district'].apply(func)
ricep['Season'] = ricep['Season'].apply(func)
In [73]:
# Constructing features X3 and X4
l = ricep.shape[0]
for row in range(0,l):
    dt = ricep.iloc[row,1]
    yr = ricep.iloc[row,2]
    if yr<=2002:
        # rainfall for the same year
        r = rain1.loc[(rain1.ind_district == dt) & (rain1.Year == yr)]
        if r.shape[0] == 1:
            ricep.iloc[row,10] = r.iloc[0,3]
        # rainfall for the previous year
        r = rain1.loc[(rain1.ind_district == dt) & (rain1.Year == yr-1)]
        if r.shape[0] == 1:
            ricep.iloc[row,11] = r.iloc[0,3]
    if yr>2004:
        # rainfall for the same year
        r = rain2.loc[(rain2.ind_district == dt) & (rain2.Year == yr)]
        if r.shape[0] == 1:
            ricep.iloc[row,10] = r.iloc[0,3]
        # rainfall for the previous year
        r = rain2.loc[(rain2.ind_district == dt) & (rain2.Year == yr-1)]
        if r.shape[0] == 1:
            ricep.iloc[row,11] = r.iloc[0,3]
In [74]:
# Removing rows with any missing values
ricep = ricep[np.isfinite(ricep["Production"])]
ricep = ricep[np.isfinite(ricep["X1"])]
ricep = ricep[np.isfinite(ricep["X2"])]
ricep = ricep[np.isfinite(ricep["X3"])]
ricep = ricep[np.isfinite(ricep["X4"])]
ricep.head()
Out[74]:
In [75]:
ricep['phosphorus'] = ricep['phosphorus'] + 1  # offset by 1, presumably to keep the values strictly positive
ricep.describe()
Out[75]:
In [76]:
X = ricep[["X1","X2","X3","X4","phosphorus"]]
y = ricep[["Production"]]
ricep.to_csv("ricep.csv")
In [77]:
Xplot = ricep[["X1", "X2", "X3", "X4", "phosphorus", "Production"]]
var_name = "phosphorus"
plt.figure(figsize=(10,6))
sns.regplot(x=var_name, y='Production', data=Xplot, scatter_kws={'alpha':0.6, 's':20})
plt.xlabel(var_name + " (in Soil)", fontsize=15)
plt.ylabel('Production', fontsize=15)
plt.title("Distribution of y variable with feature "+var_name, fontsize=18)
plt.show()
In [78]:
# Z-Score Normalization
cols = list(X.columns)
for col in cols:
    col_zscore = col + '_zscore'
    X[col_zscore] = (X[col] - X[col].mean())/X[col].std(ddof=0)
In [79]:
X = X[["X1_zscore", "X2_zscore", "X3_zscore", "X4_zscore", "phosphorus_zscore"]]
X.head()
Out[79]:
In [80]:
X.describe()
Out[80]:
In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
In [82]:
alg = LinearRegression()
alg.fit(X_train, y_train)
Out[82]:
In [83]:
coef = alg.coef_
intercept = alg.intercept_
In [84]:
coef = coef.round(decimals=2)
pp = pprint.PrettyPrinter()
pp.pprint(coef)
pp.pprint(intercept)
In [85]:
y_predict = alg.predict(X_test)
In [86]:
yp = y_predict
yt = y_test.to_numpy()
type(y_predict)
Out[86]:
In [87]:
rmse = sqrt(mean_squared_error(y_predict, y_test))
print(rmse)
In [88]:
clf = LinearRegression()
scores = cross_val_score(clf, X, y, cv=5, scoring='neg_mean_squared_error')
In [89]:
for i in range(0,5):
    scores[i] = sqrt(-1*scores[i])
In [90]:
print(scores)
avg_rmse_phos = scores.mean()
print("\n\nAvg RMSE is ",scores.mean())
In [91]:
yt = yt/1000
yp = yp/1000
yt = yt.round(decimals=1)
yp = yp.round(decimals=1)
In [92]:
yo = np.concatenate((yp,yt),axis=1)
In [93]:
p = pd.DataFrame(data=yo,columns=['Predicted','Actual'])
p.describe()
Out[93]:
In [94]:
p
Out[94]:
In [95]:
rmse/1000
Out[95]:
In [96]:
# Just the 4 original features (no soil data)
X_old = X[["X1_zscore", "X2_zscore", "X3_zscore", "X4_zscore"]]
In [97]:
# The seed is fixed, so the vector y_test is going to be the same as before
X_train, X_test, y_train, y_test = train_test_split(X_old, y, test_size=0.2, random_state=1)
In [98]:
alg = LinearRegression()
alg.fit(X_train, y_train)
Out[98]:
In [99]:
coef = alg.coef_
intercept = alg.intercept_
In [100]:
coef = coef.round(decimals=2)
pp = pprint.PrettyPrinter()
pp.pprint(coef)
pp.pprint(intercept)
In [101]:
y_predict = alg.predict(X_test)
yp = y_predict
yt = y_test.to_numpy()
rmse = sqrt(mean_squared_error(y_predict, y_test))
print(rmse)
In [102]:
clf = LinearRegression()
scores = cross_val_score(clf, X_old, y, cv=5, scoring='neg_mean_squared_error')
In [103]:
for i in range(0,5):
    scores[i] = sqrt(-1*scores[i])
In [104]:
print(scores)
avg_rmse_orig = scores.mean()
print("\n\nAvg RMSE is ",scores.mean())
Avg RMSE with the original 4 features: 51874.9
Avg RMSE with 5 features (soil included): 52272.6
In [105]:
X_no_rain = X[["X1_zscore", "X2_zscore"]]
In [106]:
X_train, X_test, y_train, y_test = train_test_split(X_no_rain, y, test_size=0.2, random_state=1)
In [107]:
alg = LinearRegression()
alg.fit(X_train, y_train)
Out[107]:
In [108]:
coef = alg.coef_
intercept = alg.intercept_
In [109]:
coef = coef.round(decimals=2)
pp = pprint.PrettyPrinter()
pp.pprint(coef)
pp.pprint(intercept)
In [110]:
y_predict = alg.predict(X_test)
yp = y_predict
yt = y_test.to_numpy()
rmse = sqrt(mean_squared_error(y_predict, y_test))
print(rmse)
In [111]:
clf = LinearRegression()
scores = cross_val_score(clf, X_no_rain, y, cv=5, scoring='neg_mean_squared_error')
In [112]:
for i in range(0,5):
    scores[i] = sqrt(-1*scores[i])
In [113]:
print(scores)
avg_rmse_no_rain = scores.mean()
print("\n\nAvg RMSE is ",scores.mean())
In [114]:
from sklearn import linear_model
reg = linear_model.RidgeCV(alphas=[1,2,3,4,5,6,7,7.1,7.2,7.3,8,9,10])
reg.fit(X_old, y)
reg.alpha_
Out[114]:
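The hand-picked alpha list clusters around 7; a log-spaced grid is a common way to scan more broadly before refining. A sketch:

reg = linear_model.RidgeCV(alphas=np.logspace(-2, 3, 60))
reg.fit(X_old, y)
print(reg.alpha_)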
In [115]:
X_train, X_test, y_train, y_test = train_test_split(X_old, y, test_size=0.2, random_state=1)
reg = linear_model.Ridge(alpha = 7.1)
reg.fit(X_train, y_train)
print(reg.coef_)
In [116]:
y_predict = reg.predict(X_test)
rmse = sqrt(mean_squared_error(y_predict, y_test))
print(rmse)
In [117]:
clf = linear_model.Ridge(alpha = 7.1)
scores = cross_val_score(clf, X_old, y, cv=5, scoring='neg_mean_squared_error')
for i in range(0,5):
    scores[i] = sqrt(-1*scores[i])
print(scores)
avg_rmse_ridge = scores.mean()
print("\n\nAvg RMSE is ",scores.mean())
In [118]:
from sklearn import linear_model
reg = linear_model.LassoCV(alphas=[0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1])
reg.fit(X_old, y)
reg.alpha_
Out[118]:
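When alphas is omitted, LassoCV builds its own grid along the regularization path (100 values by default); a sketch:

reg = linear_model.LassoCV(cv=5)
reg.fit(X_old, y)
print(reg.alpha_)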
In [119]:
X_train, X_test, y_train, y_test = train_test_split(X_old, y, test_size=0.2, random_state=1)
las = linear_model.Lasso(alpha = 0.01)
las.fit(X_train, y_train)
print(las.coef_)
In [120]:
y_predict = las.predict(X_test)
rmse = sqrt(mean_squared_error(y_predict, y_test))
print(rmse)
In [121]:
clf = linear_model.Lasso(alpha = 0.01)
scores = cross_val_score(clf, X_old, y, cv=5, scoring='neg_mean_squared_error')
for i in range(0,5):
    scores[i] = sqrt(-1*scores[i])
print(scores)
avg_rmse_las = scores.mean()
print("\n\nAvg RMSE is ",scores.mean())