In [1]:
# For cross-validation: fraction of EXPERIMENT_DATA used for training,
# the remainder is held out as the testing split (see the split cell below).
FRACTION_TRAINING = 0.5
# For keeping output: metrics recorded under human-readable description keys
detailed_output = {}
# dictionary for keeping model: per-location-bucket fitted regressors,
# keyed by the thousands digit of the location code (0..6)
model_dict = {}
In [2]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from matplotlib.colors import ListedColormap
from sklearn import linear_model
%matplotlib inline
plt.style.use("ggplot")
"""
Data Preprocessing
EXPERIMENT_DATA - Contain training data
EVALUATION_DATA - Contain testing data
"""
EXPERIMENT_DATA = pickle.load(open('EXPERIMENT_SET_pandas.pkl', 'rb'))
EVALUATION_SET = pickle.load(open('EVALUATION_SET_pandas.pkl', 'rb'))
# Shuffle Data
EXPERIMENT_DATA = sklearn.utils.shuffle(EXPERIMENT_DATA)
EXPERIMENT_DATA = EXPERIMENT_DATA.reset_index(drop=True)
TRAINING_DATA = EXPERIMENT_DATA[:int(FRACTION_TRAINING*len(EXPERIMENT_DATA))]
TESTING_DATA = EXPERIMENT_DATA[int(FRACTION_TRAINING*len(EXPERIMENT_DATA)):]
# Consider only graduated species
TRAINING_DATA_GRAD = TRAINING_DATA[TRAINING_DATA["GRAD"] == "YES"]
TESTING_DATA_GRAD = TESTING_DATA[TESTING_DATA["GRAD"] == "YES"]
In [3]:
# Report how many graduated rows landed in each split.
print(f"Graduate Training Data size: {len(TRAINING_DATA_GRAD)}")
print(f"Graduate Testing Data size: {len(TESTING_DATA_GRAD)}")
In [4]:
def plotRegression(data):
    """Fit and plot RM -> BAGSOLD linear regressions on two scales.

    First fits/plots BAGSOLD against RM on a linear scale, then repeats with
    log(BAGSOLD). For each fit it prints coefficients/intercept, records the
    MSE in the module-level ``detailed_output`` dict, and renders the figure
    (the linear-scale one is also saved to disk).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain numeric-convertible "RM" and "BAGSOLD" columns.
    """
    plt.figure(figsize=(8, 8))
    ###########################
    # 1st plot: linear scale
    ###########################
    # FIX: np.float was removed in NumPy 1.24 -- use the builtin float.
    bagSold = np.asarray(data["BAGSOLD"]).reshape(-1, 1).astype(float)
    rm = np.asarray(data["RM"]).reshape(-1, 1).astype(float)
    # Dense grid over the observed RM range for drawing the fitted line
    x_axis = np.arange(rm.min(), rm.max(), 0.01)
    # Linear Regression
    regr = linear_model.LinearRegression(fit_intercept=True)
    regr.fit(rm, bagSold)
    bagSold_prediction = regr.predict(rm)
    print("Coefficients = {}".format(regr.coef_))
    print("Intercept = {}".format(regr.intercept_))
    # Find MSE (training error of the fit)
    mse = sklearn.metrics.mean_squared_error(bagSold, bagSold_prediction)
    plt.title('Linear Regression RM vs. Bagsold')
    plt.plot(rm, bagSold, 'ro', label='True Value')
    plt.plot(x_axis, regr.intercept_[0] + regr.coef_[0][0]*x_axis, color="green")
    plt.legend(["true_value", "Regression Line\nMSE = {:e}".format(mse)])
    plt.xlabel("RM")
    plt.ylabel("Bagsold")
    plt.xlim(rm.min(), rm.max())
    detailed_output["MSE of linear regression on entire dataset (linear scale)"] = mse
    plt.savefig("linear_reg_entire_dataset_linearscale.png")
    plt.show()
    #######################
    # 2nd plot: log scale
    #######################
    plt.figure(figsize=(8, 8))
    # Refit against log-transformed targets (assumes BAGSOLD > 0 everywhere)
    bagSold = np.log(bagSold)
    # Linear Regression
    regr = linear_model.LinearRegression()
    regr.fit(rm, bagSold)
    bagSold_prediction = regr.predict(rm)
    print("Coefficients = {}".format(regr.coef_))
    print("Intercept = {}".format(regr.intercept_))
    # Find MSE (note: log-scale MSE, not comparable with the linear-scale one)
    mse = sklearn.metrics.mean_squared_error(bagSold, bagSold_prediction)
    plt.title('Linear Regression RM vs. log of Bagsold')
    plt.plot(rm, bagSold, 'ro', label='True Value')
    plt.plot(x_axis, regr.intercept_[0] + regr.coef_[0][0]*x_axis, color="green")
    plt.legend(["true_value", "Regression Line\nMSE = {:e}".format(mse)])
    plt.xlabel("RM")
    plt.ylabel("log Bagsold")
    plt.xlim(rm.min(), rm.max())
    detailed_output["MSE of linear regression on entire dataset (log scale)"] = mse
    # plt.savefig("linear_reg_entire_dataset_logscale.png")
    plt.show()
In [5]:
# Reference output recorded from a previous run (linear-scale fit):
# Coefficients = [[-14956.36881671]]
# Intercept = [ 794865.84758174]
plotRegression(TRAINING_DATA_GRAD)
In [6]:
# Unique location codes of the graduated training rows, in ascending order.
location_map = sorted(set(TRAINING_DATA_GRAD["LOCATION"]))
In [7]:
# Per-location mean RM and YIELD over the graduated training rows.
list_location = []
list_avg_rm = []
list_avg_yield = []
for val in location_map:
    # BUG FIX: the boolean mask must come from TRAINING_DATA_GRAD itself.
    # The original built it from EXPERIMENT_DATA["LOCATION"], whose index
    # does not align with the filtered TRAINING_DATA_GRAD rows (index
    # misalignment; modern pandas rejects such masks).
    rows = TRAINING_DATA_GRAD[TRAINING_DATA_GRAD["LOCATION"] == str(val)]
    avg_rm = np.average(rows["RM"])
    avg_yield = np.average(rows["YIELD"])
    list_location.append(str(val))
    list_avg_rm.append(avg_rm)
    list_avg_yield.append(avg_yield)
    # print("{} = {},{}".format(val, avg_rm, avg_yield))
In [8]:
# Scatter of per-location averages, each point labelled with its location
# code, colour-coded by the thousands band of the code.
plt.title("Average RM and YIELD for each location")
plt.plot(list_avg_rm, list_avg_yield, 'ro')
# (upper bound, colour) pairs; the first band whose bound holds wins,
# anything above the last bound falls through to purple.
colour_bands = [(1000, "blue"), (2000, "red"), (3000, "green"),
                (4000, "black"), (5000, "orange")]
for idx, label in enumerate(list_location):
    code = int(label)
    band_colour = next(
        (colour for bound, colour in colour_bands if code <= bound),
        "purple",
    )
    plt.annotate(label, (list_avg_rm[idx], list_avg_yield[idx]), color=band_colour)
plt.show()
From the preliminary analysis, we find that the number of different locations in the dataset is 140. Each location in the dataset is encoded as a 4-digit number. We first expected that we could group the quality of the species based on the location parameter. We then plot the average of RM and YIELD for each location, which is shown below:
According to the prior analysis, it appears that we can plausibly categorize species by location. The approach we adopt is to use the first digit of the location number as the category. The histogram in the previous section indicates that there are roughly 7 groups. Notice that the leftmost and rightmost columns appear to be outliers.
In [9]:
# Calculate the number of possible locations
location_set = set(TRAINING_DATA_GRAD["LOCATION"])
print("The number of possible location is {}.".format(len(location_set)))
# One histogram entry per training row. The original rebuilt this with a
# per-location filter inside a loop (O(locations * rows)); a single pass over
# the column produces the same multiset of values.
location_histogram_list = [int(loc) for loc in TRAINING_DATA_GRAD["LOCATION"]]
plt.title("Histogram of each location")
plt.xlabel("Location Number")
plt.ylabel("Amount")
plt.hist(location_histogram_list, bins=7, range=(0,7000))
plt.savefig("location_histogram.png")
plt.show()
In [10]:
# Convert location column to numeric.
# FIX: TRAINING_DATA_GRAD is a slice of TRAINING_DATA, so assigning into it
# triggers SettingWithCopyWarning (and may silently not stick); take an
# explicit copy first. pd.to_numeric applies directly to a Series -- no need
# to route it through .apply().
TRAINING_DATA_GRAD = TRAINING_DATA_GRAD.copy()
TRAINING_DATA_GRAD["LOCATION"] = pd.to_numeric(TRAINING_DATA_GRAD["LOCATION"])
# Separate training dataset into 7 groups, one per thousands band
# ([0,1000), [1000,2000), ..., [6000,7000)).
dataByLocation = []
for i in range(7):
    lower, upper = i * 1000, (i + 1) * 1000
    dataByLocation.append(
        TRAINING_DATA_GRAD[(TRAINING_DATA_GRAD["LOCATION"] >= lower)
                           & (TRAINING_DATA_GRAD["LOCATION"] < upper)])
In [11]:
# Fit one log(BAGSOLD) ~ RM regression per location bucket, store the model
# in model_dict (keyed by bucket index = thousands digit) and plot each fit.
for i in range(len(dataByLocation)):
    data = dataByLocation[i]
    # FIX: np.float was removed in NumPy 1.24 -- use the builtin float.
    bagSold = np.log(np.asarray(data["BAGSOLD"]).reshape(-1, 1).astype(float))
    rm = np.asarray(data["RM"]).reshape(-1, 1).astype(float)
    # Linear Regression on this bucket only
    regr = linear_model.LinearRegression()
    regr.fit(rm, bagSold)
    model_dict[i] = regr
    bagSold_prediction = regr.predict(rm)
    x_axis = np.arange(rm.min(), rm.max(), 0.01).reshape(-1, 1)
    # Find MSE (log scale) and its root for a readable magnitude
    mse = sklearn.metrics.mean_squared_error(bagSold, bagSold_prediction)
    print(mse, np.sqrt(mse))
    detailed_output["number of data point on location {}xxx".format(i)] = len(data)
    detailed_output["MSE on location {}xxx log scale".format(i)] = mse
    plt.figure(figsize=(8, 8))
    plt.title("Linear Regression RM vs. Log Bagsold on Location {}xxx".format(i))
    plt.plot(rm, bagSold, 'ro', label='True Value')
    plt.plot(x_axis, regr.predict(x_axis), color="green")
    plt.legend(["true_value", "Regression Line\nMSE = {:e}".format(mse)])
    plt.xlim(rm.min(), rm.max())
    plt.savefig("location{}.png".format(i))
In [12]:
# Test with validation set
# Test with validation set: assemble features and log-scale targets.
TESTING_DATA_GRAD = TESTING_DATA_GRAD.reset_index(drop=True)
# NOTE(review): LOCATION is still a string column here, so np.column_stack
# coerces the whole Xtest array to strings; the prediction loop below must
# convert values back to numbers before use.
Xtest = np.column_stack((TESTING_DATA_GRAD["LOCATION"],
                         TESTING_DATA_GRAD["RM"],
                         TESTING_DATA_GRAD["YIELD"]))
# FIX: np.float was removed in NumPy 1.24 -- use the builtin float.
ytest = TESTING_DATA_GRAD["BAGSOLD"].astype(float)
# Log scale to match the per-bucket models, which were fit on log(BAGSOLD)
log_ytest = np.log(ytest)
In [13]:
# Predict log(BAGSOLD) for each validation row with the model of its
# location bucket.
ypredicted = []
for row in Xtest:
    # Xtest was coerced to strings by np.column_stack -- convert explicitly.
    location = str(row[0])
    rm_val = float(row[1])
    # row[2] (YIELD) is unused: the per-bucket models were fit on RM only.
    # Route by the thousands digit of the 4-digit location code.
    model = model_dict[int(location[0])]
    # FIX: sklearn's predict requires a 2-D array; the original passed a bare
    # scalar (a string, even), which fails on modern sklearn.
    prediction = model.predict([[rm_val]])[0][0]
    ypredicted.append(prediction)
ypredicted = np.array(ypredicted)
In [14]:
# MSE error of the per-location models on the held-out split.
# Note: both sides are on the log scale (log_ytest vs. log-scale predictions),
# so this is not comparable with the linear-scale MSEs reported earlier.
sklearn.metrics.mean_squared_error(log_ytest, ypredicted)
Out[14]:
In [15]:
# Baseline: one linear model over the whole graduated training set,
# predicting log(BAGSOLD) from RM and YIELD jointly.
# FIX: np.float was removed in NumPy 1.24 -- use the builtin float.
bagSold = np.log(np.asarray(TRAINING_DATA_GRAD["BAGSOLD"]).reshape(-1, 1).astype(float))
rm = np.asarray(TRAINING_DATA_GRAD["RM"]).reshape(-1, 1).astype(float)
yield_val = np.asarray(TRAINING_DATA_GRAD["YIELD"]).reshape(-1, 1).astype(float)
x = np.column_stack((rm, yield_val))
# Linear Regression
regr = linear_model.LinearRegression(fit_intercept=True)
regr.fit(x, bagSold)
bagSold_prediction = regr.predict(x)
print("Coefficients = {}".format(regr.coef_))
print("Intercept = {}".format(regr.intercept_))
# Find MSE (training error, log scale)
mse = sklearn.metrics.mean_squared_error(bagSold, bagSold_prediction)
print("MSE = {}".format(mse))
In [16]:
# Same two-feature fit with L2 regularization for comparison.
# FIX: np.float was removed in NumPy 1.24 -- use the builtin float.
bagSold = np.log(np.asarray(TRAINING_DATA_GRAD["BAGSOLD"]).reshape(-1, 1).astype(float))
rm = np.asarray(TRAINING_DATA_GRAD["RM"]).reshape(-1, 1).astype(float)
yield_val = np.asarray(TRAINING_DATA_GRAD["YIELD"]).reshape(-1, 1).astype(float)
x = np.column_stack((rm, yield_val))
# Ridge Regression; alpha=20000 appears hand-picked -- TODO confirm via
# cross-validation (e.g. RidgeCV) rather than a magic constant.
regr = linear_model.Ridge(alpha=20000)
regr.fit(x, bagSold)
bagSold_prediction = regr.predict(x)
print("Coefficients = {}".format(regr.coef_))
print("Intercept = {}".format(regr.intercept_))
# Find MSE (training error, log scale)
mse = sklearn.metrics.mean_squared_error(bagSold, bagSold_prediction)
print("MSE = {}".format(mse))
In [ ]: