In [1]:
# import some basic libraries
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn import linear_model
%matplotlib inline
In [2]:
# IMPORT BASIC DATA
# Load both pickled DataFrames. The files are opened with context managers so
# the handles are closed as soon as loading finishes.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# that come from a trusted source.
with open('EXPERIMENT_SET_pandas.pkl', 'rb') as experiment_file:
    EXPERIMENT_DATA = pickle.load(experiment_file)
with open('EVALUATION_SET_pandas.pkl', 'rb') as evaluation_file:
    EVALUATION_SET = pickle.load(evaluation_file)

# Rows whose GRAD column equals "YES".
CONSIDERED = EXPERIMENT_DATA[EXPERIMENT_DATA["GRAD"] == "YES"]

# NOTE(review): hard-coded ratio; presumably (graduated rows / total rows) --
# TODO confirm against CONSIDERED.shape[0] / EXPERIMENT_DATA.shape[0].
sampleSize = 11932/258253
print(sampleSize)

# Shapes of the three frames, then the column names of the two loaded sets.
print("Experiment_Data: {} \nEvaluation_Set: {} \nGraduated_Set: {}".format(
    EXPERIMENT_DATA.shape,
    EVALUATION_SET.shape,
    CONSIDERED.shape))
print("Experiment_Data Columns:\n{}".format(list(EXPERIMENT_DATA.columns.values)))
print("Evaluation_Set Columns:\n{}".format(list(EVALUATION_SET.columns.values)))
In [3]:
## PRINT BASIC DATA INFORMATION
# Per-column dtypes of the experiment frame and of the evaluation frame.
experiment_dtypes = EXPERIMENT_DATA.dtypes
evaluation_dtypes = EVALUATION_SET.dtypes
print("Experiment_Data Features: \n{}".format(experiment_dtypes))
print("\nEvaluation_Set Features:\n{}".format(evaluation_dtypes))
In [4]:
# Text preview of the first and the last rows of the experiment data.
first_rows = EXPERIMENT_DATA.head()
last_rows = EXPERIMENT_DATA.tail()
print("{} {}".format(first_rows, last_rows))
In [5]:
# Cardinality of the main categorical columns (and of whole-number yields).
print("The number of unique locations that we have is {}."
      .format(len(set(EXPERIMENT_DATA['LOCATION']))))
# Yields are floored first, so this counts distinct whole-number yield values.
print("The number of unique yields that we have is {}."
      .format(len(set(np.floor(EXPERIMENT_DATA['YIELD'])))))
# Count families from the FAMILY column (this line previously read YIELD,
# so the reported number was the count of distinct raw yields).
print("The number of unique families is {}."
      .format(len(set(EXPERIMENT_DATA['FAMILY']))))
print("The number of unique varieties is {}."
      .format(len(set(EXPERIMENT_DATA['VARIETY']))))
In [6]:
# Keep only the rows whose GRAD value is not '.' (presumably the marker for a
# missing graduation decision -- TODO confirm). An explicit .copy() makes
# `clean` an independent frame, so the column assignments in later cells never
# write through to (or warn about) EXPERIMENT_DATA. This replaces the old
# `clean.is_copy = False` hack, which depends on an attribute that current
# pandas releases no longer provide.
clean = EXPERIMENT_DATA[EXPERIMENT_DATA["GRAD"] != '.'].copy()
print(clean.shape)
# remove bags sold for now, we just want to predict graduation
clean = clean.drop(["BAGSOLD", "YEAR"], axis=1)
print(clean.shape[0])
In [7]:
# Plain Python lists of the categorical columns, in row order, ready for label
# encoding. (Despite the *_dict names these are lists; the names are kept
# because later cells refer to them.)
# .tolist() keeps exactly one entry per row. The previous round-trip through
# Series.to_dict() is keyed by index label, so it would silently drop rows if
# any index label were repeated and break the later column assignment.
var_dict = clean["VARIETY"].tolist()
loc_dict = clean["LOCATION"].tolist()
exp_dict = clean["EXPERIMENT"].tolist()
family_dict = clean["FAMILY"].tolist()
In [8]:
from sklearn import preprocessing

# One independent LabelEncoder per categorical column, so each fitted mapping
# stays available under its own name (le: variety, le1: location,
# le2: experiment, le3: family).
le, le1, le2, le3 = (preprocessing.LabelEncoder() for _ in range(4))

# Fit each encoder on its column's values and keep the integer codes as arrays.
labeled_var = np.asarray(le.fit_transform(var_dict))
labeled_loc = np.asarray(le1.fit_transform(loc_dict))
labeled_exp = np.asarray(le2.fit_transform(exp_dict))
labeled_fam = np.asarray(le3.fit_transform(family_dict))
In [9]:
# Positional index of the GRAD column (not used again within this cell).
grad_idx = clean.columns.get_loc("GRAD")

# Target: "YES" -> 1, "NO" -> 0; CHECK is cast to int.
clean["GRAD"] = clean["GRAD"].replace(["YES", "NO"], [1, 0])
clean["CHECK"] = clean["CHECK"].astype(int)

# Overwrite each categorical column with its label-encoded integer codes.
for _column, _codes in (
    ("VARIETY", labeled_var),
    ("LOCATION", labeled_loc),
    ("EXPERIMENT", labeled_exp),
    ("FAMILY", labeled_fam),
):
    clean[_column] = _codes
# clean.to_csv("clean.csv")
In [10]:
# Inspect the column dtypes of `clean`, then preview its first five rows.
print(clean.dtypes)
clean.head(n=5)
Out[10]:
In [20]:
# partition the clean data for 60/40 train test
# NOTE(review): the split is positional (first 60% of rows train, remaining
# 40% validate) with no shuffling -- confirm the row order carries no
# structure that would bias the evaluation.
samples = int(clean.shape[0] * 0.6)

# Every column except the target. Excluding "GRAD" by name (instead of
# dropping whichever column happens to be last) guarantees the label can never
# leak into the feature matrix, regardless of column order.
features = clean.columns.drop("GRAD").values
print(features)

train_x = clean.iloc[:samples][features]
train_y = clean.iloc[:samples]["GRAD"]
print(train_x.columns)

valid_x = clean.iloc[samples:][features]
valid_y = clean.iloc[samples:]["GRAD"]
print(valid_x.columns)
print(train_x.shape)
In [21]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression fitted with the liblinear solver on the
# training partition.
# The `multi_class='ovr'` argument is intentionally omitted: it is deprecated
# in recent scikit-learn releases, and for the binary GRAD target (0/1) with
# solver='liblinear' the fitted model is the same without it.
model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, intercept_scaling=1,
                           class_weight=None, random_state=None, solver='liblinear', max_iter=1000,
                           verbose=0, warm_start=False, n_jobs=1)
# fit() returns the estimator, so the cell displays the fitted model's repr.
model.fit(train_x, train_y)
Out[21]:
In [22]:
# Predict on the validation partition and report the fraction of predictions
# that equal the true GRAD labels.
predictions = model.predict(valid_x)
n_correct = (predictions == valid_y).sum()
print("Accuracy of the Model", n_correct / len(predictions))
In [110]:
# Display the fitted coefficient array. The estimator exposes it as `coef_`;
# `coef_t` is not an attribute and raises AttributeError.
model.coef_
Out[110]:
In [ ]: