In [3]:
# import some basic libraries
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn import linear_model
%matplotlib inline
In [4]:
# IMPORT BASIC DATA
with open('EXPERIMENT_SET_pandas.pkl', 'rb') as f:
    EXPERIMENT_DATA = pickle.load(f)
with open('EVALUATION_SET_pandas.pkl', 'rb') as f:
    EVALUATION_SET = pickle.load(f)
CONSIDERED = EXPERIMENT_DATA[EXPERIMENT_DATA["GRAD"] == "YES"]
# approximate graduation rate: graduated rows / total experiment rows
sampleSize = 11932 / 258253
print(sampleSize)
print("Experiment_Data: {}\nEvaluation_Set: {}\nGraduated_Set: {}".format(
    EXPERIMENT_DATA.shape, EVALUATION_SET.shape, CONSIDERED.shape))
print("Experiment_Data Columns:\n{}".format(list(EXPERIMENT_DATA.columns.values)))
print("Evaluation_Set Columns:\n{}".format(list(EVALUATION_SET.columns.values)))
In [5]:
# keep only rows with a known graduation outcome; copy so the column
# assignments below do not trigger SettingWithCopy warnings
clean = EXPERIMENT_DATA[EXPERIMENT_DATA["GRAD"] != '.'].copy()
print(clean.shape)
# remove bags sold for now, we just want to predict graduation
# del clean["BAGSOLD"]
# del clean["YEAR"]
print(clean.shape[0])
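In [ ]:
# Optional sanity check (sketch, not part of the original workflow): inspect
# the class balance of GRAD before modeling. Roughly 12k of ~258k rows in the
# full experiment set graduate, so the classes are heavily imbalanced, which
# matters when judging accuracy later.
print(clean["GRAD"].value_counts())
print(clean["GRAD"].value_counts(normalize=True))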
In [6]:
# pull the categorical columns out as plain lists for label encoding
var_dict = clean["VARIETY"].tolist()
loc_dict = clean["LOCATION"].tolist()
exp_dict = clean["EXPERIMENT"].tolist()
family_dict = clean["FAMILY"].tolist()
var_dict_eval = EVALUATION_SET["VARIETY"].tolist()
family_dict_eval = EVALUATION_SET["FAMILY"].tolist()
In [7]:
from sklearn import preprocessing

# one LabelEncoder per categorical column; note that le4/le5 are fit
# separately on the evaluation set, so their integer codes are independent
# of the training-set codes
le = preprocessing.LabelEncoder()
le1 = preprocessing.LabelEncoder()
le2 = preprocessing.LabelEncoder()
le3 = preprocessing.LabelEncoder()
le4 = preprocessing.LabelEncoder()
le5 = preprocessing.LabelEncoder()
labeled_var = np.asarray(le.fit_transform(var_dict))
labeled_loc = np.asarray(le1.fit_transform(loc_dict))
labeled_exp = np.asarray(le2.fit_transform(exp_dict))
labeled_fam = np.asarray(le3.fit_transform(family_dict))
labeled_var_eval = np.asarray(le4.fit_transform(var_dict_eval))
labeled_fam_eval = np.asarray(le5.fit_transform(family_dict_eval))
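In [ ]:
# Sketch (not part of the original pipeline): because le4/le5 above are fit
# only on the evaluation set, a given VARIETY or FAMILY can receive a
# different integer code than it does in the training data. One way to keep
# the codes consistent is to fit a single encoder per column on the combined
# values; the names below (le_var, le_fam, *_shared) are illustrative.
le_var = preprocessing.LabelEncoder().fit(var_dict + var_dict_eval)
le_fam = preprocessing.LabelEncoder().fit(family_dict + family_dict_eval)
labeled_var_shared = np.asarray(le_var.transform(var_dict))
labeled_var_eval_shared = np.asarray(le_var.transform(var_dict_eval))
labeled_fam_shared = np.asarray(le_fam.transform(family_dict))
labeled_fam_eval_shared = np.asarray(le_fam.transform(family_dict_eval))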
In [8]:
grad_idx = clean.columns.get_loc("GRAD")
# encode the target and categorical columns as integers
clean["GRAD"] = clean["GRAD"].replace(["YES", "NO"], [1, 0])
clean["CHECK"] = clean["CHECK"].astype(int)
clean["VARIETY"] = labeled_var
clean["LOCATION"] = labeled_loc
clean["EXPERIMENT"] = labeled_exp
clean["FAMILY"] = labeled_fam
clean["CLASS_OF"] = clean["CLASS_OF"].astype(int)
clean["BAGSOLD"] = clean["BAGSOLD"].astype(int)
EVALUATION_SET["VARIETY"] = labeled_var_eval
EVALUATION_SET["FAMILY"] = labeled_fam_eval
# clean.to_csv("clean.csv")
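In [ ]:
# Optional sanity check (sketch, not part of the original pipeline): confirm
# the YES/NO target was fully mapped to 1/0 and that the encoded columns are
# numeric before modeling.
assert clean["GRAD"].isin([0, 1]).all()
print(clean[["VARIETY", "LOCATION", "EXPERIMENT", "FAMILY", "GRAD"]].dtypes)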
In [9]:
print(clean.dtypes)
clean.head()
Out[9]:
In [20]:
# partition the clean data into a sequential 80/20 train/validation split
# (rows are not shuffled here; a shuffled, stratified alternative is
# sketched after this cell)
samples = int(clean.shape[0] * 0.8)
features = clean.columns.values[:-1]
print(features)
train_x = clean[0:samples][features]
# del train_x["EXPERIMENT"]
# del train_x["LOCATION"]
# del train_x["CHECK"]
# del train_x["REPNO"]
# del train_x["YIELD"]
# del train_x["YEAR"]
del train_x["GRAD"]
train_y = clean[0:samples]["GRAD"]
valid_x = clean[samples:][features]
# del valid_x["EXPERIMENT"]
# del valid_x["LOCATION"]
# del valid_x["CHECK"]
# del valid_x["REPNO"]
# del valid_x["YIELD"]
# del valid_x["YEAR"]
del valid_x["GRAD"]
valid_y = clean[samples:]["GRAD"]
test_x = EVALUATION_SET
print(EVALUATION_SET.dtypes)
print(train_x.shape)
print(train_x.columns.values)
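In [ ]:
# Alternative split (sketch, assuming the feature set above): the 80/20 split
# is sequential, so if the rows are ordered (e.g. by YEAR or EXPERIMENT) the
# train and validation sets may not be comparable. A shuffled, stratified
# split keeps the graduation rate similar in both partitions. Illustrative
# only; the cells below continue to use the sequential split.
from sklearn.model_selection import train_test_split
X = clean[features].drop(columns=["GRAD"])
y = clean["GRAD"]
X_tr, X_va, y_tr, y_va = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)
print(X_tr.shape, X_va.shape)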
In [21]:
from sklearn.naive_bayes import BernoulliNB

print(train_x.columns.values)
# Bernoulli naive Bayes; note that it binarizes inputs at 0 by default, so
# the label-encoded columns are effectively treated as zero vs. nonzero
bnb = BernoulliNB()
bnb.fit(train_x, train_y)
print(valid_x.shape, test_x.shape)
valid_x.columns.values
Out[21]:
In [22]:
predictions = bnb.predict(valid_x)
print("Accuracy of the model:", np.sum(predictions == valid_y) / len(predictions))
# new_pred = bnb.predict(test_x)
# valid_x.shape
# print(new_pred)
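In [ ]:
# Sketch: with roughly 95% of rows labelled NO in the experiment data, raw
# accuracy sits close to the all-NO baseline, so a confusion matrix and
# per-class metrics are more informative. Illustrative only; it reuses the
# predictions computed above.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print("Accuracy:", accuracy_score(valid_y, predictions))
print(confusion_matrix(valid_y, predictions))
print(classification_report(valid_y, predictions))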
In [19]:
print(valid_x)
In [ ]: