In [3]:
# import some basic libraries
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn import linear_model

%matplotlib inline

In [4]:
# IMPORT BASIC DATA
EXPERIMENT_DATA = pd.read_pickle('EXPERIMENT_SET_pandas.pkl')
EVALUATION_SET = pd.read_pickle('EVALUATION_SET_pandas.pkl')
CONSIDERED = EXPERIMENT_DATA[EXPERIMENT_DATA["GRAD"] == "YES"]

# fraction of experiment rows that graduated
sampleSize = CONSIDERED.shape[0] / EXPERIMENT_DATA.shape[0]
print(sampleSize)
print("Experiment_Data: {} \nEvaluation_Set: {} \
       \nGraduated_Set: {}".format(EXPERIMENT_DATA.shape, 
                                  EVALUATION_SET.shape,
                                  CONSIDERED.shape))
print("Experiment_Data Columns:\n{}".format(list(EXPERIMENT_DATA.columns.values)))
print("Evaluation_Set Columns:\n{}".format(list(EVALUATION_SET.columns.values)))


0.046202754663062964
Experiment_Data: (258253, 12) 
Evaluation_Set: (38, 4)        
Graduated_Set: (11932, 12)
Experiment_Data Columns:
['YEAR', 'EXPERIMENT', 'LOCATION', 'VARIETY', 'FAMILY', 'CHECK', 'RM', 'REPNO', 'YIELD', 'CLASS_OF', 'GRAD', 'BAGSOLD']
Evaluation_Set Columns:
['CLASS_OF', 'VARIETY', 'FAMILY', 'RM']

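The next cell filters out rows where GRAD is '.', which the data uses as a missing-value placeholder. A quick sanity check of the raw label distribution (a sketch, assuming GRAD takes only the values 'YES', 'NO', and '.') confirms what the filter will remove:

In [ ]:
# sanity check (sketch): how many YES / NO / '.' rows does GRAD hold?
print(EXPERIMENT_DATA["GRAD"].value_counts(dropna=False))
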
In [5]:
# keep only rows with a known graduation outcome ('.' marks missing GRAD);
# take an explicit copy so the column writes below are safe
clean = EXPERIMENT_DATA[EXPERIMENT_DATA["GRAD"] != '.'].copy()
print(clean.shape)
# BAGSOLD and YEAR could be removed here, since we just want to predict
# graduation, but both deletions are disabled for now
# del clean["BAGSOLD"]
# del clean["YEAR"]
print(clean.shape[0])


(21894, 12)
21894

In [6]:
# pull the categorical columns out as plain lists for label encoding
var_list = clean["VARIETY"].tolist()
loc_list = clean["LOCATION"].tolist()
exp_list = clean["EXPERIMENT"].tolist()
family_list = clean["FAMILY"].tolist()

var_list_eval = EVALUATION_SET["VARIETY"].tolist()
family_list_eval = EVALUATION_SET["FAMILY"].tolist()

In [7]:
from sklearn import preprocessing

# one LabelEncoder per categorical column
le = preprocessing.LabelEncoder()
le1 = preprocessing.LabelEncoder()
le2 = preprocessing.LabelEncoder()
le3 = preprocessing.LabelEncoder()
le4 = preprocessing.LabelEncoder()
le5 = preprocessing.LabelEncoder()

labeled_var = np.asarray(le.fit_transform(var_list))
labeled_loc = np.asarray(le1.fit_transform(loc_list))
labeled_exp = np.asarray(le2.fit_transform(exp_list))
labeled_fam = np.asarray(le3.fit_transform(family_list))

# NOTE: the evaluation set gets its own encoders (le4, le5), so these codes
# do not line up with the training codes produced above
labeled_var_eval = np.asarray(le4.fit_transform(var_list_eval))
labeled_fam_eval = np.asarray(le5.fit_transform(family_list_eval))

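Because le and le4 (and le3 and le5) are fit independently, the same variety or family string can map to different integers in the training and evaluation frames. If the plan is ever to score EVALUATION_SET with a model trained on clean, one fix is to fit a single encoder per column on the union of both value lists; a minimal sketch:

In [ ]:
# sketch: one shared encoder per column so train and eval codes agree
le_var_shared = preprocessing.LabelEncoder()
le_var_shared.fit(var_list + var_list_eval)
labeled_var_shared = le_var_shared.transform(var_list)
labeled_var_eval_shared = le_var_shared.transform(var_list_eval)
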
In [8]:
grad_idx = clean.columns.get_loc("GRAD")

# encode the target as 1/0 and cast the remaining columns to numeric types
clean["GRAD"] = clean["GRAD"].replace(["YES", "NO"], [1, 0])
clean["CHECK"] = clean["CHECK"].astype(int)
clean["VARIETY"] = labeled_var
clean["LOCATION"] = labeled_loc
clean["EXPERIMENT"] = labeled_exp
clean["FAMILY"] = labeled_fam
clean["CLASS_OF"] = clean["CLASS_OF"].astype(int)
clean["BAGSOLD"] = clean["BAGSOLD"].astype(int)

EVALUATION_SET["VARIETY"] = labeled_var_eval
EVALUATION_SET["FAMILY"] = labeled_fam_eval
# clean.to_csv("clean.csv")

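LabelEncoder is invertible, so a quick spot check (a sketch, reusing the le fitted on VARIETY above) can confirm that the integer codes round-trip back to the original variety names:

In [ ]:
# sketch: map the first few encoded VARIETY values back to their strings
print(le.inverse_transform(clean["VARIETY"].values[:5]))
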
In [9]:
print(clean.dtypes)
clean.head()


YEAR            int64
EXPERIMENT      int64
LOCATION        int64
VARIETY         int64
FAMILY          int64
CHECK           int32
RM            float64
REPNO           int64
YIELD         float64
CLASS_OF        int32
GRAD            int64
BAGSOLD         int32
dtype: object
Out[9]:
      YEAR  EXPERIMENT  LOCATION  VARIETY  FAMILY  CHECK   RM  REPNO      YIELD  CLASS_OF  GRAD  BAGSOLD
694   2009          95       103       15      24      0  3.7      1  68.133720      2011     0        0
695   2009          95       103       15      24      0  3.7      2  74.836534      2011     0        0
1263  2009          95       103       19       1      0  3.8      1  48.027659      2011     0        0
1264  2009          95       103       19       1      0  3.8      2  52.497143      2011     0        0
1379  2009         198        48       74      29      0  2.1      1  61.924294      2011     1    61161

In [20]:
# partition the clean data for an 80/20 train/test split
# NOTE: rows are in year order, so this sequential cut places the later
# years in the validation set rather than sampling at random
samples = int(clean.shape[0] * 0.8)
features = clean.columns.values[:-1]  # every column except BAGSOLD
print(features)

# other candidate drops (EXPERIMENT, LOCATION, CHECK, REPNO, YIELD, YEAR)
# were explored here and are left out of this run
train_x = clean[0:samples][features]
del train_x["GRAD"]  # GRAD is the target, not a feature
train_y = clean[0:samples]["GRAD"]

valid_x = clean[samples:][features]
del valid_x["GRAD"]
valid_y = clean[samples:]["GRAD"]

test_x = EVALUATION_SET
print(EVALUATION_SET.dtypes)

print(train_x.shape)
print(train_x.columns.values)


['YEAR' 'EXPERIMENT' 'LOCATION' 'VARIETY' 'FAMILY' 'CHECK' 'RM' 'REPNO'
 'YIELD' 'CLASS_OF' 'GRAD']
CLASS_OF      int64
VARIETY       int64
FAMILY        int64
RM          float64
dtype: object
(17515, 10)
['YEAR' 'EXPERIMENT' 'LOCATION' 'VARIETY' 'FAMILY' 'CHECK' 'RM' 'REPNO'
 'YIELD' 'CLASS_OF']

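Since the cut above is sequential, the validation rows come almost entirely from 2013-2014, which makes the accuracy estimate sensitive to year-to-year drift. For comparison, a shuffled split is a one-liner; a sketch, assuming a scikit-learn version where train_test_split lives in sklearn.model_selection and a fixed seed for repeatability:

In [ ]:
# sketch: shuffled 80/20 split as a comparison to the sequential cut
from sklearn.model_selection import train_test_split

X = clean[features].drop("GRAD", axis=1)
y = clean["GRAD"]
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_tr.shape, X_va.shape)
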
In [21]:
from sklearn.naive_bayes import BernoulliNB

print(train_x.columns.values)
# Bernoulli naive Bayes on the ten remaining features
bnb = BernoulliNB()
bnb.fit(train_x, train_y)
print(valid_x.shape, test_x.shape)
valid_x.columns.values


['YEAR' 'EXPERIMENT' 'LOCATION' 'VARIETY' 'FAMILY' 'CHECK' 'RM' 'REPNO'
 'YIELD' 'CLASS_OF']
(4379, 10) (38, 4)
Out[21]:
array(['YEAR', 'EXPERIMENT', 'LOCATION', 'VARIETY', 'FAMILY', 'CHECK',
       'RM', 'REPNO', 'YIELD', 'CLASS_OF'], dtype=object)

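One caveat with BernoulliNB here: by default it binarizes every feature at a threshold of 0, so strictly positive columns such as YEAR, YIELD, REPNO, and CLASS_OF all collapse to the constant 1 and contribute nothing; only columns that can take the value 0 (CHECK, and the label-encoded categoricals, where one category maps to 0) distinguish rows. A sketch of one alternative, GaussianNB, which models the continuous features directly instead of thresholding them:

In [ ]:
# sketch: GaussianNB comparison; no binarization, continuous likelihoods
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(train_x, train_y)
print("GaussianNB accuracy:", gnb.score(valid_x, valid_y))
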
In [22]:
predictions = bnb.predict(valid_x)
print("Accuracy of the Model", np.sum(predictions == valid_y) / len(predictions))
# EVALUATION_SET only has 4 of the 10 training columns, so it cannot be
# scored with this model as-is:
# new_pred = bnb.predict(test_x)
# print(new_pred)


Accuracy of the Model 0.758849052295

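On its own, 0.7588 is hard to interpret; the dump in the next cell suggests the validation labels are heavily skewed toward GRAD = 1, so a constant majority-class prediction is the baseline to beat. A quick sketch:

In [ ]:
# sketch: majority-class baseline accuracy on the validation split
majority = valid_y.mode()[0]
print("majority class:", majority)
print("baseline accuracy:", np.mean(valid_y == majority))
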
In [19]:
# NOTE: this cell ran before In [20] rebuilt valid_x, so the dump below
# still includes the GRAD column (11 columns vs. the 10 used for modeling)
print(valid_x)


        YEAR  EXPERIMENT  LOCATION  VARIETY  FAMILY  CHECK   RM  REPNO      YIELD  CLASS_OF  GRAD
235042  2013         351        75       56       4      0  2.7      1  66.124334      2013     0
235043  2013         351        75       56       4      0  2.3      1  63.331891      2013     0
235044  2013         183        64       76      42      0  2.3      2  63.107800      2013     0
235045  2013         183        64      100       8      0  2.4      1  65.341278      2013     0
235048  2013         183        64      100       8      0  2.6      1  75.171407      2013     1
...      ...         ...       ...      ...     ...    ...  ...    ...        ...       ...   ...
258248  2014         198        48        4      30      1  2.7      2  57.215105      2011     1
258249  2014         309       103       22      44      1  2.8      1  69.472023      2012     1
258250  2014         309       103       22      44      1  2.8      2  64.089668      2012     1
258251  2014         309       103       82      40      1  2.9      1  72.767815      2013     1
258252  2014         309       103       82      40      1  2.9      2  60.186732      2013     1

[4379 rows x 11 columns]

In [ ]: