In [1]:
# import the basic libraries used throughout the notebook
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model

%matplotlib inline

In [2]:
# IMPORT BASIC DATA
EXPERIMENT_DATA = pickle.load(open('EXPERIMENT_SET_pandas.pkl', 'rb'))
EVALUATION_SET = pickle.load(open('EVALUATION_SET_pandas.pkl', 'rb'))
CONSIDERED = EXPERIMENT_DATA[EXPERIMENT_DATA["GRAD"] == "YES"]

sampleSize = CONSIDERED.shape[0] / EXPERIMENT_DATA.shape[0]
print(sampleSize)
print("Experiment_Data: {} \nEvaluation_Set: {} \
       \nGraduated_Set: {}".format(EXPERIMENT_DATA.shape, 
                                  EVALUATION_SET.shape,
                                  CONSIDERED.shape))
print("Experiment_Data Columns:\n{}".format(list(EXPERIMENT_DATA.columns.values)))
print("Evaluation_Set Columns:\n{}".format(list(EVALUATION_SET.columns.values)))


0.046202754663062964
Experiment_Data: (258253, 12) 
Evaluation_Set: (38, 4)        
Graduated_Set: (11932, 12)
Experiment_Data Columns:
['YEAR', 'EXPERIMENT', 'LOCATION', 'VARIETY', 'FAMILY', 'CHECK', 'RM', 'REPNO', 'YIELD', 'CLASS_OF', 'GRAD', 'BAGSOLD']
Evaluation_Set Columns:
['CLASS_OF', 'VARIETY', 'FAMILY', 'RM']

Comments

Notice that the graduated set we are downsampling with represents only about $4.6\%$ of the full data set.
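
A quick way to see this imbalance directly (a sketch, not executed above) is to tabulate the GRAD column, which also carries the '.' placeholder for rows with no graduation label:

In [ ]:
# hedged sketch: distribution of the GRAD labels, including the '.' placeholder
print(EXPERIMENT_DATA["GRAD"].value_counts())
print(EXPERIMENT_DATA["GRAD"].value_counts(normalize=True))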


In [3]:
## PRINT BASIC DATA INFORMATION
print("Experiment_Data Features: \n{}".format(EXPERIMENT_DATA.dtypes))
print("\nEvaluation_Set Features:\n{}".format(EVALUATION_SET.dtypes))


Experiment_Data Features: 
YEAR            int64
EXPERIMENT     object
LOCATION       object
VARIETY        object
FAMILY         object
CHECK            bool
RM            float64
REPNO           int64
YIELD         float64
CLASS_OF       object
GRAD           object
BAGSOLD        object
dtype: object

Evaluation_Set Features:
CLASS_OF      int64
VARIETY      object
FAMILY       object
RM          float64
dtype: object

In [4]:
print("{} {}".format(EXPERIMENT_DATA.head(), EXPERIMENT_DATA.tail()))


   YEAR  EXPERIMENT LOCATION  VARIETY    FAMILY CHECK   RM  REPNO      YIELD  \
0  2009  09YT000052     3210  V000016  FAM05619  True  3.9      1  50.302327   
1  2009  09YT000052     3210  V000016  FAM05619  True  3.9      2  48.331305   
2  2009  09YT000052     3210  V000018  FAM05619  True  3.7      1  53.103246   
3  2009  09YT000052     3210  V000018  FAM05619  True  3.7      2  53.103246   
4  2009  09YT000052     3210  V000039  FAM05625  True  3.4      1  58.081136   

  CLASS_OF GRAD BAGSOLD  
0        .    .       .  
1        .    .       .  
2        .    .       .  
3        .    .       .  
4        .    .       .  

          YEAR  EXPERIMENT LOCATION  VARIETY    FAMILY CHECK   RM  REPNO  \
258248  2014  14YT005707     3490  V155853  FAM13521  True  2.7      2   
258249  2014  14YT005707     3490  V156314  FAM12531  True  2.8      1   
258250  2014  14YT005707     3490  V156314  FAM12531  True  2.8      2   
258251  2014  14YT005707     3490  V156553  FAM14238  True  2.9      1   
258252  2014  14YT005707     3490  V156553  FAM14238  True  2.9      2   

            YIELD CLASS_OF GRAD BAGSOLD  
258248  57.215105     2011  YES  761392  
258249  69.472023     2012  YES  755156  
258250  64.089668     2012  YES  755156  
258251  72.767815     2013  YES  595119  
258252  60.186732     2013  YES  595119  
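
Note that the earliest rows carry '.' as a missing-value marker in CLASS_OF, GRAD, and BAGSOLD. A minimal sketch (not applied to the pipeline above) that makes this missingness explicit:

In [ ]:
# hedged sketch: map the '.' placeholders to NaN and count them per column
marked = EXPERIMENT_DATA.replace('.', np.nan)
print(marked.isnull().sum())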

In [5]:
print("The number of unique locations that we have is {}."
      .format(len(set(EXPERIMENT_DATA['LOCATION']))))
print("The number of unique yields that we have is {}."
      .format(len(set(np.floor(EXPERIMENT_DATA['YIELD'])))))
print("The number of unique families is {}."
      .format(len(set(EXPERIMENT_DATA['YIELD']))))
print("The number of unique varieties is {}."
      .format(len(set(EXPERIMENT_DATA['VARIETY']))))


The number of unique locations that we have is 152.
The number of unique floored (integer) yields that we have is 107.
The number of unique exact yield values is 44851.
The number of unique varieties is 15632.
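
A more idiomatic route to these counts, which also covers FAMILY (not computed above), is pandas' nunique (a sketch; output omitted since it was not run):

In [ ]:
# hedged sketch: nunique() is equivalent to len(set(...)) for these counts
print(EXPERIMENT_DATA[['LOCATION', 'VARIETY', 'FAMILY']].nunique())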

In [6]:
# keep only rows with a graduation label; .copy() avoids the SettingWithCopyWarning
clean = EXPERIMENT_DATA[EXPERIMENT_DATA["GRAD"] != '.'].copy()
print(clean.shape)
# remove BAGSOLD and YEAR for now; we just want to predict graduation
del clean["BAGSOLD"]
del clean["YEAR"]
print(clean.shape[0])


(21894, 12)
21894

In [7]:
# pull the raw string columns out as plain Python lists
var_list = clean["VARIETY"].tolist()
loc_list = clean["LOCATION"].tolist()
exp_list = clean["EXPERIMENT"].tolist()
fam_list = clean["FAMILY"].tolist()

In [8]:
from sklearn import preprocessing

# one LabelEncoder per column, so each mapping can be inverted independently
le_var = preprocessing.LabelEncoder()
le_loc = preprocessing.LabelEncoder()
le_exp = preprocessing.LabelEncoder()
le_fam = preprocessing.LabelEncoder()

# fit_transform already returns a numpy array
labeled_var = le_var.fit_transform(var_list)
labeled_loc = le_loc.fit_transform(loc_list)
labeled_exp = le_exp.fit_transform(exp_list)
labeled_fam = le_fam.fit_transform(fam_list)
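
Keeping a separate encoder per column matters because each LabelEncoder can invert its own mapping; a sketch of the round trip (using the le_var encoder fitted above):

In [ ]:
# hedged sketch: recover the original variety names from the integer codes
print(le_var.inverse_transform(labeled_var[:5]))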

In [9]:
# binarize the target and swap the string columns for their integer codes
clean["GRAD"] = clean["GRAD"].replace(["YES", "NO"], [1, 0])
clean["CHECK"] = clean["CHECK"].astype(int)
clean["VARIETY"] = labeled_var
clean["LOCATION"] = labeled_loc
clean["EXPERIMENT"] = labeled_exp
clean["FAMILY"] = labeled_fam

# clean.to_csv("clean.csv")

In [10]:
print(clean.dtypes)
clean.head()


EXPERIMENT      int64
LOCATION        int64
VARIETY         int64
FAMILY          int64
CHECK           int32
RM            float64
REPNO           int64
YIELD         float64
CLASS_OF       object
GRAD            int64
dtype: object
Out[10]:
      EXPERIMENT  LOCATION  VARIETY  FAMILY  CHECK   RM  REPNO      YIELD CLASS_OF  GRAD
694           95       103       15      24      0  3.7      1  68.133720     2011     0
695           95       103       15      24      0  3.7      2  74.836534     2011     0
1263          95       103       19       1      0  3.8      1  48.027659     2011     0
1264          95       103       19       1      0  3.8      2  52.497143     2011     0
1379         198        48       74      29      0  2.1      1  61.924294     2011     1
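
Note that CLASS_OF is still an object column of year strings; scikit-learn will coerce it to float at fit time, but the conversion can be made explicit (a sketch, not applied above, so the dtypes printed here are unchanged):

In [ ]:
# hedged sketch: make CLASS_OF an explicit numeric year column
clean["CLASS_OF"] = pd.to_numeric(clean["CLASS_OF"])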

In [20]:
# partition the clean data into a 60/40 train/validation split (sequential, no shuffle)
samples = int(clean.shape[0] * 0.6)
features = clean.columns.values[:-1]  # everything except the GRAD target
print(features)
train_x = clean.iloc[:samples][features]
train_y = clean.iloc[:samples]["GRAD"]
print(train_x.columns)
valid_x = clean.iloc[samples:][features]
valid_y = clean.iloc[samples:]["GRAD"]
print(valid_x.columns)
print(train_x.shape)


['EXPERIMENT' 'LOCATION' 'VARIETY' 'FAMILY' 'CHECK' 'RM' 'REPNO' 'YIELD'
 'CLASS_OF']
Index(['EXPERIMENT', 'LOCATION', 'VARIETY', 'FAMILY', 'CHECK', 'RM', 'REPNO',
       'YIELD', 'CLASS_OF'],
      dtype='object')
Index(['EXPERIMENT', 'LOCATION', 'VARIETY', 'FAMILY', 'CHECK', 'RM', 'REPNO',
       'YIELD', 'CLASS_OF'],
      dtype='object')
(13136, 9)
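
The rows appear to be ordered by year, so this sequential cut is not a random split. A shuffled alternative with scikit-learn, under the assumption that a random split is acceptable for this problem (a sketch, not what was run above):

In [ ]:
# hedged sketch: shuffled 60/40 split instead of the sequential cut
from sklearn.model_selection import train_test_split
train_x, valid_x, train_y, valid_y = train_test_split(
    clean[features], clean["GRAD"], test_size=0.4, random_state=0)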

In [21]:
from sklearn.linear_model import LogisticRegression

# all other parameters are left at their scikit-learn defaults
model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear',
                           max_iter=1000, multi_class='ovr')

model.fit(train_x, train_y)


Out[21]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
predictions = model.predict(valid_x)
print("Accuracy of the Model", np.sum(predictions == valid_y) / len(predictions))


Accuracy of the Model 0.770723909568
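
Accuracy alone can flatter a model on an imbalanced target; comparing against the majority-class baseline and inspecting per-class errors gives a fuller picture (a sketch; none of this was run above):

In [ ]:
# hedged sketch: majority-class baseline plus confusion matrix and per-class report
from sklearn.metrics import confusion_matrix, classification_report
print("Majority-class baseline:", valid_y.value_counts(normalize=True).max())
print(confusion_matrix(valid_y, predictions))
print(classification_report(valid_y, predictions))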

In [110]:
model.coef_


Out[110]:
array([[  2.52897726e-04,   1.62061048e-04,   6.49632083e-04,
         -3.60634169e-03,   2.15521097e+00,   2.33712788e-01,
         -4.83618616e-02,   2.92613909e-03,  -5.94767902e-04]])
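
To make this vector readable, one can pair each coefficient with its feature name (a sketch; note the magnitudes are not directly comparable across features unless the inputs are standardized first):

In [ ]:
# hedged sketch: pair each coefficient with its feature name
for name, coef in zip(features, model.coef_[0]):
    print("{:>10s} {: .6f}".format(name, coef))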

In [ ]: