In [1]:
# import the basic libraries used throughout the notebook
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model

%matplotlib inline

In [2]:
# IMPORT BASIC DATA
EXPERIMENT_DATA = pickle.load(open('EXPERIMENT_SET_pandas.pkl', 'rb'))
EVALUATION_SET = pickle.load(open('EVALUATION_SET_pandas.pkl', 'rb'))
CONSIDERED = EXPERIMENT_DATA[EXPERIMENT_DATA["GRAD"] == "YES"]

sampleSize = CONSIDERED.shape[0] / EXPERIMENT_DATA.shape[0]
print(sampleSize)
print("Experiment_Data: {} \nEvaluation_Set: {} \
       \nGraduated_Set: {}".format(EXPERIMENT_DATA.shape, 
                                  EVALUATION_SET.shape,
                                  CONSIDERED.shape))
print("Experiment_Data Columns:\n{}".format(list(EXPERIMENT_DATA.columns.values)))
print("Evaluation_Set Columns:\n{}".format(list(EVALUATION_SET.columns.values)))


0.046202754663062964
Experiment_Data: (258253, 12) 
Evaluation_Set: (38, 4)        
Graduated_Set: (11932, 12)
Experiment_Data Columns:
['YEAR', 'EXPERIMENT', 'LOCATION', 'VARIETY', 'FAMILY', 'CHECK', 'RM', 'REPNO', 'YIELD', 'CLASS_OF', 'GRAD', 'BAGSOLD']
Evaluation_Set Columns:
['CLASS_OF', 'VARIETY', 'FAMILY', 'RM']

Comments

Notice that the graduated set we are downsampling with represents only about $4.6\%$ of the full data set.
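
A quick way to see this imbalance directly (a sketch, not executed above) is to tabulate the GRAD column, which also carries the '.' placeholder for rows with no graduation label:

In [ ]:
# hedged sketch: distribution of the GRAD labels, including the '.' placeholder
print(EXPERIMENT_DATA["GRAD"].value_counts())
print(EXPERIMENT_DATA["GRAD"].value_counts(normalize=True))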


In [3]:
## PRINT BASIC DATA INFORMATION
print("Experiment_Data Features: \n{}".format(EXPERIMENT_DATA.dtypes))
print("\nEvaluation_Set Features:\n{}".format(EVALUATION_SET.dtypes))


Experiment_Data Features: 
YEAR            int64
EXPERIMENT     object
LOCATION       object
VARIETY        object
FAMILY         object
CHECK            bool
RM            float64
REPNO           int64
YIELD         float64
CLASS_OF       object
GRAD           object
BAGSOLD        object
dtype: object

Evaluation_Set Features:
CLASS_OF      int64
VARIETY      object
FAMILY       object
RM          float64
dtype: object

In [4]:
print("{} {}".format(EXPERIMENT_DATA.head(), EXPERIMENT_DATA.tail()))


   YEAR  EXPERIMENT LOCATION  VARIETY    FAMILY CHECK   RM  REPNO      YIELD  \
0  2009  09YT000052     3210  V000016  FAM05619  True  3.9      1  50.302327   
1  2009  09YT000052     3210  V000016  FAM05619  True  3.9      2  48.331305   
2  2009  09YT000052     3210  V000018  FAM05619  True  3.7      1  53.103246   
3  2009  09YT000052     3210  V000018  FAM05619  True  3.7      2  53.103246   
4  2009  09YT000052     3210  V000039  FAM05625  True  3.4      1  58.081136   

  CLASS_OF GRAD BAGSOLD  
0        .    .       .  
1        .    .       .  
2        .    .       .  
3        .    .       .  
4        .    .       .  

          YEAR  EXPERIMENT LOCATION  VARIETY    FAMILY CHECK   RM  REPNO  \
258248  2014  14YT005707     3490  V155853  FAM13521  True  2.7      2   
258249  2014  14YT005707     3490  V156314  FAM12531  True  2.8      1   
258250  2014  14YT005707     3490  V156314  FAM12531  True  2.8      2   
258251  2014  14YT005707     3490  V156553  FAM14238  True  2.9      1   
258252  2014  14YT005707     3490  V156553  FAM14238  True  2.9      2   

            YIELD CLASS_OF GRAD BAGSOLD  
258248  57.215105     2011  YES  761392  
258249  69.472023     2012  YES  755156  
258250  64.089668     2012  YES  755156  
258251  72.767815     2013  YES  595119  
258252  60.186732     2013  YES  595119  
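
Note that the earliest rows carry '.' as a missing-value marker in CLASS_OF, GRAD, and BAGSOLD. A minimal sketch (not applied to the pipeline above) that makes this missingness explicit:

In [ ]:
# hedged sketch: map the '.' placeholders to NaN and count them per column
marked = EXPERIMENT_DATA.replace('.', np.nan)
print(marked.isnull().sum())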

In [5]:
print("The number of unique locations that we have is {}."
      .format(len(set(EXPERIMENT_DATA['LOCATION']))))
print("The number of unique yields that we have is {}."
      .format(len(set(np.floor(EXPERIMENT_DATA['YIELD'])))))
print("The number of unique families is {}."
      .format(len(set(EXPERIMENT_DATA['YIELD']))))
print("The number of unique varieties is {}."
      .format(len(set(EXPERIMENT_DATA['VARIETY']))))


The number of unique locations that we have is 152.
The number of unique floored (integer) yields that we have is 107.
The number of unique exact yield values is 44851.
The number of unique varieties is 15632.
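
A more idiomatic route to these counts, which also covers FAMILY (not computed above), is pandas' nunique (a sketch; output omitted since it was not run):

In [ ]:
# hedged sketch: nunique() is equivalent to len(set(...)) for these counts
print(EXPERIMENT_DATA[['LOCATION', 'VARIETY', 'FAMILY']].nunique())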

In [6]:
# keep only rows with a graduation label; .copy() avoids the SettingWithCopyWarning
clean = EXPERIMENT_DATA[EXPERIMENT_DATA["GRAD"] != '.'].copy()
print(clean.shape)
# remove BAGSOLD and YEAR for now; we just want to predict graduation
del clean["BAGSOLD"]
del clean["YEAR"]
print(clean.shape[0])


(21894, 12)
21894

In [7]:
# pull the raw string columns out as plain Python lists
var_list = clean["VARIETY"].tolist()
loc_list = clean["LOCATION"].tolist()
exp_list = clean["EXPERIMENT"].tolist()
fam_list = clean["FAMILY"].tolist()

In [8]:
from sklearn import preprocessing

# one LabelEncoder per column, so each mapping can be inverted independently
le_var = preprocessing.LabelEncoder()
le_loc = preprocessing.LabelEncoder()
le_exp = preprocessing.LabelEncoder()
le_fam = preprocessing.LabelEncoder()

# fit_transform already returns a numpy array
labeled_var = le_var.fit_transform(var_list)
labeled_loc = le_loc.fit_transform(loc_list)
labeled_exp = le_exp.fit_transform(exp_list)
labeled_fam = le_fam.fit_transform(fam_list)
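
Keeping a separate encoder per column matters because each LabelEncoder can invert its own mapping; a sketch of the round trip (using the le_var encoder fitted above):

In [ ]:
# hedged sketch: recover the original variety names from the integer codes
print(le_var.inverse_transform(labeled_var[:5]))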

In [9]:
# binarize the target and swap the string columns for their integer codes
clean["GRAD"] = clean["GRAD"].replace(["YES", "NO"], [1, 0])
clean["CHECK"] = clean["CHECK"].astype(int)
clean["VARIETY"] = labeled_var
clean["LOCATION"] = labeled_loc
clean["EXPERIMENT"] = labeled_exp
clean["FAMILY"] = labeled_fam

# clean.to_csv("clean.csv")

In [10]:
print(clean.dtypes)
clean.head()


EXPERIMENT      int64
LOCATION        int64
VARIETY         int64
FAMILY          int64
CHECK           int32
RM            float64
REPNO           int64
YIELD         float64
CLASS_OF       object
GRAD            int64
dtype: object
Out[10]:
      EXPERIMENT  LOCATION  VARIETY  FAMILY  CHECK   RM  REPNO      YIELD CLASS_OF  GRAD
694           95       103       15      24      0  3.7      1  68.133720     2011     0
695           95       103       15      24      0  3.7      2  74.836534     2011     0
1263          95       103       19       1      0  3.8      1  48.027659     2011     0
1264          95       103       19       1      0  3.8      2  52.497143     2011     0
1379         198        48       74      29      0  2.1      1  61.924294     2011     1
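
Note that CLASS_OF is still an object column of year strings; scikit-learn will coerce it to float at fit time, but the conversion can be made explicit (a sketch, not applied above, so the dtypes printed here are unchanged):

In [ ]:
# hedged sketch: make CLASS_OF an explicit numeric year column
clean["CLASS_OF"] = pd.to_numeric(clean["CLASS_OF"])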

In [20]:
# partition the clean data into a 60/40 train/validation split (sequential, no shuffle)
samples = int(clean.shape[0] * 0.6)
features = clean.columns.values[:-1]  # everything except the GRAD target
print(features)
train_x = clean.iloc[:samples][features]
train_y = clean.iloc[:samples]["GRAD"]
print(train_x.columns)
valid_x = clean.iloc[samples:][features]
valid_y = clean.iloc[samples:]["GRAD"]
print(valid_x.columns)
print(train_x.shape)


['EXPERIMENT' 'LOCATION' 'VARIETY' 'FAMILY' 'CHECK' 'RM' 'REPNO' 'YIELD'
 'CLASS_OF']
Index(['EXPERIMENT', 'LOCATION', 'VARIETY', 'FAMILY', 'CHECK', 'RM', 'REPNO',
       'YIELD', 'CLASS_OF'],
      dtype='object')
Index(['EXPERIMENT', 'LOCATION', 'VARIETY', 'FAMILY', 'CHECK', 'RM', 'REPNO',
       'YIELD', 'CLASS_OF'],
      dtype='object')
(13136, 9)
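
The rows appear to be ordered by year, so this sequential cut is not a random split. A shuffled alternative with scikit-learn, under the assumption that a random split is acceptable for this problem (a sketch, not what was run above):

In [ ]:
# hedged sketch: shuffled 60/40 split instead of the sequential cut
from sklearn.model_selection import train_test_split
train_x, valid_x, train_y, valid_y = train_test_split(
    clean[features], clean["GRAD"], test_size=0.4, random_state=0)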

In [21]:
from sklearn.linear_model import LogisticRegression

# all other parameters are left at their scikit-learn defaults
model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear',
                           max_iter=1000, multi_class='ovr')

model.fit(train_x, train_y)


Out[21]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
predictions = model.predict(valid_x)
print("Accuracy of the Model", np.sum(predictions == valid_y) / len(predictions))


Accuracy of the Model 0.770723909568
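
Accuracy alone can flatter a model on an imbalanced target; comparing against the majority-class baseline and inspecting per-class errors gives a fuller picture (a sketch; none of this was run above):

In [ ]:
# hedged sketch: majority-class baseline plus confusion matrix and per-class report
from sklearn.metrics import confusion_matrix, classification_report
print("Majority-class baseline:", valid_y.value_counts(normalize=True).max())
print(confusion_matrix(valid_y, predictions))
print(classification_report(valid_y, predictions))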

In [110]:
model.coef_


Out[110]:
array([[  2.52897726e-04,   1.62061048e-04,   6.49632083e-04,
         -3.60634169e-03,   2.15521097e+00,   2.33712788e-01,
         -4.83618616e-02,   2.92613909e-03,  -5.94767902e-04]])
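
To make this vector readable, one can pair each coefficient with its feature name (a sketch; note the magnitudes are not directly comparable across features unless the inputs are standardized first):

In [ ]:
# hedged sketch: pair each coefficient with its feature name
for name, coef in zip(features, model.coef_[0]):
    print("{:>10s} {: .6f}".format(name, coef))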

In [ ]: