notebook.community

Edit and run



In [1]:

    
!pip install numpy scipy scikit-learn pandas









    



Requirement already satisfied: numpy in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: scipy in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: scikit-learn in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: pandas in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: python-dateutil>=2 in c:\users\kogentix\anaconda3\lib\site-packages (from pandas)
Requirement already satisfied: pytz>=2011k in c:\users\kogentix\anaconda3\lib\site-packages (from pandas)
Requirement already satisfied: six>=1.5 in c:\users\kogentix\anaconda3\lib\site-packages (from python-dateutil>=2->pandas)



In [2]:

    
!pip install deap update_checker tqdm stopit









    



Requirement already satisfied: deap in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: update_checker in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: tqdm in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: stopit in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: requests>=2.3.0 in c:\users\kogentix\anaconda3\lib\site-packages (from update_checker)



In [53]:

    
#Using the Python Library
import numpy as np
import xgboost as xgb



In [4]:

    
print(dir(xgb))









    



['Booster', 'DMatrix', 'VERSION_FILE', 'XGBClassifier', 'XGBModel', 'XGBRegressor', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', 'absolute_import', 'callback', 'compat', 'core', 'cv', 'f', 'libpath', 'os', 'plot_importance', 'plot_tree', 'plotting', 'rabit', 'sklearn', 'to_graphviz', 'train', 'training']



In [5]:

    
import sys
sys.path.append("C:\\Users\\KOGENTIX\\xgboost\\python-package\\")
print (sys.path)









    



['', 'C:\\Users\\KOGENTIX\\Anaconda3\\python36.zip', 'C:\\Users\\KOGENTIX\\Anaconda3\\DLLs', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib', 'C:\\Users\\KOGENTIX\\Anaconda3', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages\\Sphinx-1.5.6-py3.6.egg', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages\\xgboost-0.6-py3.6.egg', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages\\win32', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages\\win32\\lib', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages\\Pythonwin', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages\\setuptools-27.2.0-py3.6.egg', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages\\IPython\\extensions', 'C:\\Users\\KOGENTIX\\.ipython', 'C:\\Users\\KOGENTIX\\xgboost\\python-package\\']



In [37]:

    
from sklearn import datasets, neighbors, linear_model, tree, svm



In [18]:

    
digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

n_samples = len(X_digits)
n_samples









    Out[18]:





1797



In [19]:

    
X_train = X_digits[:int(.9 * n_samples)]
y_train = y_digits[:int(.9 * n_samples)]
X_test = X_digits[int(.9 * n_samples):]
y_test = y_digits[int(.9 * n_samples):]



In [65]:

    
y_train









    Out[65]:





array([0, 1, 2, ..., 5, 0, 9])



In [68]:

    
from scipy import stats



In [69]:

    
stats.itemfreq(y_train)









    Out[69]:





array([[  0, 162],
       [  1, 163],
       [  2, 160],
       [  3, 165],
       [  4, 161],
       [  5, 164],
       [  6, 163],
       [  7, 160],
       [  8, 157],
       [  9, 162]], dtype=int64)



In [21]:

    
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.predict(X_test)









    Out[21]:





array([5, 2, 2, 0, 1, 7, 6, 3, 2, 1, 5, 4, 3, 3, 1, 3, 7, 1, 3, 6, 1, 4, 3,
       1, 4, 0, 5, 3, 5, 9, 4, 1, 7, 5, 4, 4, 7, 2, 2, 5, 3, 8, 5, 9, 4, 7,
       0, 5, 7, 7, 0, 8, 2, 5, 4, 5, 6, 7, 2, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8,
       9, 0, 8, 2, 2, 4, 5, 6, 7, 8, 9, 0, 9, 5, 5, 6, 5, 0, 9, 8, 9, 8, 4,
       1, 7, 7, 1, 5, 1, 0, 0, 9, 2, 7, 8, 2, 0, 1, 2, 6, 2, 2, 7, 4, 3, 4,
       6, 6, 6, 4, 9, 1, 6, 0, 9, 5, 2, 8, 1, 0, 0, 6, 7, 6, 1, 2, 1, 7, 4,
       6, 3, 1, 2, 9, 1, 7, 6, 8, 4, 5, 1, 4, 0, 5, 3, 6, 9, 6, 1, 7, 5, 4,
       4, 7, 2, 8, 2, 2, 5, 7, 9, 3, 4, 3, 2, 4, 9, 0, 8, 9, 8])



In [22]:

    
print(clf.score(X_test, y_test))









    



0.811111111111



In [23]:

    
xr = xgb.XGBRegressor()



In [24]:

    
xr=xr.fit(X_train, y_train)



In [25]:

    
xr.predict(X_test)









    Out[25]:





array([ 5.17454863,  2.31009936,  5.6211586 ,  0.13941881,  0.79184723,
        6.69167852,  5.78008556,  2.75774169,  1.25420761,  3.88986874,
        6.53292084,  7.64552402,  6.02077436,  2.18998647,  1.34927988,
        3.54345942,  7.82803583,  0.77500153,  8.04337502,  5.97606659,
        6.33106804,  3.50779319,  3.05139947,  2.53856802,  4.477005  ,
       -0.28811425,  6.15758705,  1.73706651,  5.25047827,  8.17740345,
        4.87318468,  1.47395992,  4.23157883,  5.02434826,  4.23819637,
        4.44589853,  6.25152922,  2.46546197,  3.01612639,  4.97270441,
        6.26480484,  4.78373098,  4.82396936,  7.77662086,  4.53779554,
        2.62748647,  0.58428752,  7.43105268,  7.84978962,  6.68497181,
       -0.43701953,  1.69745815,  1.19588113,  4.55399942,  5.44729614,
        5.21953487,  5.15919018,  7.47001648,  5.57749367,  8.38232613,
        0.11813346,  1.01219177,  3.26156807,  3.34898233,  3.77228022,
        5.18798351,  5.56377363,  5.55503988,  5.69231129,  9.27871513,
        0.23553038,  2.98534942,  2.9672277 ,  4.37331295,  4.05083179,
        5.35860491,  4.50167513,  6.55852222,  5.99427176,  8.79152489,
       -0.0560739 ,  8.39695549,  4.91200256,  5.08682823,  5.07829237,
        5.55622244, -0.09274095,  8.56768799,  9.40511417,  7.96262836,
        7.87627077,  4.28712797,  1.99141049,  5.79908609,  6.9753952 ,
        5.89846325,  5.84728098,  2.30634046, -0.60442746,  0.28149834,
        1.9418658 ,  1.9422158 ,  6.34814787,  7.74735546,  2.57224393,
        0.31663024,  3.30871034,  3.34071231,  5.37906456,  5.26454973,
        4.34845161,  4.45209503,  4.11032057,  5.4872117 ,  4.70515633,
        4.50495052,  6.04063702,  4.17267513,  4.40274429,  7.88456392,
        3.16085935,  4.5046258 , -0.04774338,  7.89965773,  5.53477383,
        3.49594831,  5.54834604,  2.28996778, -0.36804682, -0.50901437,
        2.3560257 ,  5.90113878,  4.94886303,  4.04959488,  2.13626289,
        2.58525276,  5.28104973,  3.81460381,  5.87017059,  3.28680253,
        2.31958008,  3.75891876,  6.53239727,  1.89289927,  7.19561815,
        6.19357872,  8.21565628,  3.33273292,  4.72547579,  2.16792297,
        4.58852625,  1.53387964,  5.46388054,  3.475353  ,  4.73784971,
        6.14358616,  5.69037294,  2.51056719,  6.40560293,  5.5530138 ,
        4.09691381,  3.43549562,  7.43306684,  2.24002838,  3.57986665,
        2.04830194,  2.78988934,  5.89063787,  5.87044859,  5.56125641,
        5.92334414,  3.58165717,  4.03868723,  4.50158215,  4.0555582 ,
        8.14502335, -0.06778443,  7.20665073,  7.40505505,  7.43251753], dtype=float32)



In [26]:

    
print(xr.score(X_test, y_test))









    



0.76077616868



In [27]:

    
knn = neighbors.KNeighborsClassifier()



In [29]:

    
knn.fit(X_train, y_train).score(X_test, y_test)









    Out[29]:





0.96111111111111114



In [30]:

    
logistic = linear_model.LogisticRegression()



In [31]:

    
logistic.fit(X_train, y_train).score(X_test, y_test)









    Out[31]:





0.93888888888888888



In [38]:

    
clfs = svm.SVC()



In [39]:

    
clfs.fit(X_train, y_train).score(X_test, y_test)









    Out[39]:





0.44444444444444442



In [56]:

    
from sklearn.metrics import accuracy_score, auc, roc_curve



In [49]:

    
accuracy_score(clf.predict(X_test), y_test)









    Out[49]:





0.81111111111111112



In [59]:

    
fpr, tpr, thresholds =  roc_curve(clf.predict(X_test), y_test, pos_label=2)
auc(fpr, tpr)









    Out[59]:





0.32077100115074797



In [50]:

    
accuracy_score(clfs.predict(X_test), y_test)









    Out[50]:





0.44444444444444442



In [60]:

    
fpr, tpr, thresholds =  roc_curve(clfs.predict(X_test), y_test, pos_label=2)
auc(fpr, tpr)









    Out[60]:





0.23428571428571426



In [51]:

    
accuracy_score(knn.predict(X_test), y_test)









    Out[51]:





0.96111111111111114



In [61]:

    
fpr, tpr, thresholds =  roc_curve(knn.predict(X_test), y_test, pos_label=2)
auc(fpr, tpr)









    Out[61]:





0.21896433470507543



In [52]:

    
accuracy_score(logistic.predict(X_test), y_test)









    Out[52]:





0.93888888888888888

! pip install scikit-mdr skrebate

!pip install tpot



In [33]:

    
from tpot import TPOTClassifier









    



C:\Users\KOGENTIX\Anaconda3\lib\site-packages\deap\tools\_hypervolume\pyhv.py:33: ImportWarning: Falling back to the python version of hypervolume module. Expect this to be very slow.
  "module. Expect this to be very slow.", ImportWarning)



In [34]:

    
pipeline_optimizer = TPOTClassifier()









    



C:\Users\KOGENTIX\Anaconda3\lib\importlib\_bootstrap.py:205: ImportWarning: can't resolve package from __spec__ or __package__, falling back on __name__ and __path__
  return f(*args, **kwds)



In [35]:

    
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2)









    



C:\Users\KOGENTIX\Anaconda3\lib\site-packages\deap\creator.py:141: RuntimeWarning: A class named 'FitnessMulti' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.
  RuntimeWarning)
C:\Users\KOGENTIX\Anaconda3\lib\site-packages\deap\creator.py:141: RuntimeWarning: A class named 'Individual' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.
  RuntimeWarning)



In [36]:

    
pipeline_optimizer.fit(X_train, y_train)









    



                                                                                                                       






    



Generation 1 - Current best internal CV score: 0.9382234388034776






    



                                                                                                                       






    



Generation 2 - Current best internal CV score: 0.9462601281990894






    



                                                                                                                       






    



Generation 3 - Current best internal CV score: 0.9629851932956182






    



                                                                                                                       






    



Generation 4 - Current best internal CV score: 0.9629851932956182






    



                                                                                                                       






    



Generation 5 - Current best internal CV score: 0.9647684149987755






    



                                                           






    



Best pipeline: KNeighborsClassifier(Normalizer(input_matrix, norm=max), n_neighbors=6, p=2, weights=distance)






    Out[36]:





TPOTClassifier(config_dict={'sklearn.naive_bayes.GaussianNB': {}, 'sklearn.naive_bayes.BernoulliNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.naive_bayes.MultinomialNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.tree.DecisionT....45,
        0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,  0.8 ,  0.85,  0.9 ,
        0.95,  1.  ])}}}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=5, max_eval_time_mins=5,
        max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=1,
        offspring_size=20, periodic_checkpoint_folder=None,
        population_size=20, random_state=42, scoring=None, subsample=1.0,
        verbosity=2, warm_start=False)



In [41]:

    
print(pipeline_optimizer.score(X_test, y_test))









    



0.961111111111



In [42]:

    
pipeline_optimizer.export('tpot_exported_pipeline.py')









    Out[42]:





True



In [43]:

    
import os as os



In [44]:

    
os.getcwd()









    Out[44]:





'C:\\Users\\KOGENTIX'



In [ ]:

    
# This is the exported pipeline

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#### NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

#### Score on the training set was:0.9647684149987755
exported_pipeline = make_pipeline(
    Normalizer(norm="max"),
    KNeighborsClassifier(n_neighbors=6, p=2, weights="distance")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)



In [45]:

    
##tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2









    



  File "<ipython-input-45-19486b893ab8>", line 1
    tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2
            ^
SyntaxError: invalid syntax



In [ ]: