In [1]:
!pip install numpy scipy scikit-learn pandas


Requirement already satisfied: numpy in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: scipy in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: scikit-learn in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: pandas in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: python-dateutil>=2 in c:\users\kogentix\anaconda3\lib\site-packages (from pandas)
Requirement already satisfied: pytz>=2011k in c:\users\kogentix\anaconda3\lib\site-packages (from pandas)
Requirement already satisfied: six>=1.5 in c:\users\kogentix\anaconda3\lib\site-packages (from python-dateutil>=2->pandas)

In [2]:
!pip install deap update_checker tqdm stopit


Requirement already satisfied: deap in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: update_checker in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: tqdm in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: stopit in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: requests>=2.3.0 in c:\users\kogentix\anaconda3\lib\site-packages (from update_checker)

In [53]:
#Using the Python Library
import numpy as np
import xgboost as xgb

In [4]:
print(dir(xgb))


['Booster', 'DMatrix', 'VERSION_FILE', 'XGBClassifier', 'XGBModel', 'XGBRegressor', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', 'absolute_import', 'callback', 'compat', 'core', 'cv', 'f', 'libpath', 'os', 'plot_importance', 'plot_tree', 'plotting', 'rabit', 'sklearn', 'to_graphviz', 'train', 'training']

In [5]:
import sys
sys.path.append("C:\\Users\\KOGENTIX\\xgboost\\python-package\\")
print (sys.path)


['', 'C:\\Users\\KOGENTIX\\Anaconda3\\python36.zip', 'C:\\Users\\KOGENTIX\\Anaconda3\\DLLs', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib', 'C:\\Users\\KOGENTIX\\Anaconda3', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages\\Sphinx-1.5.6-py3.6.egg', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages\\xgboost-0.6-py3.6.egg', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages\\win32', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages\\win32\\lib', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages\\Pythonwin', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages\\setuptools-27.2.0-py3.6.egg', 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages\\IPython\\extensions', 'C:\\Users\\KOGENTIX\\.ipython', 'C:\\Users\\KOGENTIX\\xgboost\\python-package\\']

In [37]:
from sklearn import datasets, neighbors, linear_model, tree, svm

In [18]:
digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

n_samples = len(X_digits)
n_samples


Out[18]:
1797

In [19]:
X_train = X_digits[:int(.9 * n_samples)]
y_train = y_digits[:int(.9 * n_samples)]
X_test = X_digits[int(.9 * n_samples):]
y_test = y_digits[int(.9 * n_samples):]

In [65]:
y_train


Out[65]:
array([0, 1, 2, ..., 5, 0, 9])

In [68]:
from scipy import stats

In [69]:
stats.itemfreq(y_train)


Out[69]:
array([[  0, 162],
       [  1, 163],
       [  2, 160],
       [  3, 165],
       [  4, 161],
       [  5, 164],
       [  6, 163],
       [  7, 160],
       [  8, 157],
       [  9, 162]], dtype=int64)

In [21]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.predict(X_test)


Out[21]:
array([5, 2, 2, 0, 1, 7, 6, 3, 2, 1, 5, 4, 3, 3, 1, 3, 7, 1, 3, 6, 1, 4, 3,
       1, 4, 0, 5, 3, 5, 9, 4, 1, 7, 5, 4, 4, 7, 2, 2, 5, 3, 8, 5, 9, 4, 7,
       0, 5, 7, 7, 0, 8, 2, 5, 4, 5, 6, 7, 2, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8,
       9, 0, 8, 2, 2, 4, 5, 6, 7, 8, 9, 0, 9, 5, 5, 6, 5, 0, 9, 8, 9, 8, 4,
       1, 7, 7, 1, 5, 1, 0, 0, 9, 2, 7, 8, 2, 0, 1, 2, 6, 2, 2, 7, 4, 3, 4,
       6, 6, 6, 4, 9, 1, 6, 0, 9, 5, 2, 8, 1, 0, 0, 6, 7, 6, 1, 2, 1, 7, 4,
       6, 3, 1, 2, 9, 1, 7, 6, 8, 4, 5, 1, 4, 0, 5, 3, 6, 9, 6, 1, 7, 5, 4,
       4, 7, 2, 8, 2, 2, 5, 7, 9, 3, 4, 3, 2, 4, 9, 0, 8, 9, 8])

In [22]:
print(clf.score(X_test, y_test))


0.811111111111

In [23]:
xr = xgb.XGBRegressor()

In [24]:
xr=xr.fit(X_train, y_train)

In [25]:
xr.predict(X_test)


Out[25]:
array([ 5.17454863,  2.31009936,  5.6211586 ,  0.13941881,  0.79184723,
        6.69167852,  5.78008556,  2.75774169,  1.25420761,  3.88986874,
        6.53292084,  7.64552402,  6.02077436,  2.18998647,  1.34927988,
        3.54345942,  7.82803583,  0.77500153,  8.04337502,  5.97606659,
        6.33106804,  3.50779319,  3.05139947,  2.53856802,  4.477005  ,
       -0.28811425,  6.15758705,  1.73706651,  5.25047827,  8.17740345,
        4.87318468,  1.47395992,  4.23157883,  5.02434826,  4.23819637,
        4.44589853,  6.25152922,  2.46546197,  3.01612639,  4.97270441,
        6.26480484,  4.78373098,  4.82396936,  7.77662086,  4.53779554,
        2.62748647,  0.58428752,  7.43105268,  7.84978962,  6.68497181,
       -0.43701953,  1.69745815,  1.19588113,  4.55399942,  5.44729614,
        5.21953487,  5.15919018,  7.47001648,  5.57749367,  8.38232613,
        0.11813346,  1.01219177,  3.26156807,  3.34898233,  3.77228022,
        5.18798351,  5.56377363,  5.55503988,  5.69231129,  9.27871513,
        0.23553038,  2.98534942,  2.9672277 ,  4.37331295,  4.05083179,
        5.35860491,  4.50167513,  6.55852222,  5.99427176,  8.79152489,
       -0.0560739 ,  8.39695549,  4.91200256,  5.08682823,  5.07829237,
        5.55622244, -0.09274095,  8.56768799,  9.40511417,  7.96262836,
        7.87627077,  4.28712797,  1.99141049,  5.79908609,  6.9753952 ,
        5.89846325,  5.84728098,  2.30634046, -0.60442746,  0.28149834,
        1.9418658 ,  1.9422158 ,  6.34814787,  7.74735546,  2.57224393,
        0.31663024,  3.30871034,  3.34071231,  5.37906456,  5.26454973,
        4.34845161,  4.45209503,  4.11032057,  5.4872117 ,  4.70515633,
        4.50495052,  6.04063702,  4.17267513,  4.40274429,  7.88456392,
        3.16085935,  4.5046258 , -0.04774338,  7.89965773,  5.53477383,
        3.49594831,  5.54834604,  2.28996778, -0.36804682, -0.50901437,
        2.3560257 ,  5.90113878,  4.94886303,  4.04959488,  2.13626289,
        2.58525276,  5.28104973,  3.81460381,  5.87017059,  3.28680253,
        2.31958008,  3.75891876,  6.53239727,  1.89289927,  7.19561815,
        6.19357872,  8.21565628,  3.33273292,  4.72547579,  2.16792297,
        4.58852625,  1.53387964,  5.46388054,  3.475353  ,  4.73784971,
        6.14358616,  5.69037294,  2.51056719,  6.40560293,  5.5530138 ,
        4.09691381,  3.43549562,  7.43306684,  2.24002838,  3.57986665,
        2.04830194,  2.78988934,  5.89063787,  5.87044859,  5.56125641,
        5.92334414,  3.58165717,  4.03868723,  4.50158215,  4.0555582 ,
        8.14502335, -0.06778443,  7.20665073,  7.40505505,  7.43251753], dtype=float32)

In [26]:
print(xr.score(X_test, y_test))


0.76077616868

In [27]:
knn = neighbors.KNeighborsClassifier()

In [29]:
knn.fit(X_train, y_train).score(X_test, y_test)


Out[29]:
0.96111111111111114

In [30]:
logistic = linear_model.LogisticRegression()

In [31]:
logistic.fit(X_train, y_train).score(X_test, y_test)


Out[31]:
0.93888888888888888

In [38]:
clfs = svm.SVC()

In [39]:
clfs.fit(X_train, y_train).score(X_test, y_test)


Out[39]:
0.44444444444444442

In [56]:
from sklearn.metrics import accuracy_score, auc, roc_curve

In [49]:
accuracy_score(clf.predict(X_test), y_test)


Out[49]:
0.81111111111111112

In [59]:
fpr, tpr, thresholds =  roc_curve(clf.predict(X_test), y_test, pos_label=2)
auc(fpr, tpr)


Out[59]:
0.32077100115074797

In [50]:
accuracy_score(clfs.predict(X_test), y_test)


Out[50]:
0.44444444444444442

In [60]:
fpr, tpr, thresholds =  roc_curve(clfs.predict(X_test), y_test, pos_label=2)
auc(fpr, tpr)


Out[60]:
0.23428571428571426

In [51]:
accuracy_score(knn.predict(X_test), y_test)


Out[51]:
0.96111111111111114

In [61]:
fpr, tpr, thresholds =  roc_curve(knn.predict(X_test), y_test, pos_label=2)
auc(fpr, tpr)


Out[61]:
0.21896433470507543

In [52]:
accuracy_score(logistic.predict(X_test), y_test)


Out[52]:
0.93888888888888888

! pip install scikit-mdr skrebate

!pip install tpot


In [33]:
from tpot import TPOTClassifier


C:\Users\KOGENTIX\Anaconda3\lib\site-packages\deap\tools\_hypervolume\pyhv.py:33: ImportWarning: Falling back to the python version of hypervolume module. Expect this to be very slow.
  "module. Expect this to be very slow.", ImportWarning)

In [34]:
pipeline_optimizer = TPOTClassifier()


C:\Users\KOGENTIX\Anaconda3\lib\importlib\_bootstrap.py:205: ImportWarning: can't resolve package from __spec__ or __package__, falling back on __name__ and __path__
  return f(*args, **kwds)

In [35]:
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2)


C:\Users\KOGENTIX\Anaconda3\lib\site-packages\deap\creator.py:141: RuntimeWarning: A class named 'FitnessMulti' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.
  RuntimeWarning)
C:\Users\KOGENTIX\Anaconda3\lib\site-packages\deap\creator.py:141: RuntimeWarning: A class named 'Individual' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.
  RuntimeWarning)

In [36]:
pipeline_optimizer.fit(X_train, y_train)


                                                                                                                       
Generation 1 - Current best internal CV score: 0.9382234388034776
                                                                                                                       
Generation 2 - Current best internal CV score: 0.9462601281990894
                                                                                                                       
Generation 3 - Current best internal CV score: 0.9629851932956182
                                                                                                                       
Generation 4 - Current best internal CV score: 0.9629851932956182
                                                                                                                       
Generation 5 - Current best internal CV score: 0.9647684149987755
                                                           
Best pipeline: KNeighborsClassifier(Normalizer(input_matrix, norm=max), n_neighbors=6, p=2, weights=distance)
Out[36]:
TPOTClassifier(config_dict={'sklearn.naive_bayes.GaussianNB': {}, 'sklearn.naive_bayes.BernoulliNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.naive_bayes.MultinomialNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.tree.DecisionT....45,
        0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,  0.8 ,  0.85,  0.9 ,
        0.95,  1.  ])}}}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=5, max_eval_time_mins=5,
        max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=1,
        offspring_size=20, periodic_checkpoint_folder=None,
        population_size=20, random_state=42, scoring=None, subsample=1.0,
        verbosity=2, warm_start=False)

In [41]:
print(pipeline_optimizer.score(X_test, y_test))


0.961111111111

In [42]:
pipeline_optimizer.export('tpot_exported_pipeline.py')


Out[42]:
True

In [43]:
import os as os

In [44]:
os.getcwd()


Out[44]:
'C:\\Users\\KOGENTIX'

In [ ]:
# This is the exported pipeline

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#### NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

#### Score on the training set was:0.9647684149987755
exported_pipeline = make_pipeline(
    Normalizer(norm="max"),
    KNeighborsClassifier(n_neighbors=6, p=2, weights="distance")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

In [45]:
##tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2


  File "<ipython-input-45-19486b893ab8>", line 1
    tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2
            ^
SyntaxError: invalid syntax

In [ ]: