Sklearn

sklearn.cross_validation


In [1]:
from sklearn import cross_validation, datasets

import numpy as np


/home/andrey/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

A one-off split of the data into training and test sets with train_test_split


In [2]:
iris = datasets.load_iris()

In [3]:
train_data, test_data, train_labels, test_labels = cross_validation.train_test_split(iris.data, iris.target, 
                                                                                     test_size = 0.3)

In [4]:
# make sure that the test set really makes up 0.3 of the whole dataset
float(len(test_labels))/len(iris.data)


Out[4]:
0.3

In [5]:
print 'Training set size: {} objects \nTest set size: {} objects'.format(len(train_data),
                                                                         len(test_data))


Training set size: 105 objects 
Test set size: 45 objects

In [6]:
print 'Training set:\n', train_data[:5]
print '\n'
print 'Test set:\n', test_data[:5]


Training set:
[[ 6.7  3.1  5.6  2.4]
 [ 5.   3.2  1.2  0.2]
 [ 6.3  3.3  4.7  1.6]
 [ 6.4  2.8  5.6  2.1]
 [ 5.5  3.5  1.3  0.2]]


Test set:
[[ 5.1  3.3  1.7  0.5]
 [ 4.7  3.2  1.3  0.2]
 [ 5.9  3.   5.1  1.8]
 [ 5.1  3.8  1.6  0.2]
 [ 6.9  3.1  5.4  2.1]]

In [7]:
print 'Class labels on the training set:\n', train_labels
print '\n'
print 'Class labels on the test set:\n', test_labels


Class labels on the training set:
[2 0 1 2 0 2 2 2 0 2 1 1 0 2 0 1 1 0 0 2 1 2 0 1 0 2 2 1 2 0 1 0 0 1 0 2 0
 0 1 2 1 1 1 2 2 2 1 1 1 2 2 2 0 0 1 2 1 1 2 2 0 0 1 0 1 2 1 1 0 1 0 0 2 2
 0 1 1 2 2 1 0 2 2 1 2 0 0 2 2 1 0 0 2 1 0 1 1 2 2 2 2 2 0 0 2]


Class labels on the test set:
[0 0 2 0 2 0 2 0 1 1 0 1 1 0 2 1 0 2 0 0 0 1 1 2 1 1 0 2 1 2 1 0 0 0 1 1 1
 0 1 2 1 0 1 0 2]
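
For reference, the same one-off split with the non-deprecated API. A minimal sketch, assuming scikit-learn >= 0.18 (random_state here is only an illustrative addition that makes the split repeatable):

from sklearn.model_selection import train_test_split

# same call signature; only the import location changes
train_data, test_data, train_labels, test_labels = train_test_split(iris.data, iris.target,
                                                                    test_size = 0.3, random_state = 0)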

Cross-validation strategies

KFold


In [8]:
for train_indices, test_indices in cross_validation.KFold(10, n_folds = 5):
    print train_indices, test_indices


[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]

In [9]:
for train_indices, test_indices in cross_validation.KFold(10, n_folds = 2, shuffle = True):
    print train_indices, test_indices


[1 2 4 7 9] [0 3 5 6 8]
[0 3 5 6 8] [1 2 4 7 9]

In [10]:
for train_indices, test_indices in cross_validation.KFold(10, n_folds = 2, shuffle = True, random_state = 1):
    print train_indices, test_indices


[1 3 5 7 8] [0 2 4 6 9]
[0 2 4 6 9] [1 3 5 7 8]
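
In sklearn.model_selection the iterator takes n_splits instead of the number of objects, and the data are passed to the split() method. A minimal sketch of the equivalent of the cell above, assuming scikit-learn >= 0.18 (X is just a placeholder array of 10 objects):

from sklearn.model_selection import KFold

X = np.zeros((10, 1))                      # placeholder: only the number of objects matters
kf = KFold(n_splits = 2, shuffle = True, random_state = 1)
for train_indices, test_indices in kf.split(X):
    print train_indices, test_indices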

StratifiedKFold


In [11]:
target = np.array([0] * 5 + [1] * 5)
print target
for train_indices, test_indices in cross_validation.StratifiedKFold(target, n_folds = 2, shuffle = True, random_state = 0):
    print train_indices, test_indices


[0 0 0 0 0 1 1 1 1 1]
[3 4 8 9] [0 1 2 5 6 7]
[0 1 2 5 6 7] [3 4 8 9]

In [12]:
target = np.array([0, 1] * 5)
print target
for train_indices, test_indices in cross_validation.StratifiedKFold(target, n_folds = 2,shuffle = True):
    print train_indices, test_indices


[0 1 0 1 0 1 0 1 0 1]
[0 3 7 8] [1 2 4 5 6 9]
[1 2 4 5 6 9] [0 3 7 8]
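
The model_selection version of StratifiedKFold no longer takes the labels in the constructor; they are passed to split() together with a feature matrix. A minimal sketch, assuming scikit-learn >= 0.18 (X is a placeholder; only the labels matter for stratification):

from sklearn.model_selection import StratifiedKFold

target = np.array([0, 1] * 5)
X = np.zeros((10, 1))                      # placeholder features
skf = StratifiedKFold(n_splits = 2, shuffle = True)
for train_indices, test_indices in skf.split(X, target):
    print train_indices, test_indices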

ShuffleSplit


In [19]:
for train_indices, test_indices in cross_validation.ShuffleSplit(10, n_iter = 10, test_size = 0.2):
    print train_indices, test_indices


[3 4 9 5 2 6 7 8] [1 0]
[6 9 8 0 5 2 1 4] [3 7]
[2 1 7 5 3 4 9 6] [8 0]
[2 0 8 3 7 6 9 4] [1 5]
[8 7 3 2 1 9 0 5] [4 6]
[9 8 2 4 0 3 1 5] [6 7]
[6 7 4 2 5 9 8 0] [3 1]
[5 7 9 3 6 0 8 2] [1 4]
[7 3 2 0 9 8 4 1] [5 6]
[6 9 5 7 2 1 4 0] [3 8]
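
In the new API the parameter n_iter is called n_splits, and the number of objects is inferred from the data passed to split(). A minimal sketch, assuming scikit-learn >= 0.18:

from sklearn.model_selection import ShuffleSplit

X = np.zeros((10, 1))                      # placeholder: only the number of objects matters
ss = ShuffleSplit(n_splits = 10, test_size = 0.2)
for train_indices, test_indices in ss.split(X):
    print train_indices, test_indices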

StratifiedShuffleSplit


In [14]:
target = np.array([0] * 5 + [1] * 5)
print target
for train_indices, test_indices in cross_validation.StratifiedShuffleSplit(target, n_iter = 4, test_size = 0.2):
    print train_indices, test_indices


[0 0 0 0 0 1 1 1 1 1]
[0 8 4 6 7 1 9 2] [3 5]
[4 3 9 2 5 0 6 7] [8 1]
[9 6 2 7 1 5 3 4] [8 0]
[0 4 7 6 8 2 9 1] [3 5]
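
The model_selection version of StratifiedShuffleSplit follows the same pattern: n_splits instead of n_iter, and the labels go to split(). A minimal sketch, assuming scikit-learn >= 0.18:

from sklearn.model_selection import StratifiedShuffleSplit

target = np.array([0] * 5 + [1] * 5)
X = np.zeros((10, 1))                      # placeholder features
sss = StratifiedShuffleSplit(n_splits = 4, test_size = 0.2)
for train_indices, test_indices in sss.split(X, target):
    print train_indices, test_indices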

Leave-One-Out


In [15]:
for train_indices, test_index in cross_validation.LeaveOneOut(10):
    print train_indices, test_index


[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]
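
In sklearn.model_selection LeaveOneOut takes no arguments at all; the number of objects is inferred from the data passed to split(). A minimal sketch, assuming scikit-learn >= 0.18:

from sklearn.model_selection import LeaveOneOut

X = np.zeros((10, 1))                      # placeholder: only the number of objects matters
loo = LeaveOneOut()
for train_indices, test_index in loo.split(X):
    print train_indices, test_index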

More cross-validation strategies are available here: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators