从分类问题给定的数据集中创建训练集和测试集

使用iris数据集


In [3]:
# 
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
import numpy as np

def get_iris_data():
    """Load the Iris dataset as one shuffled array.

    The feature matrix and the target vector returned by ``load_iris`` are
    joined column-wise, so the class label ends up in the last column of
    every row. The rows are then shuffled in place with NumPy's global
    random state.

    Returns:
        The combined, row-shuffled array (features first, label last).
    """
    iris = load_iris()
    features = iris.data
    labels = iris.target

    # Merge features and labels so that each row carries its own label.
    combined = np.column_stack((features, labels))
    np.random.shuffle(combined)
    return combined

In [4]:
# Split the data: 80% for training, 20% for testing.
train_rate = 0.8
test_rate = 1 - train_rate

input_dataset = get_iris_data()
# Random split; no stratification is requested here.
train, test = train_test_split(input_dataset, test_size=test_rate)

# Report the shapes. Single-argument print(...) with %-formatting emits the
# same text under Python 2 and Python 3 (a bare print statement is a
# SyntaxError on Python 3).
print('origin dataset: %s' % (input_dataset.shape,))
print('train  %s' % (train.shape,))
print('test  %s' % (test.shape,))


origin dataset: (150, 5)
train  (120, 5)
test  (30, 5)

分割数据后，需要检查训练集和测试集中的类别标签分布是否与原始数据集中各类别的比例一致。


In [5]:
def get_class_distribution(y):
    """Return the fraction of samples that belongs to each class label.

    Args:
        y: 1-D array or plain sequence of class labels.

    Returns:
        dict mapping every distinct label in ``y`` to ``count / len(y)``.
        An empty input yields an empty dict.
    """
    # np.asarray lets plain lists/tuples work too; comparing a list with a
    # scalar would not be element-wise and every count would come out as 0.
    labels, counts = np.unique(np.asarray(y), return_counts=True)
    # Compute the total once; float() keeps the division true division on
    # Python 2 as well.
    total = float(counts.sum())
    return {label: count / total for label, count in zip(labels, counts)}

In [6]:
# Print the class-label distribution of a train/test split.
def print_class_label_split(train, test):
    """Print the per-class fraction of records for the train and test sets.

    The class label is read from the last column of each array and the
    fractions are computed with ``get_class_distribution``.

    Args:
        train: 2-D array of training records, class label in the last column.
        test: 2-D array of test records, class label in the last column.
    """
    # Single-argument print(...) produces the same output on Python 2 and 3.
    y_train = train[:, -1]
    train_distribution = get_class_distribution(y_train)
    print('\nTrain data set class label distribution')
    # Sort so the labels are always listed in the same order instead of
    # whatever order the dict happens to iterate in.
    for k, v in sorted(train_distribution.items()):
        print('class label = %d,percentage records=%0.2f' % (k, v))

    y_test = test[:, -1]
    test_distribution = get_class_distribution(y_test)
    print('\nTest data set class label distribution')
    for k, v in sorted(test_distribution.items()):
        print('class label = %d,percentage records =%0.2f' % (k, v))
print_class_label_split(train, test)


Train data set class label distribution
class label = 0,percentage records=0.33
class label = 1,percentage records=0.34
class label = 2,percentage records=0.33

Test data set class label distribution
class label = 0,percentage records =0.33
class label = 1,percentage records =0.30
class label = 2,percentage records =0.37

In [8]:
# Stratified split: keep the class proportions the same in the training set
# and the test set.
labels = input_dataset[:, -1]
try:
    # scikit-learn >= 0.18; sklearn.cross_validation was removed in 0.20.
    from sklearn.model_selection import StratifiedShuffleSplit
except ImportError:
    # Legacy API: the labels go to the constructor (n_iter = number of
    # splits) and the splitter object itself is iterated.
    from sklearn.cross_validation import StratifiedShuffleSplit
    stratified_split = StratifiedShuffleSplit(labels, test_size=test_rate, n_iter=1)
    split_indices = stratified_split
else:
    # Current API: the data and labels are passed to split().
    stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=test_rate)
    split_indices = stratified_split.split(input_dataset, labels)

for train_indx, test_indx in split_indices:
    train = input_dataset[train_indx]
    test = input_dataset[test_indx]
    print_class_label_split(train, test)


Train data set class label distribution
class label = 0,percentage records=0.33
class label = 1,percentage records=0.33
class label = 2,percentage records=0.33

Test data set class label distribution
class label = 0,percentage records =0.33
class label = 1,percentage records =0.33
class label = 2,percentage records =0.33