Creating a train set and a test set from a given dataset for a classification problem
We use the iris dataset.
In [3]:
# Load the iris dataset and return it as a single shuffled array
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import numpy as np

def get_iris_data():
    '''
    Return the Iris dataset as one array, with the class label in the last column
    '''
    data = load_iris()
    x = data['data']
    y = data['target']
    # Stack x and y column-wise so every row carries its own label
    input_dataset = np.column_stack([x, y])
    np.random.shuffle(input_dataset)
    return input_dataset
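Because get_iris_data() shuffles in place with NumPy's global random state, the rows come back in a different order on every run. A minimal sketch of how you might make it repeatable, assuming an arbitrary seed of 7 chosen only for illustration:
np.random.seed(7)            # hypothetical seed; any fixed integer works
input_dataset = get_iris_data()
print(input_dataset.shape)   # (150, 5): 150 samples, 4 features + 1 label column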
In [4]:
# Split the data 80% / 20%
train_rate = 0.8
test_rate = 1 - train_rate
input_dataset = get_iris_data()
train, test = train_test_split(input_dataset, test_size=test_rate)
# Print the shapes of the original data and the two splits
print('origin dataset:', input_dataset.shape)
print('train ', train.shape)
print('test ', test.shape)
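train_test_split also shuffles internally, so the partition changes between runs. If you want the same 80/20 split every time, you can pass its random_state parameter; a small sketch (the value 0 is arbitrary):
train, test = train_test_split(input_dataset, test_size=test_rate, random_state=0)
print(train.shape, test.shape)   # (120, 5) (30, 5)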
After splitting the data, we need to check whether the class label distribution in the train set and the test set matches the expected proportions.
In [5]:
def get_class_distribution(y):
    '''
    Return a dict mapping each class label to its fraction of the records in y
    '''
    distribution = {}
    set_y = set(y)
    for y_label in set_y:
        count = len(np.where(y == y_label)[0])
        distribution[y_label] = count
    total = 1.0 * sum(distribution.values())
    dist_percentage = {class_label: count / total
                       for class_label, count in distribution.items()}
    return dist_percentage
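As a quick sanity check, the full iris target has exactly 50 records per class, so calling the helper on it should report one third for each of the three labels:
full_y = load_iris()['target']
print(get_class_distribution(full_y))
# each of the labels 0, 1, 2 maps to roughly 0.33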
In [6]:
# Print the class label distribution of the train and test sets
def print_class_label_split(train, test):
    y_train = train[:, -1]
    train_distribution = get_class_distribution(y_train)
    print('\nTrain data set class label distribution')
    for k, v in train_distribution.items():
        print('class label = %d, percentage records = %0.2f' % (k, v))

    y_test = test[:, -1]
    test_distribution = get_class_distribution(y_test)
    print('\nTest data set class label distribution')
    for k, v in test_distribution.items():
        print('class label = %d, percentage records = %0.2f' % (k, v))

print_class_label_split(train, test)
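With a plain random split the printed percentages usually drift away from the exact one-third per class, which is what the stratified split in the next cell addresses. If you only need the label proportions without the helper above, NumPy can compute them directly; a minimal sketch:
labels, counts = np.unique(train[:, -1], return_counts=True)
print(dict(zip(labels, counts / counts.sum())))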
In [8]:
# Split so that the class labels are spread evenly across the train and test sets
from sklearn.model_selection import StratifiedShuffleSplit
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=test_rate)
for train_indx, test_indx in stratified_split.split(input_dataset[:, :-1], input_dataset[:, -1]):
    train = input_dataset[train_indx]
    test = input_dataset[test_indx]
    print_class_label_split(train, test)
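A shorter route to the same stratified 80/20 split, assuming a recent scikit-learn, is to hand the label column to train_test_split through its stratify argument; a minimal sketch:
train, test = train_test_split(input_dataset,
                               test_size=test_rate,
                               stratify=input_dataset[:, -1])
print_class_label_split(train, test)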