In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
In [11]:
# Relative location of the raw red-wine quality CSV that the notebook loads.
redSetPath = 'classification/winequality-red.csv'
# whiteSetPath = "classification/winequality-white.csv"
In [12]:
# Load the raw red-wine data into a DataFrame.
# The file is read with ';' as the field separator rather than the default ','.
redSet = pd.read_csv(filepath_or_buffer=redSetPath, sep=";")
# whiteSet = pd.read_csv(whiteSetPath, sep=';')
In [14]:
# redSet.drop(['index'], axis=1, inplace=True)
# Preview the first five rows as a quick check that the file parsed as expected.
redSet.head(n=5)
Out[14]:
In [15]:
# whiteSet.head()
In [16]:
# Three-way split of the red-wine data.
#   1. Hold out 30% of all rows as the test set.
#   2. Take 20% of the remaining 70% as the validation set.
# Net proportions are therefore roughly 56% train / 14% validation / 30% test
# (not a plain 70/30 split).
# A fixed random_state makes both shuffles repeatable, so re-running the
# notebook regenerates exactly the same partitions.
red_train, red_test = train_test_split(redSet, test_size=0.30, random_state=42)
red_train, red_valid = train_test_split(red_train, test_size=0.20, random_state=42)
# white_train, white_test = train_test_split(whiteSet,test_size=0.30)
# white_train, white_valid = train_test_split(white_train,test_size=0.20)
In [17]:
# Red Wine: destination CSV files for the train / validation / test partitions.
red_train_path, red_valid_path, red_test_path = (
    "classification/red_train.csv",
    "classification/red_valid.csv",
    "classification/red_test.csv",
)
# # White Wine
# white_train_path = "classification/white_train.csv"
# white_valid_path = "classification/white_valid.csv"
# white_test_path = "classification/white_test.csv"
In [18]:
# Write each red-wine partition to its own CSV file.
# index=False leaves the DataFrame index out of the written files.
red_train.to_csv(red_train_path, index=False)
red_valid.to_csv(red_valid_path, index=False)
red_test.to_csv(red_test_path, index=False)
# white_train.to_csv(path_or_buf=white_train_path, sep=';')
# white_valid.to_csv(path_or_buf=white_valid_path, sep=';')
# white_test.to_csv(path_or_buf=white_test_path, sep=';')
In [19]:
# Report how many rows ended up in each red-wine partition.
# print is called as a function with a single pre-formatted string: this form
# produces the same output under Python 2 and also runs under Python 3, where
# the bare `print '...'` statement is a SyntaxError.
print('Red Wine - Number of Instances Per Set')
print('Training Set: %d' % len(red_train))
print('Validation Set: %d' % len(red_valid))
print('Testing Set: %d' % len(red_test))
# print('')
# print('')
# print('White Wine - Number of Instances Per Set')
# print('Training Set: %d' % len(white_train))
# print('Validation Set: %d' % len(white_valid))
# print('Testing Set: %d' % len(white_test))
In [ ]: