Homework 2 - Classification Dataset


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

Importing the datasets


In [11]:
# Relative path to the red-wine quality CSV that the next cell loads.
redSetPath = "classification/winequality-red.csv"
# White-wine counterpart (currently disabled).
# whiteSetPath = "classification/winequality-white.csv"

In [12]:
# Read the raw red-wine data. The fields in this file are separated by ';'
# rather than pandas' default ',', so the separator is passed explicitly.
redSet = pd.read_csv(redSetPath, sep=';')
# whiteSet = pd.read_csv(whiteSetPath, sep=';')

In [14]:
# (disabled) would remove an 'index' column from the frame:
# redSet.drop(['index'], axis=1, inplace=True)
# Preview the first rows to sanity-check the parse and the column names.
redSet.head()


Out[14]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5

In [15]:
# whiteSet.head()

Breaking datasets into training, validation, and testing sets


In [16]:
# Split the data into three disjoint subsets:
#   1. hold out 30% of all rows as the test set;
#   2. carve 20% of the remaining 70% off as the validation set.
# Net proportions are therefore roughly 56% train / 14% validation / 30% test.
# random_state is fixed so that re-running the notebook reproduces the same
# split (and therefore the same saved CSV files) every time.
red_trainval, red_test = train_test_split(redSet, test_size=0.30, random_state=42)
red_train, red_valid = train_test_split(red_trainval, test_size=0.20, random_state=42)

# white_trainval, white_test = train_test_split(whiteSet, test_size=0.30, random_state=42)
# white_train, white_valid   = train_test_split(white_trainval, test_size=0.20, random_state=42)

Saving the train, validation, and test datasets


In [17]:
# Red Wine: output locations for the three splits, written by the next cell.
red_train_path = "classification/red_train.csv"
red_valid_path = "classification/red_valid.csv"
red_test_path  = "classification/red_test.csv"

# # White Wine (currently disabled)
# white_train_path  = "classification/white_train.csv"
# white_valid_path  = "classification/white_valid.csv"
# white_test_path   = "classification/white_test.csv"

In [18]:
# Persist each red-wine split to its CSV file. index=False keeps the
# DataFrame's row index out of the written files.
for split_frame, split_path in (
    (red_train, red_train_path),
    (red_valid, red_valid_path),
    (red_test, red_test_path),
):
    split_frame.to_csv(path_or_buf=split_path, index=False)

# white_train.to_csv(path_or_buf=white_train_path, sep=';')
# white_valid.to_csv(path_or_buf=white_valid_path, sep=';')
# white_test.to_csv(path_or_buf=white_test_path, sep=';')

Checking the number of instances in each split:


In [19]:
# Report how many rows ended up in each split. The parenthesised,
# single-argument form of print produces the same output under Python 2
# and is valid syntax under Python 3, unlike the bare print statement.
print('Red Wine - Number of Instances Per Set')
print('Training Set:   %d' % len(red_train))
print('Validation Set: %d' % len(red_valid))
print('Testing Set:    %d' % len(red_test))

# print('')
# print('')

# print('White Wine - Number of Instances Per Set')
# print('Training Set:   %d' % len(white_train))
# print('Validation Set: %d' % len(white_valid))
# print('Testing Set:    %d' % len(white_test))


Red Wine - Number of Instances Per Set
Training Set:   895
Validation Set: 224
Testing Set:    480

In [ ]: