In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
In [2]:
# Load the CSV and shuffle its rows in a single chain.
#   - frac=1 samples every row, i.e. a full permutation of the frame
#   - random_state=32 pins the permutation so the shuffle is repeatable
#   - reset_index() renumbers the rows and keeps the previous row labels
#     in a new 'index' column
filename = 'log_reg/parkinsons.data'
dataset = (
    pd.read_csv(filename)
    .sample(frac=1, random_state=32)
    .reset_index()
)
dataset.head()
Out[2]:
In [3]:
# Display the column labels of the shuffled frame.
dataset.columns
Out[3]:
In [4]:
# Remove the 'index' column (added by reset_index() when the frame was
# shuffled) and the 'name' column.
# The result is reassigned rather than mutated in place, and
# errors='ignore' skips labels that are already gone, so re-running this
# cell is a no-op instead of raising KeyError.
dataset = dataset.drop(['index', 'name'], axis=1, errors='ignore')
dataset.head()
Out[4]:
In [5]:
# Split Into Training & Testing Sets (30% of the rows held out for testing).
# random_state pins the split so that re-running the notebook reproduces the
# same partition and therefore the same CSV files; 32 matches the seed used
# for the shuffle above.
train, test = train_test_split(dataset, test_size=0.30, random_state=32)
#train, valid = train_test_split(train, test_size=0.20, random_state=32)
# Write to Text Data (index=False keeps the row labels out of the files)
train.to_csv('log_reg/parkinsons_train.csv',index=False)
#valid.to_csv('log_reg/valid_parkinsons.csv',index=False)
test.to_csv('log_reg/parkinsons_test.csv',index=False)
In [6]:
# Report how many rows ended up in each split.
# print is called with a single parenthesised argument, which produces the
# same output under Python 2 (print statement) and Python 3 (print function).
print('Number of Instances Per Set')
print('Training Set: %d' % len(train))
#print('Validation Set: %d' % len(valid))
print('Testing Set: %d' % len(test))