In [2]:
# Render matplotlib figures inside the notebook output area.
%matplotlib inline
In [12]:
# Show the contents of the data/ directory (IPython automagic for the shell `ls`).
ls data/
In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the headerless, comma-separated file; the four leading columns are
# named A-D and the fifth column is named "label".
df = pd.read_csv(
    'data/iris.data',
    sep=',',
    header=None,
    names=['A', 'B', "C", "D", "label"],
)
# Keep the label column and the four remaining columns as separate objects.
label = df['label']
dataset = df[['A', 'B', 'C', 'D']]
label.head()
# Only this last expression is rendered as the cell output.
dataset.head()
Out[60]:
In [74]:
# Per-column maximum and minimum of the feature frame.
# print(...) with a single argument behaves the same on Python 2 and
# Python 3, whereas the bare `print x` statement is a SyntaxError on Python 3.
dfmax = dataset.max()
print(dfmax)
dfmin = dataset.min()
print(dfmin)
# Per-column range (max - min). `dataset` holds exactly the four feature
# columns, so no positional slicing of the Series is needed.
dflen = dfmax - dfmin
print(dflen)
# Cell output: the per-column range.
dflen
Out[74]:
In [83]:
# Rescale each column: subtract the column midpoint (max + min) / 2, then
# divide by the half-range (max - min) / 2, so each column's minimum maps to
# -1 and its maximum to +1.
scaledDataSet = (
    dataset
    .sub((dfmax + dfmin) / 2, axis='columns')
    .div(dflen / 2, axis='columns')
)
In [85]:
# Preview the first five rescaled rows.
scaledDataSet.head(n=5)
Out[85]:
In [91]:
# Overwrite the four feature columns of df with their rescaled values;
# the label column is left as it is.
df[['A', 'B', 'C', 'D']] = scaledDataSet
# Cell output: (rows, columns) of the updated frame.
df.shape
Out[91]:
In [170]:
# Distinct label values, in order of first appearance in the label column.
tmp = label.drop_duplicates()
tmp.values
# Look up where one class name falls inside the distinct values.
# NOTE: searchsorted is a binary search, so the answer is only meaningful
# when tmp.values is in ascending order.
np.searchsorted(tmp.values, "Iris-virginica")
Out[170]:
In [184]:
# Replace the label strings with integer class ids starting at 1, numbered
# in order of first appearance in `label`.
# The previous searchsorted lookup is a binary search and is only correct
# when the distinct labels happen to be in ascending order; pd.factorize
# assigns codes by first appearance without any ordering requirement.
# factorize marks missing labels with -1, so they would become id 0.
label_codes, _ = pd.factorize(label)
df['label'] = label_codes + 1
Out[184]:
In [185]:
# Write the frame (index included, pandas default) as CSV to data/test.tmp.
df.to_csv(path_or_buf='data/test.tmp')
In [ ]:
In [ ]:
In [ ]: