In [1]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
import numpy as np
from mclearn.tools import fetch_data, download_data
In [2]:
# Base URL of the UCI Machine Learning Repository; every dataset below
# is fetched from a path relative to this.
uci_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/'
The standard datasets are taken from the UCI Machine Learning Repository. For each dataset, header rows are added manually.
Ionosphere radar data, where 'good' is the positive label.
In [3]:
# Ionosphere radar returns: 34 features (x1..x34) plus a binary target,
# with 'g' (good) mapped to 1 and 'b' (bad) to 0.
url = uci_url + 'ionosphere/ionosphere.data'
dest = 'data/ionosphere.csv'
feature_names = ['x{0}'.format(i) for i in range(1, 35)]
header = ','.join(feature_names) + ',target'
data = fetch_data(url, dest, header, label={'b': 0, 'g': 1})
data.head()
Out[3]:
Pima Indians diabetes.
In [4]:
# Pima Indians diabetes. Rows with a diastolic blood pressure of 0
# are dropped — presumably 0 is a stand-in for a missing measurement.
url = uci_url + 'pima-indians-diabetes/pima-indians-diabetes.data'
dest = 'data/pima.csv'
header = 'preg,glucose,diastolic,skin,insulin,bmi,pedi,age,target'

def remove_missing(df):
    """Keep only rows with a positive diastolic blood pressure."""
    return df[df['diastolic'] > 0]

data = fetch_data(url, dest, header, process_fn=remove_missing)
data.head()
Out[4]:
Sonar data, where we want to discriminate between sonar signals bounced off a metal cylinder and those bounced off a roughly cylindrical rock. Metal cylinder is considered as the positive label.
In [5]:
# Sonar returns: 60 energy features (e1..e60); metal cylinder ('M')
# is the positive class, rock ('R') the negative.
url = uci_url + 'undocumented/connectionist-bench/sonar/sonar.all-data'
dest = 'data/sonar.csv'
columns = ['e{0}'.format(i) for i in range(1, 61)] + ['target']
header = ','.join(columns)
data = fetch_data(url, dest, header, label={'R': 0, 'M': 1})
data.head()
Out[5]:
Prognostic Wisconsin breast cancer. Recurrence is the positive label.
In [6]:
# Prognostic Wisconsin breast cancer: 10 cell-nucleus measurements,
# each repeated three times (suffixes 1, 2, 3), plus id, follow-up
# time, tumour size and lymph node status. Recurrence ('R') is the
# positive label, non-recurrence ('N') the negative.
url = uci_url + 'breast-cancer-wisconsin/wpbc.data'
dest = 'data/wpbc.csv'
measurements = ['rad', 'text', 'peri', 'area', 'smooth',
                'compact', 'concave', 'conpt', 'sym', 'fract']
feature_names = ['{0}{1}'.format(m, i) for i in (1, 2, 3) for m in measurements]
header = ','.join(['id', 'target', 'time'] + feature_names + ['tumor', 'lymph'])
data = fetch_data(url, dest, header, label={'N': 0, 'R': 1})
data.head()
Out[6]:
MAGIC gamma telescope.
In [3]:
# MAGIC gamma telescope: gamma events ('g') are the positive class,
# hadron background ('h') the negative.
url = uci_url + 'magic/magic04.data'
dest = 'data/magic.csv'
feature_names = ['length', 'width', 'size', 'conc', 'conc1',
                 'asym', 'm3long', 'm3trans', 'alpha', 'dist']
header = ','.join(feature_names + ['target'])
data = fetch_data(url, dest, header, label={'g': 1, 'h': 0})
data.head()
Out[3]:
MiniBooNE particle identification
In [7]:
# MiniBooNE particle identification: 50 features (e1..e50). The first
# line of the raw file is skipped; rows up to index 36498 are labelled
# 1 (signal) and the rest 0 (background) — presumably the skipped
# header line records this split; confirm against the raw file.
# -999 marks missing values, and rows containing them are dropped.
url = uci_url + '00199/MiniBooNE_PID.txt'
dest = 'data/miniboone.csv'
download_data(url, dest)
header = ['e{0}'.format(i) for i in np.arange(1, 51)]
# Raw string: '\s' is an invalid escape sequence in a plain string
# (deprecated in Python 3); also matches the style used elsewhere
# in this notebook (yeast cell).
data = pd.read_csv(dest, sep=r'\s+', skiprows=1,
                   header=None, names=header, na_values=[-999])
data['target'] = 1
data.loc[36499:, 'target'] = 0
# Move the target column to the front, then persist the cleaned CSV.
data = data[['target']].join(data.drop('target', axis=1))
data.dropna(axis=0, how='any', inplace=True)
data.to_csv(dest, index=False, float_format='%.12g')
data.head()
Out[7]:
Classic iris dataset from Fisher with three classes.
In [7]:
# Fisher's classic iris data: four flower measurements and a
# three-class target.
url = uci_url + 'iris/iris.data'
dest = 'data/iris.csv'
columns = ['sepal_l', 'sepal_w', 'petal_l', 'petal_w', 'target']
header = ','.join(columns)
data = fetch_data(url, dest, header)
data.head()
Out[7]:
Glass identification, seven classes.
In [8]:
# Glass identification from refractive index and chemical composition.
url = uci_url + 'glass/glass.data'
dest = 'data/glass.csv'
columns = ['ri', 'na', 'mg', 'al', 'si', 'k', 'ca', 'ba', 'fe', 'target']
header = ','.join(columns)
data = fetch_data(url, dest, header)
data.head()
Out[8]:
Classifying a given silhouette as one of four types of vehicle.
In [9]:
# Vehicle silhouettes: the dataset is split across nine raw files
# (xaa..xai), all fetched and merged into one CSV. NOTE(review): the
# trailing 'placeholder' column presumably absorbs a trailing
# separator in the raw lines — confirm against the raw files.
file_names = ['xa{0}'.format(c) for c in 'abcdefghi']
urls = ['{0}statlog/vehicle/{1}.dat'.format(uci_url, x) for x in file_names]
dest = 'data/vehicle.csv'
columns = ['compact', 'circ', 'dcirc', 'rrat', 'prar', 'mlen', 'scat',
           'elon', 'prr', 'mlenr', 'svarmaj', 'svarmin', 'gy', 'skewmaj',
           'skewmin', 'kurtmin', 'kurtmaj', 'hol', 'target', 'placeholder']
header = ' '.join(columns)
data = fetch_data(urls, dest, header, sep=' ')
data.head()
Out[9]:
Using chemical analysis to determine the origin of wines.
In [10]:
# Wine origin from chemical analysis; the raw file puts the class
# label in the first column.
url = uci_url + 'wine/wine.data'
dest = 'data/wine.csv'
columns = ['target', 'alcohol', 'malic', 'ash', 'alcash', 'mg', 'phenols',
           'flav', 'nonflav', 'proan', 'color', 'hue', 'od280', 'proline']
header = ','.join(columns)
data = fetch_data(url, dest, header)
data.head()
Out[10]:
Page blocks
In [38]:
# Page blocks: the raw file is LZW-compressed (.Z), so it is first
# downloaded and then decompressed with the system 'uncompress'
# utility, which strips the .Z suffix and leaves `dest` in place.
url = uci_url + 'page-blocks/page-blocks.data.Z'
zip_dest = 'data/pageblocks.csv.Z'
dest = 'data/pageblocks.csv'
download_data(url, zip_dest)
# Restored format placeholder (the original line was corrupted), and
# decompress the downloaded archive rather than the not-yet-existing
# output path.
os.system('uncompress {filename}'.format(filename=zip_dest))
header = ['height', 'length', 'area', 'eccen', 'pblack', 'pand',
          'meantr', 'blackpix', 'blackand', 'wbtrans', 'target']
# Raw string: '\s' is an invalid escape sequence in a plain string
# (deprecated in Python 3).
data = pd.read_csv(dest, sep=r'\s+', header=None, names=header)
# Move the target column to the front, drop incomplete rows, persist.
data = data[['target']].join(data.drop('target', axis=1))
data.dropna(axis=0, how='any', inplace=True)
data.to_csv(dest, index=False, float_format='%.12g')
data.head()
Out[38]:
Semeion handwritten digit dataset
In [42]:
# Semeion handwritten digits: each row holds 256 pixel values followed
# by a 10-column one-hot encoding of the digit, which is collapsed
# back into a single integer target column.
url = uci_url + 'semeion/semeion.data'
dest = 'data/semeion.data'
download_data(url, dest, overwrite=False)
matrix = []
with open(dest) as f:
    for line in f:
        row = line.strip().split(' ')
        # Position of the '1' in the last ten columns is the digit.
        one_hot = row[-10:]
        matrix.append(row[:-10] + [one_hot.index('1')])
columns = list(np.arange(256)) + ['target']
data = pd.DataFrame(matrix, columns=columns)
# Move the target column to the front and coerce everything to int
# (values arrive as strings/floats from the raw file).
data = data[['target']].join(data.drop('target', axis=1))
data = data.astype(float).astype(int)
data.to_csv('data/semeion.csv', index=False)
data.head()
Out[42]:
Yeast dataset.
In [16]:
# Yeast protein localization: whitespace-separated raw file with a
# sequence id column in front of the features.
url = uci_url + 'yeast/yeast.data'
dest = 'data/yeast.csv'
columns = ['id', 'mcg', 'gvh', 'alm', 'mit',
           'erl', 'pox', 'vac', 'nuc', 'target']
header = ' '.join(columns)
data = fetch_data(url, dest, header, sep=r'\s+')
data.head()
Out[16]: