All datasets except the ones from the NIPS workshop are downloaded via R.

library(datamicroarray)
library(tidyverse)

# Show the description table of the datasets shipped with datamicroarray.
describe_data()

# Load the four microarray datasets. Each one is used below through its
# `x` (features) and `y` (labels) components.
data("alon", package = "datamicroarray")   # colon
data("singh", package = "datamicroarray")  # prostate
data("golub", package = "datamicroarray")  # leukemia
data("gordon", package = "datamicroarray") # lung cancer

# Turn each feature matrix into a data frame and append the labels as `y`.
colon <- mutate(as.data.frame(alon$x), y = alon$y)
prostate <- mutate(as.data.frame(singh$x), y = singh$y)
leukemia <- mutate(as.data.frame(golub$x), y = golub$y)
lung_cancer <- mutate(as.data.frame(gordon$x), y = gordon$y)

# Write one CSV per dataset into the current working directory.
write_csv(colon, "colon.csv")
write_csv(prostate, "prostate.csv")
write_csv(leukemia, "leukemia.csv")
write_csv(lung_cancer, "lung_cancer.csv")

# UCI benchmark datasets

# Longley's Economic Regression Data
data(longley) # regression
write_csv(longley, "longley.csv")

library(mlbench)

# Each mlbench dataset is loaded by name and written to a CSV of the same name.
data("BostonHousing") # regression
write_csv(BostonHousing, "BostonHousing.csv")
data("BreastCancer")
write_csv(BreastCancer, "BreastCancer.csv")
data("Ionosphere")
write_csv(Ionosphere, "Ionosphere.csv")
data("PimaIndiansDiabetes")
write_csv(PimaIndiansDiabetes, "PimaIndiansDiabetes.csv")

library(AppliedPredictiveModeling)
data("abalone") # regression
# The output name carries the ".csv" extension like every other export in this
# script; it was previously written to a file called just "abalone".
write_csv(abalone, "abalone.csv")

In [1]:
import pandas as pd
import numpy as np
import glob
from sklearn.datasets import load_svmlight_file

In [2]:
# Collect the raw training and validation feature files.
# glob does not guarantee an ordering (it depends on the file system), so the
# results are sorted to make the processing order reproducible across runs.
train_data = sorted(glob.glob("data/fs/raw/*_train.csv"))
valid_data = sorted(glob.glob("data/fs/raw/*_valid.csv"))

In [3]:
train_data


Out[3]:
['data/fs/raw\\arcene_train.csv',
 'data/fs/raw\\dexter_train.csv',
 'data/fs/raw\\dorothea_train.csv',
 'data/fs/raw\\gisette_train.csv',
 'data/fs/raw\\madelon_train.csv']

In [4]:
import os

In [5]:
def fix_name(fname, suffix=".csv"):
    """Return the base name of ``fname`` with its ``.csv`` extension replaced by ``suffix``.

    Parameters
    ----------
    fname : str
        Path to a file; any directory part is dropped.
    suffix : str, optional
        Replacement for the trailing ``.csv`` extension (default ``".csv"``,
        which leaves the base name unchanged).

    Returns
    -------
    str
        The base file name with the new suffix. Names that do not end in
        ``.csv`` are returned unchanged.
    """
    base_name = os.path.basename(fname)
    extension = ".csv"
    # Only swap the trailing extension: str.replace would also rewrite any
    # ".csv" that happens to occur earlier in the name (e.g. "x.csv.csv").
    if base_name.endswith(extension):
        return base_name[:-len(extension)] + suffix
    return base_name

def process_file(fname):
    """Combine a feature file with its labels and save the result as CSV.

    ``fname`` is read as a space-separated table without a header row and its
    columns are renamed ``c0``, ``c1``, ...  The labels are read the same way
    from the path obtained by replacing ``.csv`` with ``.labels`` in ``fname``
    and attached as a ``target`` column.  The combined table is written,
    without the index, to the name returned by ``fix_name(fname)`` and a
    completion message is printed.
    """
    labels_path = fname.replace(".csv", ".labels")
    out_name = fix_name(fname)

    # Feature table: positional column labels become c<position>.
    features = pd.read_table(fname, sep=' ', header=None)
    features.columns = ["c{}".format(col) for col in features.columns]

    # Label table: its single column is attached as `target`.
    labels = pd.read_table(labels_path, sep=' ', header=None)
    labels.columns = ['target']
    features['target'] = labels['target']

    features.to_csv(out_name, index=False)
    print("Complete: {}".format(out_name))

In [6]:
def process_file(fname, with_labels=False):
    """Convert a space-separated feature file to a headered CSV.

    ``fname`` is read as a space-separated table without a header row, its
    columns are renamed ``c0``, ``c1``, ..., and the table is written (without
    the index) to the name returned by ``fix_name(fname)``.  A completion
    message is printed afterwards.

    Parameters
    ----------
    fname : str
        Path of the feature file to convert.
    with_labels : bool, optional
        When true, also read the label file (``fname`` with ``.csv`` replaced
        by ``.labels``) and attach it as a ``target`` column.  Defaults to
        ``False``, which writes the features only.
    """
    cname = fix_name(fname)
    base_table = pd.read_table(fname, sep=' ', header=None)
    base_table.columns = ["c{}".format(x) for x in base_table.columns]

    # The label file is only touched when it is actually needed. Previously it
    # was always parsed and then discarded, which cost I/O and raised whenever
    # no .labels file was present.
    if with_labels:
        lname = fname.replace(".csv", ".labels")
        label_table = pd.read_table(lname, sep=' ', header=None)
        label_table.columns = ['target']
        base_table['target'] = label_table['target']

    base_table.to_csv(cname, index=False)
    print("Complete: {}".format(cname))

In [ ]:
# Convert every training file matched by the "*_train.csv" glob above.
# NOTE(review): the recorded output below lists only arcene, dexter and
# dorothea although train_data holds five paths -- the run appears not to
# have finished; confirm gisette and madelon were converted.
for t_data in train_data:
    process_file(t_data)


Complete: arcene_train.csv
Complete: dexter_train.csv
Complete: dorothea_train.csv

In [7]:
# Convert every validation file matched by the "*_valid.csv" glob above.
for t_data in valid_data:
    process_file(t_data)


Complete: arcene_valid.csv
Complete: dexter_valid.csv
Complete: dorothea_valid.csv
Complete: gisette_valid.csv
Complete: madelon_valid.csv

In [ ]: