Datasets used for Online Grafting and OGFS.
We will also use the feature selection challenge datasets from the NIPS 2003 workshop:
http://clopinet.com/isabelle/Projects/NIPS2003/
All datasets except the ones from the NIPS 2003 workshop are downloaded via R:
library(datamicroarray)
library(tidyverse)
describe_data()
# colon
data('alon', package = 'datamicroarray')
# prostate
data('singh', package = 'datamicroarray')
# leukemia
data('golub', package = 'datamicroarray')
# lung cancer
data('gordon', package = 'datamicroarray')
colon <- alon$x %>% as.data.frame %>% mutate(y=alon$y)
prostate <- singh$x %>% as.data.frame %>% mutate(y=singh$y)
leukemia <- golub$x %>% as.data.frame %>% mutate(y=golub$y)
lung_cancer <- gordon$x %>% as.data.frame %>% mutate(y=gordon$y)
colon %>% write_csv("colon.csv")
prostate %>% write_csv("prostate.csv")
leukemia %>% write_csv("leukemia.csv")
lung_cancer %>% write_csv("lung_cancer.csv")
# UCI benchmark datasets
# Longley's Economic Regression Data
data(longley) # regression
longley %>% write_csv("longley.csv")
library(mlbench)
data("BostonHousing") # regression
BostonHousing %>% write_csv("BostonHousing.csv")
data("BreastCancer")
BreastCancer %>% write_csv("BreastCancer.csv")
data("Ionosphere")
Ionosphere %>% write_csv("Ionosphere.csv")
data("PimaIndiansDiabetes")
PimaIndiansDiabetes %>% write_csv("PimaIndiansDiabetes.csv")
library(AppliedPredictiveModeling)
data("abalone")
abalone %>% write_csv("abalone") # regression
In [1]:
import pandas as pd
import numpy as np
import glob
from sklearn.datasets import load_svmlight_file
In [2]:
train_data = glob.glob("data/fs/raw/*_train.csv")
valid_data = glob.glob("data/fs/raw/*_valid.csv")
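The globs above assume the NIPS 2003 workshop files have already been unpacked into data/fs/raw/ as space-separated *_train.csv / *_valid.csv matrices with matching .labels files; that directory name and those extensions are this project's own convention, inferred from the code below. A small sketch to check the assumed layout:

import glob
import os

# Assumed layout: each space-separated feature matrix under data/fs/raw/
# sits next to a .labels file holding the class labels.
for fname in glob.glob("data/fs/raw/*_train.csv"):
    lname = fname.replace(".csv", ".labels")
    print(os.path.basename(fname), "labels present:", os.path.exists(lname))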
In [3]:
train_data
In [4]:
import os
In [5]:
def fix_name(fname, suffix=".csv"):
base_name = os.path.basename(fname)
return base_name.replace(".csv", suffix)
def process_file(fname):
cname = fix_name(fname)
lname = fname.replace(".csv", ".labels")
base_table = pd.read_table(fname, sep=' ', header=None)
base_table.columns = ["c{}".format(x) for x in list(base_table.columns)]
label_table = pd.read_table(lname, sep=' ', header=None)
label_table.columns = ['target']
base_table['target'] = label_table['target']
base_table.to_csv(cname, index=False)
print("Complete: {}".format(cname))
In [6]:
def process_file(fname):
    # Redefinition used for the validation files: same as above, but the
    # label merge is disabled, so only the renamed feature matrix is written.
    cname = fix_name(fname)
    # lname = fname.replace(".csv", ".labels")
    base_table = pd.read_table(fname, sep=' ', header=None)
    base_table.columns = ["c{}".format(x) for x in list(base_table.columns)]
    # label_table = pd.read_table(lname, sep=' ', header=None)
    # label_table.columns = ['target']
    # base_table['target'] = label_table['target']
    base_table.to_csv(cname, index=False)
    print("Complete: {}".format(cname))
In [ ]:
# Note: run this loop with the first, label-merging definition of process_file
# so the training targets end up in the output CSVs.
for t_data in train_data:
    process_file(t_data)
In [7]:
for t_data in valid_data:
    process_file(t_data)
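To confirm the training outputs, one of the processed CSVs can be read back and checked for the target column. This is a hedged sketch: the glob pattern matches whatever *_train.csv files the loops above actually produced in the working directory.

import glob
import pandas as pd

# Inspect the first processed training file, if any were written.
processed = sorted(glob.glob("*_train.csv"))
if processed:
    check = pd.read_csv(processed[0])
    print(processed[0], check.shape)
    print("has target column:", "target" in check.columns)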