In [1]:
# Fetch and unpack the UCI bank archive.
# Each step is skipped when its result is already on disk, so re-running this
# cell neither downloads a second copy of the archive nor stops at unzip's
# interactive "replace file?" prompt.
if !isfile("bank.zip")
    run(`wget http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip`)
end
# data/bank.csv and data/bank-full.csv are the files read later on; unpack
# only when one of them is missing. -o overwrites any partial leftovers
# without asking.
if !(isfile("data/bank.csv") && isfile("data/bank-full.csv"))
    run(`unzip -o bank.zip -d data`)
end
In [2]:
using DataFrames
# Read data/bank.csv into `table`; the file uses ';' as its field separator.
table = readtable("data/bank.csv", separator=';')
Out[2]:
DataFramesをより便利に使うために、DataFramesMetaを使いましょう。
これを使うと、よりPandasっぽい(あるいはSQL likeな)便利な書き方ができるようになります。
In [3]:
# Install the DataFramesMeta package through the package manager.
Pkg.add("DataFramesMeta")
In [4]:
using DataFramesMeta
In [5]:
# Query `table` through the @linq pipeline: keep the rows with age above 60
# and housing equal to "yes", then order the result by the job column.
x_thread = @linq table |>
where(:age .> 60) |>
where(:housing .== "yes") |>
orderby(:job)
Out[5]:
In [6]:
# Same row filter as the previous query (age above 60, housing equal to "yes"),
# but keep only the age and balance columns.
age_balance = @linq table |>
where(:age .> 60) |>
where(:housing .== "yes") |>
select(:age, :balance)
Out[6]:
In [7]:
using Gadfly
In [8]:
# Scatter plot of the age_balance rows: age on the x axis, balance on the y axis.
plot(age_balance, x=:age, y=:balance, Geom.point)
Out[8]:
In [9]:
# Read data/bank-full.csv (';'-separated) into `bank`, the table that the
# preprocessing and model-fitting steps below work on.
bank = readtable("data/bank-full.csv", separator = ';')
Out[9]:
In [10]:
# Recode the target column :y as numeric class labels: "yes" becomes 1.0 and
# every other value becomes -1.0.
# The comprehension is typed so the column is always a Float64 array; an
# untyped comprehension over a global can fall back to an Any-typed array.
bank[:y] = Float64[y == "yes" ? 1.0 : -1.0 for y in bank[:y]]
Out[10]:
In [11]:
# Names of the columns that are handled as categorical variables
# (they are later expanded into dummy variables).
categorical_keys = [:job, :marital, :education,
                    :default, :housing, :loan,
                    :contact, :month, :poutcome]
Out[11]:
In [12]:
# Every column that is neither categorical nor the target :y is numerical.
# vcat builds the flat exclusion list explicitly instead of relying on
# `[array, scalar]` flattening its elements, which only older Julia does.
numerical_keys = setdiff(names(bank), vcat(categorical_keys, :y))
Out[12]:
In [13]:
# Work on a deep copy so that the columns of `bank` itself stay untouched.
bank_normalized = deepcopy(bank)
# Display the numerical columns (this is the cell's output).
bank_normalized[numerical_keys]
Out[13]:
In [14]:
# Standardize every numerical column: subtract the column mean, then divide
# by the column standard deviation. The statistics are taken from `bank`,
# the result is stored in `bank_normalized`.
for key in numerical_keys
    column = bank[key]
    centered = column - mean(column)
    bank_normalized[key] = centered / std(column)
end
In [15]:
# Display the numerical columns again, now holding the standardized values.
bank_normalized[numerical_keys]
Out[15]:
ここでは、カテゴリ変数をダミー変数に変換します。
カテゴリ変数とは、職業(job)や婚姻状況(marital)のように、決まった有限個の値のいずれかをとる変数で、プログラム上はEnumなどで表されます。
JuliaのDataFrameはこれをよしなに扱ってくれないので、自分で変換をするコードを書きます。
julia-usersメーリングリストに投稿されたコードを使わせてもらうことにしましょう。
https://groups.google.com/d/msg/julia-users/7-Vtpi8w4YI/KvMlKAZSwDkJ
In [16]:
# Build the dummy-variable (indicator) columns for one categorical column.
#
# df    -- source DataFrame
# cname -- name of the categorical column to expand
# R     -- element type of the indicator columns
#
# The levels of the column are sorted and the first one is dropped, so it
# acts as the reference level. The result is a new DataFrame with one column
# per remaining level, named "<cname>_<level>", holding 1 in the rows where
# the source column has that level and 0 everywhere else.
function getdummy{R}(df::DataFrame, cname::Symbol, ::Type{R})
    column = df[cname]
    kept_levels = sort(levels(column))[2:end]

    # level value => index of its indicator column
    level_index = Dict{eltype(kept_levels),Int}()
    for (j, level) in enumerate(kept_levels)
        level_index[level] = j
    end

    nrows = length(column)
    indicators = zeros(R, nrows, length(level_index))
    for i in 1:nrows
        # 0 means "no indicator column": the dropped reference level or any
        # other value that is not among the kept levels
        j = get(level_index, column[i], 0)
        if j != 0
            indicators[i, j] = 1
        end
    end

    newdf = convert(DataFrame, indicators)
    names!(newdf, [symbol(string(cname, "_", level)) for level in kept_levels])
    return newdf
end
# Return a new DataFrame in which every column named in `cnames` is replaced
# by its dummy-variable columns (element type R, built by getdummy).
# All other columns are carried over as they are, and the column order of
# `df` is preserved.
function convertdummy{R}(df::DataFrame, cnames::Array{Symbol}, ::Type{R})
    result = DataFrame()
    for cname in names(df)
        if in(cname, cnames)
            # categorical column: append each of its indicator columns
            dummies = getdummy(df, cname, R)
            for dummyname in names(dummies)
                result[dummyname] = dummies[dummyname]
            end
        else
            result[cname] = df[cname]
        end
    end
    return result
end
# Convenience method: the dummy columns default to Int32 elements.
function convertdummy(df::DataFrame, cnames::Array{Symbol})
    return convertdummy(df, cnames, Int32)
end
Out[16]:
In [17]:
# Expand the categorical feature columns into dummy variables.
# The feature columns are selected by name (every column except the target
# :y) rather than by the fixed position range 1:16, so the call does not
# depend on :y sitting in one particular column position.
bank_dummy = convertdummy(
    bank_normalized[setdiff(names(bank_normalized), [:y])],
    categorical_keys)
Out[17]:
In [18]:
# Feature matrix, transposed so that each column is one sample
# (attributes x samples).
# All columns of bank_dummy are used instead of the fixed range 1:42, which
# only fits one particular number of generated dummy columns.
X = convert(Array, bank_dummy)'
Out[18]:
In [19]:
# Label vector: the :y column (recoded earlier to 1.0 / -1.0) as a plain array.
Y = convert(Array, bank_normalized[:y])
Out[19]:
In [20]:
# X is laid out as attributes x samples, so its two dimensions give both counts.
attribute_num, sample_num = size(X)
Out[20]:
In [21]:
# Random boolean mask with one entry per sample: true marks a training
# sample, false a sample held out for evaluation.
# NOTE(review): the RNG is not seeded here, so the split presumably differs
# between runs -- seed it first if reproducible results are needed.
train_flags = randbool(sample_num)
Out[21]:
In [22]:
using SVM
# Fit the SVM on the samples (columns of X) flagged for training.
model_svm = svm(X[:, train_flags], Y[train_flags])
Out[22]:
In [23]:
# Accuracy on the held-out samples (those not flagged for training): the
# number of predictions that equal the true label divided by the number of
# held-out samples.
accuracy = let test_flags = ~train_flags
    predicted = predict(model_svm, X[:, test_flags])
    countnz(predicted .== Y[test_flags]) / countnz(test_flags)
end
Out[23]: