In [3]:
using DataFrames
using Gadfly
using Vega

In [4]:
"""
    mergedata(; datadir="../data")

Read the six CSV tables found in `datadir`, combine the four per-id feature
tables (`event_type`, `resource_type`, `severity_type`, `log_feature`) with
outer joins on `:id`, and left-join that combined table onto `train` and `test`.

The size of the combined table and the sizes of the two joined tables are
printed with `@show`. The return value is the tuple `(size(train), size(test))`
of the joined tables; the joined tables themselves are not returned.

# Keywords
- `datadir`: directory holding the CSV files (defaults to `"../data"`).
"""
function mergedata(; datadir="../data")
    # Every table is loaded the same way; only the file name differs.
    load(name) = readtable(joinpath(datadir, name))

    event = load("event_type.csv")
    resource = load("resource_type.csv")
    severity = load("severity_type.csv")
    log_feature = load("log_feature.csv")
    train = load("train.csv")
    test = load("test.csv")

    # Outer-join the feature tables one at a time on the shared :id column.
    # NOTE(review): ids repeat inside event_type and log_feature, so these joins
    # can yield several rows per id -- confirm that this is the intended shape.
    common = join(event, resource, on=:id, kind=:outer)
    common = join(common, severity, on=:id, kind=:outer)
    common = join(common, log_feature, on=:id, kind=:outer)
    @show size(common)

    # Attach the combined features to the labelled and unlabelled id lists.
    train = join(train, common, on=:id, kind=:left)
    test = join(test, common, on=:id, kind=:left)

    # Last expression: prints both sizes and yields them as the return value.
    @show size(train),size(test)
end


WARNING: Method definition mergedata() in module Main at In[2]:2 overwritten at In[4]:2.
Out[4]:
mergedata (generic function with 1 method)

In [5]:
# Load the event-type CSV as an untyped matrix. The header line is not split
# off: it ends up as row 1 of `ev`.
ev = readcsv("../data/event_type.csv")


Out[5]:
31171x2 Array{Any,2}:
      "id"  "event_type"   
  6597      "event_type 11"
  8011      "event_type 15"
  2597      "event_type 15"
  5022      "event_type 15"
  5022      "event_type 11"
  6852      "event_type 11"
  6852      "event_type 15"
  5611      "event_type 15"
 14838      "event_type 15"
 14838      "event_type 11"
  2588      "event_type 15"
  2588      "event_type 11"
     ⋮                     
  6288      "event_type 11"
 13296      "event_type 11"
  1989      "event_type 11"
 15206      "event_type 11"
 15084      "event_type 11"
  8114      "event_type 11"
  8955      "event_type 11"
  3761      "event_type 11"
  8720      "event_type 11"
  6488      "event_type 11"
   878      "event_type 11"
  4464      "event_type 11"

In [6]:
# Number of distinct ids and distinct event types. Row 1 of `ev` is the CSV
# header ("id", "event_type"); it is skipped so it is not counted as a value.
length(unique(ev[2:end,1])),length(unique(ev[2:end,2]))


Out[6]:
(18552,53)

In [7]:
# Load the resource-type CSV as an untyped matrix. The header line is not split
# off: it ends up as row 1 of `re`.
re = readcsv("../data/resource_type.csv")


Out[7]:
21077x2 Array{Any,2}:
      "id"  "resource_type"  
  6597      "resource_type 8"
  8011      "resource_type 8"
  2597      "resource_type 8"
  5022      "resource_type 8"
  6852      "resource_type 8"
  5611      "resource_type 8"
 14838      "resource_type 8"
  2588      "resource_type 8"
  4848      "resource_type 8"
  6914      "resource_type 8"
  5337      "resource_type 8"
 10460      "resource_type 8"
     ⋮                       
  6288      "resource_type 8"
 13296      "resource_type 8"
  1989      "resource_type 8"
 15206      "resource_type 8"
 15084      "resource_type 8"
  8114      "resource_type 8"
  8955      "resource_type 8"
  3761      "resource_type 8"
  8720      "resource_type 8"
  6488      "resource_type 8"
   878      "resource_type 8"
  4464      "resource_type 8"

In [8]:
# Number of distinct ids and distinct resource types. Row 1 of `re` is the CSV
# header ("id", "resource_type"); it is skipped so it is not counted as a value.
length(unique(re[2:end,1])),length(unique(re[2:end,2]))


Out[8]:
(18552,10)

In [9]:
# Load the severity-type CSV as an untyped matrix. The header line is not split
# off: it ends up as row 1 of `se`.
se = readcsv("../data/severity_type.csv")


Out[9]:
18553x2 Array{Any,2}:
      "id"  "severity_type"  
  6597      "severity_type 2"
  8011      "severity_type 2"
  2597      "severity_type 2"
  5022      "severity_type 1"
  6852      "severity_type 1"
  5611      "severity_type 2"
 14838      "severity_type 1"
  2588      "severity_type 1"
  4848      "severity_type 1"
  6914      "severity_type 1"
  5337      "severity_type 1"
 10460      "severity_type 1"
     ⋮                       
  6288      "severity_type 1"
 13296      "severity_type 1"
  1989      "severity_type 1"
 15206      "severity_type 1"
 15084      "severity_type 1"
  8114      "severity_type 2"
  8955      "severity_type 1"
  3761      "severity_type 1"
  8720      "severity_type 1"
  6488      "severity_type 2"
   878      "severity_type 2"
  4464      "severity_type 1"

In [10]:
# Number of distinct ids and distinct severity types. Row 1 of `se` is the CSV
# header ("id", "severity_type"); it is skipped so it is not counted as a value.
length(unique(se[2:end,1])),length(unique(se[2:end,2]))


Out[10]:
(18552,5)

In [11]:
# Load the log-feature CSV as an untyped matrix. The header line is not split
# off: it ends up as row 1 of `log`.
# NOTE(review): `log` is also the name of Base's logarithm function; binding it
# here hides that function in this session -- consider a different name.
log = readcsv("../data/log_feature.csv")


Out[11]:
58672x3 Array{Any,2}:
      "id"  "log_feature"    "volume"
  6597      "feature 68"    6        
  8011      "feature 68"    7        
  2597      "feature 68"    1        
  5022      "feature 172"   2        
  5022      "feature 56"    1        
  5022      "feature 193"   4        
  5022      "feature 71"    3        
  6852      "feature 201"   2        
  6852      "feature 56"    1        
  6852      "feature 80"    2        
  5611      "feature 80"    2        
 14838      "feature 203"   5        
     ⋮                               
  3761      "feature 87"    1        
  3761      "feature 209"   1        
  3761      "feature 54"    4        
  3761      "feature 170"   4        
  8720      "feature 170"   4        
  8720      "feature 155"  10        
  8720      "feature 54"    1        
  8720      "feature 209"   1        
  6488      "feature 54"    3        
   878      "feature 62"    1        
  4464      "feature 209"   1        
  4464      "feature 87"    2        

In [12]:
# Number of distinct ids and distinct log features. Row 1 of `log` is the CSV
# header ("id", "log_feature", "volume"); it is skipped so it is not counted.
length(unique(log[2:end,1])),length(unique(log[2:end,2]))


Out[12]:
(18552,386)

In [13]:
# Load the training CSV as an untyped matrix. The header line is not split
# off: it ends up as row 1 of `tr`.
tr = readcsv("../data/train.csv")


Out[13]:
7382x3 Array{Any,2}:
      "id"  "location"        "fault_severity"
 14121      "location 118"   1                
  9320      "location 91"    0                
 14394      "location 152"   1                
  8218      "location 931"   1                
 14804      "location 120"   0                
  1080      "location 664"   0                
  9731      "location 640"   0                
 15505      "location 122"   0                
  3443      "location 263"   1                
 13300      "location 613"   1                
  8976      "location 760"   1                
  7965      "location 519"   0                
     ⋮                                        
 11613      "location 478"   0                
  3450      "location 444"   1                
  4065      "location 238"   0                
  1628      "location 224"   1                
 16687      "location 1090"  1                
  6813      "location 1115"  1                
 10455      "location 1075"  2                
   870      "location 167"   0                
 18068      "location 106"   0                
 14111      "location 1086"  2                
 15189      "location 7"     0                
 17067      "location 885"   0                

In [14]:
# Number of distinct ids, locations and fault_severity values. Row 1 of `tr` is
# the CSV header ("id", "location", "fault_severity"); it is skipped so it is
# not counted as a value.
length(unique(tr[2:end,1])),length(unique(tr[2:end,2])),length(unique(tr[2:end,3]))


Out[14]:
(7381,929,3)

In [15]:
# Drop the header row (row 1) and keep only column 1 (id) and column 3
# (fault_severity); the location column is left out.
train = tr[2:end,[1,3]]


Out[15]:
7381x2 Array{Any,2}:
 14121  1
  9320  0
 14394  1
  8218  1
 14804  0
  1080  0
  9731  0
 15505  0
  3443  1
 13300  1
  8976  1
  7965  0
 10342  0
     ⋮   
 11613  0
  3450  1
  4065  0
  1628  1
 16687  1
  6813  1
 10455  2
   870  0
 18068  0
 14111  2
 15189  0
 17067  0

In [16]:
# Tally the training rows for each candidate fault_severity value 0 through 3
# (the recorded output shows that no row carries the value 3).
[count(label -> label == i, train[:,2]) for i in 0:3]


Out[16]:
4-element Array{Any,1}:
 4784
 1871
  726
    0

In [ ]:


In [ ]:


In [ ]: