In [12]:
i = 0
def clean(s):
  """Return the runs of word characters in ``s``, space-joined and lower-cased.

  Only ``re.UNICODE`` is passed. The previous ``re.UNICODE | re.LOCALE``
  combination asked for two conflicting definitions of ``\\w``, and Python 3
  rejects ``re.LOCALE`` on a str pattern with ValueError.
  """
  words = re.findall(r'\w+', s, flags=re.UNICODE)
  return " ".join(words).lower()
with open("train_titanic.csv", "r") as infile:
  # Show just the first two CSV rows (header + first passenger) as a sanity check.
  for line in csv.reader(infile):
    print(line)
    i += 1
    if i == 2:
      break


['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'last_name', 'title', 'Fare2', 'FamilySize', 'Farepp', 'Deck', 'Side']
['1', '0', 'ThirdClass', 'male', '22', '1', '0', '7.25', 'S', 'Braund', 'Mr', '[  0, 10)', '2', '3.625', 'UNK', 'UNK']

In [13]:
i = 0
def clean(s):
  """Return the runs of word characters in ``s``, space-joined and lower-cased.

  Only ``re.UNICODE`` is passed. The previous ``re.UNICODE | re.LOCALE``
  combination asked for two conflicting definitions of ``\\w``, and Python 3
  rejects ``re.LOCALE`` on a str pattern with ValueError.
  """
  words = re.findall(r'\w+', s, flags=re.UNICODE)
  return " ".join(words).lower()
with open("test_titanic.csv", "r") as infile:
  # Show just the first two CSV rows (header + first passenger) as a sanity check.
  for line in csv.reader(infile):
    print(line)
    i += 1
    if i == 2:
      break


['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'last_name', 'title', 'Fare2', 'FamilySize', 'Farepp', 'Deck', 'Side']
['892', 'ThirdClass', 'male', '34.5', '0', '0', '7.8292', 'Q', 'Kelly', 'Mr', '[  0, 10)', '1', '7.8292', 'UNK', 'UNK']

Convert to Vowpal Wabbit format


In [14]:
import csv
import re
i = 0
def clean(s):
  """Return the runs of word characters in ``s``, space-joined and lower-cased.

  Only ``re.UNICODE`` is passed. The previous ``re.UNICODE | re.LOCALE``
  combination asked for two conflicting definitions of ``\\w``, and Python 3
  rejects ``re.LOCALE`` on a str pattern with ValueError.
  """
  words = re.findall(r'\w+', s, flags=re.UNICODE)
  return " ".join(words).lower()
# Feature prefixes in CSV column order, starting at the Pclass column.
# A trailing ':' marks a numeric feature written as name:value; a trailing '_'
# marks a categorical feature whose value becomes part of the feature name.
_VW_FEATURE_PREFIXES = (
  "passenger_class_", "sex_", "age:", "sibsp:", "parch:", "fare:",
  "embarked_", "last_name_", "title_", "fare2_", "familysize:", "farepp:",
  "deck_", "side_",
)


def _vw_token(value):
  """Make ``value`` safe to embed in a VW feature name.

  VW separates features on whitespace and uses ':' and '|' as the value and
  namespace delimiters, so a raw value such as '[  0, 10)' or a last name
  containing a space would be split into several unrelated features. Every
  run of those characters is collapsed to a single '_'.
  """
  return re.sub(r'[\s:|]+', '_', str(value).strip())


def _vw_line(label, row, first_feature_col):
  """Build one VW example of the form "<label> '<tag> |f <features>".

  ``row[0]`` (the passenger id) is used as the example tag. Feature values
  are read from ``row`` starting at ``first_feature_col``; a row that is too
  short raises IndexError.
  """
  features = []
  for position, prefix in enumerate(_VW_FEATURE_PREFIXES):
    value = str(row[first_feature_col + position])
    if not prefix.endswith(":"):
      # Categorical: the value is part of the feature name, so sanitise it.
      value = _vw_token(value)
    features.append(prefix + value)
  return label + " '" + str(row[0]) + " |f " + " ".join(features)


def _csv_to_vw(csv_path, vw_path, first_feature_col, label_col=None):
  """Convert a Titanic CSV file into a Vowpal Wabbit input file.

  csv_path / vw_path -- input CSV and output .vw file names.
  first_feature_col  -- index of the Pclass column in the CSV.
  label_col          -- index of the Survived column; when None (test set)
                        every example gets the placeholder label "1".
  """
  # "wb" is kept from the original cell (Python 2: str is written as bytes).
  with open(csv_path, "r") as infile, open(vw_path, "wb") as outfile:
    reader = csv.reader(infile)
    next(reader, None)  # skip the header row
    for row in reader:
      if not row:
        continue  # tolerate blank lines, e.g. a trailing newline
      if label_col is None or str(row[label_col]) == "1":
        label = "1"
      else:
        label = "-1"
      outfile.write(_vw_line(label, row, first_feature_col) + "\n")


_csv_to_vw("train_titanic.csv", "train_titanic.vw", first_feature_col=2, label_col=1)
_csv_to_vw("test_titanic.csv", "test_titanic.vw", first_feature_col=1)

Create a model

vw train_titanic.vw -f model.vw --binary --passes 20 -c -q ff --adaptive --normalized --l1 0.00000001 --l2 0.0000001 -b 24

Predict

vw -d test_titanic.vw -t -i model.vw -p preds_titanic.txt

Create Kaggle submission file


In [19]:
import csv
# Turn the VW predictions ("<score> <passenger id>" per line) into a Kaggle
# submission CSV with a 0/1 Survived column.
with open("preds_titanic.txt", "r") as infile, open("submission_vw.csv", "wb") as outfile:
  outfile.write("PassengerId,Survived\n")
  for line in infile:
    fields = line.split()
    if not fields:
      continue  # ignore blank lines
    score, passenger_id = fields[0], fields[1]
    # The predictions file holds real-valued scores (e.g. -0.989157, -0.15911),
    # so classify by sign. The previous int(float(score)) == 1 check truncated
    # every fractional positive score to 0 and marked those passengers as dead.
    survived = "1" if float(score) > 0 else "0"
    outfile.write(passenger_id + "," + survived + "\n")

In [18]:
i=0
with open("preds_titanic.txt", "r") as infile:
    # Debug view of the first six predictions: id, raw score, truncated score.
    for line in infile.readlines():
        fields = line.split(" ")
        print(str(fields[1]).replace("\n",""))
        raw_score = float(fields[0])
        print(str(raw_score))
        print(str(int(raw_score)))

        print("")
        i+=1
        if i>5:
            break


892
-0.989157
0

893
-0.15911
0

894
-1.0
-1

895
-0.988855
0

896
-0.272279
0

897
-0.869587
0


In [ ]: