In [114]:
# Mount Google Drive so that the kaggle.json stored there can be copied below.
from google.colab import drive
drive.mount('/content/drive')
# Create the ~/.kaggle folder on the Colab machine
!mkdir -p ~/.kaggle
# Copy kaggle.json into the .kaggle folder and restrict its permissions
!cp /content/drive/'My Drive'/Kaggle/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!ls /root/.kaggle
In [115]:
!pip install kaggle
In [116]:
!kaggle competitions download titanic
In [117]:
!pip install nameparser
In [0]:
import pandas as pd
import numpy as np
# Load train.csv and test.csv from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
In [119]:
train.info()
sex
Age
In [120]:
# Name
train["Name"].head(10)
Out[120]:
In [121]:
from nameparser import HumanName
name = HumanName("Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)")
name
Out[121]:
In [0]:
def title(name):
    """Return the `title` attribute that nameparser's HumanName parses out of `name`."""
    parsed = HumanName(name)
    return parsed.title
def lastname(name):
    """Return the `last` attribute (family name) that HumanName parses out of `name`."""
    parsed = HumanName(name)
    return parsed.last
In [123]:
train["Title"] = train["Name"].apply(title)
train["Title"].head()
Out[123]:
In [124]:
train["Lastname"] = train["Name"].apply(lastname)
train["Lastname"].head()
Out[124]:
In [125]:
# Count passengers for each (Title, Survived) combination
train[["Title", "Survived", "PassengerId"]].groupby(["Title", "Survived"]).count()
Out[125]:
In [0]:
# Collapses the raw titles parsed from the Name column into a few categories.
# Every alias of one category must map to exactly the same label string
# (e.g. "Mme." and "Mrs." both -> "Mrs."); otherwise groupby/dummy encoding
# treats them as different categories.
# NOTE(review): the "Master" label has no trailing dot, unlike the other labels;
# it is kept unchanged in case other cells compare against "Master".
title_mapper = {
    # officers, clergy, doctors and nobility
    "Capt.": "High Office",
    "Col.": "High Office",
    "Dr.": "High Office",
    "Rev.": "High Office",
    "Sir.": "High Office",
    # variants of Miss
    "Lady.": "Miss.",
    "Mlle.": "Miss.",
    "Ms.": "Miss.",
    "Miss.": "Miss.",
    # variants of Mrs
    "Mme.": "Mrs.",
    "the Countess. of": "Mrs.",
    "Mrs.": "Mrs.",
    # kept as-is
    "Master.": "Master",
    "Mr.": "Mr.",
}
# "Master" is a title used for boys, so it is kept as its own category.
# Titles that are not keys of title_mapper become NaN after .map().
train["Title_mapped"] = train["Title"].map(title_mapper)
In [127]:
train[["Title_mapped", "Survived"]].groupby(["Title_mapped"]).mean()
Out[127]:
In [0]:
train["Title"] = train["Title_mapped"]
In [0]:
# Parse the title out of each Name, then collapse it with title_mapper;
# the same two-step mapping is applied to the training and the test frame.
train["Title"] = train["Name"].map(title).map(title_mapper)
test["Title"] = test["Name"].map(title).map(title_mapper)
In [0]:
# Handling of the surname: store the parsed last name of each passenger
train["Surname"] = train["Name"].apply(lastname)
In [0]:
train["FamilySize"] = train["SibSp"] + train["Parch"] + 1
In [0]:
train["SurnameLength"] = train["Surname"].apply(len)
In [133]:
# Check the Pclass column
train["Pclass"].describe()
Out[133]:
In [134]:
train["Sex"].describe()
Out[134]:
In [0]:
train["Male"] = pd.get_dummies(train["Sex"])["male"]
In [136]:
train[ train["Title"] == "Mr." ]["Title"]
Out[136]:
In [0]:
# Age: fill each missing Age with the median Age of passengers sharing the same Title.
# The loop variable is deliberately not called `title`, so the title() helper
# used by the Name-parsing cells is not overwritten.
# Rows whose Title is NaN match no group and keep their missing Age.
for title_value in train["Title"].dropna().unique():
    same_title = train["Title"] == title_value
    median_age = train.loc[same_title, "Age"].median()
    # Only rows of this title group whose Age is missing are written.
    train.loc[same_title & train["Age"].isna(), "Age"] = median_age
In [138]:
def parse_ticket(ticket):
    """Extract the numeric part of a ticket string as an int.

    An all-digit ticket is converted directly; otherwise the text after the
    last space (e.g. "A/5 21171" -> 21171) is used.

    Parameters
    ----------
    ticket : str
        Raw ticket value. Non-string values (already parsed numbers, None,
        NaN) are returned unchanged so the function can be re-applied safely.

    Returns
    -------
    int or None
        The ticket number, or None when no numeric part is found; in that
        case the raw ticket is printed so it shows up in the cell output.
    """
    if not isinstance(ticket, str):
        return ticket

    number = ticket if ticket.isdigit() else ticket.split(" ")[-1]
    if number.isdigit():
        # Always an int, so the resulting column has a single numeric type.
        return int(number)

    # No numeric part: report the raw value and mark it as missing.
    print(ticket)
    return None
train["Ticket"] = train["Ticket"].apply(parse_ticket)
In [0]:
# Fill tickets without a number with the mean ticket number of 3rd-class passengers.
# NOTE(review): assumes the intended fill value is the Pclass-3 mean of the Ticket
# column only; a scalar is required to fill a single column — confirm.
# to_numeric(errors="coerce") makes the mean work even if some tickets are still strings.
third_class_tickets = pd.to_numeric(train.loc[train["Pclass"] == 3, "Ticket"], errors="coerce")
train.loc[train["Ticket"].isna(), "Ticket"] = third_class_tickets.mean()
In [0]:
In [0]: