In [114]:
from google.colab import drive
drive.mount('/content/drive')

# .kaggleというフォルダをColab上に作成
!mkdir -p ~/.kaggle

# .kaggelフォルダにコピーし、権限を変更
!cp /content/drive/'My Drive'/Kaggle/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!ls /root/.kaggle


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
kaggle.json

In [115]:
!pip install kaggle


Requirement already satisfied: kaggle in /usr/local/lib/python3.6/dist-packages (1.5.2)
Requirement already satisfied: urllib3<1.23.0,>=1.15 in /usr/local/lib/python3.6/dist-packages (from kaggle) (1.22)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.6/dist-packages (from kaggle) (1.11.0)
Requirement already satisfied: certifi in /usr/local/lib/python3.6/dist-packages (from kaggle) (2018.11.29)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kaggle) (2.5.3)
Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from kaggle) (2.18.4)
Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from kaggle) (4.28.1)
Requirement already satisfied: python-slugify in /usr/local/lib/python3.6/dist-packages (from kaggle) (2.0.1)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kaggle) (3.0.4)
Requirement already satisfied: idna<2.7,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->kaggle) (2.6)
Requirement already satisfied: Unidecode>=0.04.16 in /usr/local/lib/python3.6/dist-packages (from python-slugify->kaggle) (1.0.23)

In [116]:
!kaggle competitions download titanic


train.csv: Skipping, found more recently modified local copy (use --force to force download)
test.csv: Skipping, found more recently modified local copy (use --force to force download)
gender_submission.csv: Skipping, found more recently modified local copy (use --force to force download)

In [117]:
!pip install nameparser


Requirement already satisfied: nameparser in /usr/local/lib/python3.6/dist-packages (1.0.2)

In [0]:
import pandas as pd
import numpy as np

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [119]:
train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB

基本方針

カテゴリカル変数の扱い

  • Name
    • Mrなどの呼び名とSurnameに変換して扱う。
    • Mrはカテゴリカルにした上で、数字に割り振る
    • Surnameの名前を出す。
  • Pclass
    • Ticket Classで Ordinal変数
  • sex

    • 性別, 1,0に変換
  • Age

    • 年齢、
    • NAはMrなどの呼び名のmedianをとる
  • SibSp
    • 兄弟や伴侶の数
  • Parch
    • 親または子供の数
  • Ticket
    • チケットナンバー
  • Fare
  • Cabin
    • 乗っている場所っぽい
  • Embarked
    • 登場場所

分析アルゴリズム

  • Random Forest Regressor
  • Logisitic Regression
  • Deep Forest
  • LightGBM
  • CatBoost
  • Support Vector Machine

In [120]:
# Name
train["Name"].head(10)


Out[120]:
0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
5                                     Moran, Mr. James
6                              McCarthy, Mr. Timothy J
7                       Palsson, Master. Gosta Leonard
8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                  Nasser, Mrs. Nicholas (Adele Achem)
Name: Name, dtype: object

In [121]:
from nameparser import HumanName

name = HumanName("Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)")
name


Out[121]:
<HumanName : [
	title: 'Mrs.' 
	first: 'Oscar' 
	middle: 'W' 
	last: 'Johnson' 
	suffix: ''
	nickname: 'Elisabeth Vilhelmina Berg'
]>

In [0]:
def title(name):
  return HumanName(name).title

def lastname(name):
  return HumanName(name).last

In [123]:
train["Title"] = train["Name"].apply(title)
train["Title"].head()


Out[123]:
0      Mr.
1     Mrs.
2    Miss.
3     Mrs.
4      Mr.
Name: Title, dtype: object

In [124]:
train["Lastname"] = train["Name"].apply(lastname)
train["Lastname"].head()


Out[124]:
0       Braund
1      Cumings
2    Heikkinen
3     Futrelle
4        Allen
Name: Lastname, dtype: object

In [125]:
# titleのカウント
train[["Title", "Survived", "PassengerId"]].groupby(["Title", "Survived"]).count()


Out[125]:
PassengerId
Title Survived
0 3
1 1
Capt. 0 1
Col. 0 1
1 1
Dr. 0 4
1 3
Lady. 1 1
Master. 0 17
1 23
Miss. 0 55
1 127
Mlle. 1 2
Mme. 1 1
Mr. 0 436
1 81
Mrs. 0 26
1 99
Ms. 1 1
Rev. 0 6
Sir. 1 1
the Countess. of 1 1

In [0]:
title_mapper = {"Capt.": "High Office", "Col.": "High Office", "Dr.": "High Office", "Lady.": "Miss.", "Mlle.": "Miss.", "Mme.": "Mrs.", "Ms.": "Miss.", "Rev.": "High Office", "Sir.": "High Office", "the Countess. of": "Mrs.", "Master.": "Master",
               "Miss.": "Miss.", "Mr.": "Mr.", "Mrs.": "Mrs"}
# Masterは男の子へのtitleなので、残す
train["Title_mapped"] = train["Title"].map(title_mapper)

In [127]:
train[["Title_mapped", "Survived"]].groupby(["Title_mapped"]).mean()


Out[127]:
Survived
Title_mapped
High Office 0.294118
Master 0.575000
Miss. 0.704301
Mr. 0.156673
Mrs 0.792000
Mrs. 1.000000

In [0]:
train["Title"] = train["Title_mapped"]

In [0]:
train["Title"] = train["Name"].apply(title).map(title_mapper)
test["Title"] = test["Name"].apply(title).map(title_mapper)

In [0]:
# surnameの扱い
train["Surname"] = train["Name"].apply(lastname)

In [0]:
train["FamilySize"] = train["SibSp"] + train["Parch"] + 1

In [0]:
train["SurnameLength"] = train["Surname"].apply(len)

In [133]:
# Pclassの確認
train["Pclass"].describe()


Out[133]:
count    891.000000
mean       2.308642
std        0.836071
min        1.000000
25%        2.000000
50%        3.000000
75%        3.000000
max        3.000000
Name: Pclass, dtype: float64

In [134]:
train["Sex"].describe()


Out[134]:
count      891
unique       2
top       male
freq       577
Name: Sex, dtype: object

In [0]:
train["Male"] = pd.get_dummies(train["Sex"])["male"]

In [136]:
train[ train["Title"] == "Mr." ]["Title"]


Out[136]:
0      Mr.
4      Mr.
5      Mr.
6      Mr.
12     Mr.
13     Mr.
17     Mr.
20     Mr.
21     Mr.
23     Mr.
26     Mr.
27     Mr.
29     Mr.
33     Mr.
34     Mr.
35     Mr.
36     Mr.
37     Mr.
42     Mr.
45     Mr.
46     Mr.
48     Mr.
51     Mr.
54     Mr.
55     Mr.
57     Mr.
60     Mr.
62     Mr.
64     Mr.
67     Mr.
      ... 
836    Mr.
837    Mr.
838    Mr.
839    Mr.
840    Mr.
841    Mr.
843    Mr.
844    Mr.
845    Mr.
846    Mr.
847    Mr.
851    Mr.
857    Mr.
859    Mr.
860    Mr.
861    Mr.
864    Mr.
867    Mr.
868    Mr.
870    Mr.
872    Mr.
873    Mr.
876    Mr.
877    Mr.
878    Mr.
881    Mr.
883    Mr.
884    Mr.
889    Mr.
890    Mr.
Name: Title, Length: 517, dtype: object

In [0]:
# Age
for title in set(train["Title"]):
  train.loc[ (train["Title"] == title) & (train["Title"].isna()), "Age" ] = train[ train["Title"] == title ]["Age"].median()

In [138]:
def parse_ticket(ticket):
  if not ticket.isdigit():
    ret = ticket.split(" ")[-1]
    if ret.isdigit():
      return int(ret)
    # 
    print(ticket)
    return None
  else:
    return ticket

train["Ticket"] = train["Ticket"].apply(parse_ticket)


LINE
LINE
LINE
LINE

In [0]:
train.loc[train["Ticket"].isna(), "Ticket"] = train[train["Pclass"] == 3.0].mean()

In [0]:


In [0]: