This notebook closely follows the data analysis framework tutorial from Kaggle. Thanks to the author, LD Freeman, for creating such a great tutorial! My goal is to create an Apache Spark version using the same framework.
In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
In [2]:
# create a SparkContext with the default configuration
sc = SparkContext(conf=SparkConf())
# wrap the context in a SparkSession, the entry point to the DataFrame API
spark = SparkSession(sparkContext=sc)
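As an aside, on Spark 2.0+ the session is usually created with the builder pattern, which creates (or reuses) the underlying SparkContext for you. A minimal sketch, with an illustrative app name:

from pyspark.sql import SparkSession

# builder pattern: getOrCreate() returns an existing session or builds a new one
spark = SparkSession.builder \
    .appName('titanic-framework') \
    .getOrCreate()
sc = spark.sparkContext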
The datasets can be found here: https://www.kaggle.com/c/titanic/data. They are also available in this GitHub repository:
In [3]:
# load packages
import sys
print('Python version: {}'. format(sys.version))
import pandas as pd
print('pandas version: {}'. format(pd.__version__))
import matplotlib
print('matplotlib version: {}'. format(matplotlib.__version__))
import numpy as np
print('numpy version: {}'. format(np.__version__))
import scipy as sp
print('scipy version: {}'. format(sp.__version__))
import IPython
from IPython import display # pretty printing of dataframe in Jupyter notebook
print('IPython version: {}'. format(IPython.__version__))
import pyspark
print('Apache Spark Pyspark version: {}'. format(pyspark.__version__)) # pyspark version
# misc libraries
import random
import time
# ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)
In [4]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import OneVsRest
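All of these classifiers share the same Estimator interface: they expect a numeric feature vector column and a label column. As a preview of how they will be wired up later, here is a minimal sketch of a Pipeline with LogisticRegression; the feature columns chosen here are illustrative placeholders, not the final feature engineering:

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

# assemble a few numeric columns into a single 'features' vector (illustrative choice only)
assembler = VectorAssembler(inputCols=['Pclass', 'SibSp', 'Parch'], outputCol='features')
lr = LogisticRegression(featuresCol='features', labelCol='Survived')
pipeline = Pipeline(stages=[assembler, lr])
# model = pipeline.fit(training_df)  # training_df: a DataFrame containing the columns above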
In [5]:
from subprocess import check_output
print('-'*10, 'datasets', '-'*10)
print(check_output(['ls', 'data/titanic']).decode('utf8'))
In [6]:
# import data
# we will split the train data into train and test data in future sections
data_raw = spark.read.csv('data/titanic/train.csv', inferSchema=True, header=True)
# the test file provided is for validation of the final model.
data_val = spark.read.csv('data/titanic/test.csv', inferSchema=True, header=True)
# preview the data
# data type
print('-'*10, 'data types', '-'*10)
pd.DataFrame(data_raw.dtypes)
Out[6]:
In [7]:
# data summary
print('-'*10, 'data summary', '-'*10)
data_raw.describe().toPandas()
Out[7]:
In [8]:
# view a small subset of the data
print('-'*10, 'randomly sample ~1% of the data to view', '-'*10)
data_raw.randomSplit([0.01, 0.99])[0].toPandas()
Out[8]:
In [9]:
# we first check which values are NULL for each column
# then we convert the boolean values to int (0 and 1) so we can count how many 1's exist in each column
print('-'*25)
print('0: is not NULL')
print('1: is NULL')
print('-'*25)
print(' '*25)
# we build column strings and then use eval() to convert strings to column expressions.
data_raw.select([eval('data_raw.' + x + '.isNull().cast("int").alias("' + x + '")') for x in data_raw.columns]).show(n=10)
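The eval() trick works, but the same null-indicator columns can be built without eval() using pyspark.sql.functions, which is the more idiomatic approach. A minimal equivalent sketch:

from pyspark.sql import functions as F

# same null-indicator columns, built with F.col instead of eval()
null_flags = data_raw.select([F.col(c).isNull().cast('int').alias(c) for c in data_raw.columns])
null_flags.show(n=10)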
In [10]:
print('Train columns with null values:')
print('-'*25)
data_raw.select([eval('data_raw.' + x + '.isNull().cast("int").alias("' + x + '")') for x in data_raw.columns]).\
groupBy().sum().toPandas()
Out[10]:
In [11]:
print('Test columns with null values:')
print('-'*25)
data_val.select([eval('data_val.' + x + '.isNull().cast("int").alias("' + x + '")') for x in data_val.columns]).\
groupBy().sum().toPandas()
Out[11]:
In [23]:
# COMPLETE: complete or delete missing values in train and test/validation dataset.
# complete missing age with median
# complete missing embarked with mode
# complete missing fare with median
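A minimal sketch of how these completions could be done in PySpark, assuming approxQuantile for the medians and a frequency count for the mode (the exact approach may be refined in later sections):

from pyspark.sql import functions as F

# median Age and Fare via approximate quantiles (relative error 0.0 gives the exact median)
age_median = data_raw.approxQuantile('Age', [0.5], 0.0)[0]
fare_median = data_raw.approxQuantile('Fare', [0.5], 0.0)[0]

# mode of Embarked: the most frequent non-null value
embarked_mode = (data_raw.filter(F.col('Embarked').isNotNull())
                 .groupBy('Embarked').count()
                 .orderBy(F.desc('count'))
                 .first()['Embarked'])

data_raw = data_raw.fillna({'Age': age_median, 'Embarked': embarked_mode})
data_val = data_val.fillna({'Age': age_median, 'Fare': fare_median, 'Embarked': embarked_mode})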
In [24]:
# select() is a lazy transformation, so the output below is just the DataFrame handle, not the data
data_raw.select('Age')
Out[24]:
In [ ]: