In [1]:
# create entry points to spark; stop any SparkContext left over from a previous run
try:
    sc.stop()
except NameError:
    # no existing SparkContext, nothing to stop
    pass

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

sc = SparkContext()
spark = SparkSession(sparkContext=sc)
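As an aside, with Spark 2.x and later the same entry points can also be obtained from the builder API. A minimal sketch (the application name here is just an example):
In [ ]:
# Sketch (Spark 2.x+): build the SparkSession directly and take the
# SparkContext from it, rather than constructing a SparkContext first.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('titanic-eda').getOrCreate()
sc = spark.sparkContext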
Data source: https://www.kaggle.com/c/titanic/data
In [5]:
titanic = spark.read.csv('../../data/kaggle-titanic-train.csv', header=True, inferSchema=True)
titanic.show(5)
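Note that inferSchema=True makes Spark scan the file an extra time to work out column types. A sketch of the alternative, assuming the standard Kaggle Titanic column layout, is to pass an explicit schema:
In [ ]:
from pyspark.sql.types import (StructType, StructField,
                               IntegerType, StringType, DoubleType)

# explicit schema (assumed Kaggle Titanic columns) avoids the extra
# type-inference pass over the CSV file
titanic_schema = StructType([
    StructField('PassengerId', IntegerType(), True),
    StructField('Survived', IntegerType(), True),
    StructField('Pclass', IntegerType(), True),
    StructField('Name', StringType(), True),
    StructField('Sex', StringType(), True),
    StructField('Age', DoubleType(), True),
    StructField('SibSp', IntegerType(), True),
    StructField('Parch', IntegerType(), True),
    StructField('Ticket', StringType(), True),
    StructField('Fare', DoubleType(), True),
    StructField('Cabin', StringType(), True),
    StructField('Embarked', StringType(), True),
])
titanic = spark.read.csv('../../data/kaggle-titanic-train.csv',
                         header=True, schema=titanic_schema)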
In [6]:
titanic.printSchema()
In [7]:
len(titanic.columns)
Out[7]:
12
In [8]:
titanic.count()
Out[8]:
891
In [9]:
def describe_columns(df):
    # show summary statistics for each column, one column at a time
    for i in df.columns:
        print('Column: ' + i)
        df.select(i).describe().show()
In [10]:
describe_columns(titanic)
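For a quicker overview, describe() also accepts several column names (or none at all, meaning every column), so the same summaries can be produced in a single call, for example:
In [ ]:
# one describe() over every column instead of one show() per column
titanic.describe().show()

# or restrict it to a few columns of interest
titanic.describe('Age', 'Fare').show()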
In [11]:
def find_missing_values_columns(df):
    # describe()'s first row is 'count', which counts only non-null values,
    # so any column whose count falls below the total row count has missing values
    nrow = df.count()
    for v in df.columns:
        summary_df = df.select(v).describe()
        v_count = int(summary_df.collect()[0][v])
        if v_count < nrow:
            missing_percentage = (1 - v_count/nrow) * 100
            print("Total observations: " + str(nrow) + "\n"
                  "Total observations of " + v + ": " + str(v_count) + "\n"
                  "Percentage of missing values: " + str(missing_percentage) + "%" + "\n"
                  "----------------------------")
In [12]:
find_missing_values_columns(titanic)
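An alternative, and usually cheaper, way to get per-column null counts is a single aggregation with isNull(); a sketch over the same titanic DataFrame:
In [ ]:
from pyspark.sql import functions as F

# count nulls for every column in one pass over the data
null_counts = titanic.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in titanic.columns
])
null_counts.show()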