In [2]:
import pandas as pd
import numpy as np
# Load the e-bike survey responses from the public GitHub repository and
# print the table's shape as a quick sanity check that the download worked.
SURVEY_CSV_URL = "https://raw.githubusercontent.com/samtalasila/e-bike-survey-response-results/master/E-Bike_Survey_Responses.csv"
survey_data = pd.read_csv(SURVEY_CSV_URL)
print(survey_data.shape)
In [13]:
# Summarise how many missing values appear per row and per column.
# The boolean NaN mask is built once and reused for every aggregation.
na_mask = survey_data.isna()
missing_values_rows = na_mask.sum(axis=1)  # one count per row
missing_values_cols = na_mask.sum(axis=0)  # one count per column
# NOTE(review): with no axis argument this is a per-column sum as well, i.e. it
# holds the same values as missing_values_cols rather than a single grand
# total -- confirm that is what downstream code expects.
missing_values_total_sum = na_mask.sum()
In [30]:
# plot the number of missing data by column
%matplotlib inline
import seaborn as sns
colnames = [name for name, value in missing_values_cols.items()]
values = [value for name, value in missing_values_cols.items()]
p = sns.barplot([a for a in range(len(colnames))], values)
p.set_title("Number of missing data by column")
print("The sixth feature is missing much moreso than usual: ", colnames[5])
print("Minor missing effects by column also observed for features: ")
print(colnames[2],"\n",colnames[4],"\n",colnames[6])
In [26]:
# For those rows with any missing data, plot the number of missing values per row.
missing_values_rows_nonzero = missing_values_rows[missing_values_rows > 0]

# NOTE: despite the name, `colnames` holds the row index labels here; the name
# is kept so any later cell that reads it keeps working.
colnames = missing_values_rows_nonzero.index.tolist()
values = missing_values_rows_nonzero.tolist()

# x and y are passed by keyword: seaborn >= 0.12 made every barplot parameter
# after `data` keyword-only, so the positional form raises TypeError there.
# Bars are placed at 0..k-1 (rank among the affected rows), not at the
# original row labels.
p = sns.barplot(x=list(range(len(colnames))), y=values)
p.set_title("Number of missing data by row with at least one non-zero value")
p.set_xlabel("Affected row (position among rows with missing data)")
p.set_ylabel("Number of missing values")
Out[26]: