In [26]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
In [27]:
# Relative location and file name of the raw training data.
data_dir = '../data/raw/'
data_filename = 'blood_train.csv'

# Read the CSV into a DataFrame, then preview its first five rows.
df_blood = pd.read_csv(''.join((data_dir, data_filename)))
df_blood.head(5)
Out[27]:
In [ ]:
# FILL IN TEST
# FILL IN ACTION
In [44]:
# Summary statistics for every column except the first one.
df_blood.iloc[:, slice(1, None)].describe()
Out[44]:
Insights from Summary stats table:
Variable | Value | Interpretation |
---|---|---|
Number of data points N | 576 | A relatively small dataset |
Mean of "Made Donation in March 2007" | 0.2396 | Only about 24% of the people in the dataset donated blood in March 2007 |
Max Months since 1st Donation | 98 | Earliest donation was 98 months (~8 years) ago |
Average number of donations | 5.427 | People in the dataset have donated ~5.4 times on average |
In [45]:
# Pairwise scatter-plot matrix of every column except the first one.
# `pd.scatter_matrix` was deprecated and later removed from the top-level
# pandas namespace; the supported entry point is `pandas.plotting`.
plot_scatter = pd.plotting.scatter_matrix(df_blood.iloc[:, 1:],
                                          figsize=(20, 20))
In [29]:
import seaborn as sns
In [70]:
# Global plot appearance: "notebook" context with fonts scaled 1.5x,
# on a dark grid with a light-grey axes background.
sns.set_context("notebook", font_scale=1.5)
sns.set_style("darkgrid", {"axes.facecolor": ".9"})

# set_context() only applies rc keys that are part of the context definition,
# so a 'figure.figsize' entry passed through it can be silently dropped.
# Set the default figure size on matplotlib directly instead.
# NOTE(review): figure-level seaborn plots such as lmplot take their size from
# their own arguments (e.g. aspect), not from this default.
plt.rcParams['figure.figsize'] = [11, 8]
In [77]:
# Scatter of donation count against months since first donation, coloured by
# whether the person donated in March 2007; the regression fit is disabled.
# The marker shape is passed through `markers`: a "marker" entry inside
# scatter_kws is overwritten by lmplot's own `markers` argument (default "o"),
# so the diamond shape would otherwise never be drawn.
g = sns.lmplot(data=df_blood,
               x='Number of Donations',
               y='Months since First Donation',
               hue='Made Donation in March 2007',
               fit_reg=False,
               palette='RdYlBu',
               aspect=3,
               markers='D',
               scatter_kws={"s": 50})
In [ ]: