In [3]:

    
# Libraries used by the notebook
import pandas as pd  # data import and manipulation
import seaborn as sns  # data visualization

# Plot styling: seaborn "whitegrid" style with colour codes enabled
sns.set(color_codes=True, style="whitegrid")

# Load the raw CSV. low_memory=False tells pandas not to process the file in
# chunks, so each column's dtype is inferred from the whole file (avoids
# mixed-type columns) at the cost of more memory while parsing.
data = pd.read_csv("data-taarifa.csv", low_memory=False)

# Two independent working copies of the raw frame
sub1, sub2 = data.copy(), data.copy()



In [5]:

    
# Checking the data type of the variables: one dtype per column of the working
# copy; text/categorical columns show up as `object`.
sub1.dtypes









    Out[5]:





id                         int64
amount_tsh               float64
date_recorded             object
funder                    object
gps_height                 int64
installer                 object
longitude                float64
latitude                 float64
wpt_name                  object
num_private                int64
basin                     object
subvillage                object
region                    object
region_code                int64
district_code              int64
lga                       object
ward                      object
population                 int64
public_meeting            object
recorded_by               object
scheme_management         object
scheme_name               object
permit                    object
construction_year          int64
extraction_type           object
extraction_type_group     object
extraction_type_class     object
management                object
management_group          object
payment                   object
payment_type              object
water_quality             object
quality_group             object
quantity                  object
quantity_group            object
source                    object
source_type               object
source_class              object
waterpoint_type           object
waterpoint_type_group     object
dtype: object



In [6]:

    
# Preview the first six rows of the working copy
sub1.head(n=6)









    Out[6]:






  
    
      
      id
      amount_tsh
      date_recorded
      funder
      gps_height
      installer
      longitude
      latitude
      wpt_name
      num_private
      ...
      payment_type
      water_quality
      quality_group
      quantity
      quantity_group
      source
      source_type
      source_class
      waterpoint_type
      waterpoint_type_group
    
  
  
    
      0
      69572
      6000.0
      14-03-11
      Roman
      1390
      Roman
      34.938093
      -9.856322
      none
      0
      ...
      annually
      soft
      good
      enough
      enough
      spring
      spring
      groundwater
      communal standpipe
      communal standpipe
    
    
      1
      8776
      0.0
      06-03-13
      Grumeti
      1399
      GRUMETI
      34.698766
      -2.147466
      Zahanati
      0
      ...
      never pay
      soft
      good
      insufficient
      insufficient
      rainwater harvesting
      rainwater harvesting
      surface
      communal standpipe
      communal standpipe
    
    
      2
      34310
      25.0
      25-02-13
      Lottery Club
      686
      World vision
      37.460664
      -3.821329
      Kwa Mahundi
      0
      ...
      per bucket
      soft
      good
      enough
      enough
      dam
      dam
      surface
      communal standpipe multiple
      communal standpipe
    
    
      3
      67743
      0.0
      28-01-13
      Unicef
      263
      UNICEF
      38.486161
      -11.155298
      Zahanati Ya Nanyumbu
      0
      ...
      never pay
      soft
      good
      dry
      dry
      machine dbh
      borehole
      groundwater
      communal standpipe multiple
      communal standpipe
    
    
      4
      19728
      0.0
      13-07-11
      Action In A
      0
      Artisan
      31.130847
      -1.825359
      Shuleni
      0
      ...
      never pay
      soft
      good
      seasonal
      seasonal
      rainwater harvesting
      rainwater harvesting
      surface
      communal standpipe
      communal standpipe
    
    
      5
      9944
      20.0
      13-03-11
      Mkinga Distric Coun
      0
      DWE
      39.172796
      -4.765587
      Tajiri
      0
      ...
      per bucket
      salty
      salty
      enough
      enough
      other
      other
      unknown
      communal standpipe multiple
      communal standpipe
    
  

6 rows × 40 columns

Distribution Analysis of the data

Now that we are familiar with the basic characteristics of the data, let's look at the distribution of the variables, starting with the continuous ones.

Distribution analysis of the continuous variables using describe()



In [7]:

    
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
sub1.describe()









    Out[7]:






  
    
      
      id
      amount_tsh
      gps_height
      longitude
      latitude
      num_private
      region_code
      district_code
      population
      construction_year
    
  
  
    
      count
      59400.000000
      59400.000000
      59400.000000
      59400.000000
      5.940000e+04
      59400.000000
      59400.000000
      59400.000000
      59400.000000
      59400.000000
    
    
      mean
      37115.131768
      317.650385
      668.297239
      34.077427
      -5.706033e+00
      0.474141
      15.297003
      5.629747
      179.909983
      1300.652475
    
    
      std
      21453.128371
      2997.574558
      693.116350
      6.567432
      2.946019e+00
      12.236230
      17.587406
      9.633649
      471.482176
      951.620547
    
    
      min
      0.000000
      0.000000
      -90.000000
      0.000000
      -1.164944e+01
      0.000000
      1.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      18519.750000
      0.000000
      0.000000
      33.090347
      -8.540621e+00
      0.000000
      5.000000
      2.000000
      0.000000
      0.000000
    
    
      50%
      37061.500000
      0.000000
      369.000000
      34.908743
      -5.021597e+00
      0.000000
      12.000000
      3.000000
      25.000000
      1986.000000
    
    
      75%
      55656.500000
      20.000000
      1319.250000
      37.178387
      -3.326156e+00
      0.000000
      17.000000
      5.000000
      215.000000
      2004.000000
    
    
      max
      74247.000000
      350000.000000
      2770.000000
      40.345193
      -2.000000e-08
      1776.000000
      99.000000
      80.000000
      30500.000000
      2013.000000

Distribution analysis of the categorical variables using value_counts()



In [8]:

    
# Number of rows per extraction-type class
sub1["extraction_type_class"].value_counts()









    Out[8]:





gravity         26780
handpump        16456
other            6430
submersible      6179
motorpump        2987
rope pump         451
wind-powered      117
Name: extraction_type_class, dtype: int64



In [9]:

    
# Number of rows per payment type
sub1["payment_type"].value_counts()









    Out[9]:





never pay     25348
per bucket     8985
monthly        8300
unknown        8157
on failure     3914
annually       3642
other          1054
Name: payment_type, dtype: int64



In [10]:

    
# Number of rows per water-quality group
sub1["quality_group"].value_counts()









    Out[10]:





good        50818
salty        5195
unknown      1876
milky         804
colored       490
fluoride      217
Name: quality_group, dtype: int64



In [11]:

    
# Number of rows per water-quantity group
sub1["quantity_group"].value_counts()









    Out[11]:





enough          33186
insufficient    15129
dry              6246
seasonal         4050
unknown           789
Name: quantity_group, dtype: int64



In [12]:

    
# Number of rows per waterpoint-type group
sub1["waterpoint_type_group"].value_counts()









    Out[12]:





communal standpipe    34625
hand pump             17488
other                  6380
improved spring         784
cattle trough           116
dam                       7
Name: waterpoint_type_group, dtype: int64



In [13]:

    
# Number of rows per detailed water-quality label
sub1["water_quality"].value_counts()









    Out[13]:





soft                  50818
salty                  4856
unknown                1876
milky                   804
coloured                490
salty abandoned         339
fluoride                200
fluoride abandoned       17
Name: water_quality, dtype: int64



In [14]:

    
# Number of rows per water source type
sub1["source_type"].value_counts()









    Out[14]:





spring                  17021
shallow well            16824
borehole                11949
river/lake              10377
rainwater harvesting     2295
dam                       656
other                     278
Name: source_type, dtype: int64

Converting the categorical predictors to numeric codes using LabelEncoder from the sklearn library



In [15]:

    
from sklearn.preprocessing import LabelEncoder

# Categorical columns that are converted to integer codes in sub1
var_mod = [
    'extraction_type_class',
    'payment_type',
    'quality_group',
    'quantity_group',
    'waterpoint_type_group',
    'water_quality',
    'source_type',
]

# Fit a separate encoder for every column and keep it in `encoders`, so the
# integer codes can be translated back to their labels later via
# encoders[col].classes_ or encoders[col].inverse_transform(...).
# Re-fitting one shared encoder would discard every mapping except the last.
encoders = {}
for col in var_mod:
    le = LabelEncoder()
    # Overwrites the column in sub1 with its integer codes
    sub1[col] = le.fit_transform(sub1[col])
    encoders[col] = le

# Show the dtypes again to confirm the listed columns are now integers
sub1.dtypes









    Out[15]:





id                         int64
amount_tsh               float64
date_recorded             object
funder                    object
gps_height                 int64
installer                 object
longitude                float64
latitude                 float64
wpt_name                  object
num_private                int64
basin                     object
subvillage                object
region                    object
region_code                int64
district_code              int64
lga                       object
ward                      object
population                 int64
public_meeting            object
recorded_by               object
scheme_management         object
scheme_name               object
permit                    object
construction_year          int64
extraction_type           object
extraction_type_group     object
extraction_type_class      int64
management                object
management_group          object
payment                   object
payment_type               int64
water_quality              int64
quality_group              int64
quantity                  object
quantity_group             int64
source                    object
source_type                int64
source_class              object
waterpoint_type           object
waterpoint_type_group      int64
dtype: object

Data Visualization of various predictors for this study



In [17]:

    
%matplotlib inline
sub1['permit'].hist(bins=10)









    Out[17]:





<matplotlib.axes._subplots.AxesSubplot at 0xbd77a20>



In [18]:

    
# Cross-tabulate water quality (rows) against water source type (columns).
# sub2 is used because, in the cells shown, it is never label-encoded and so
# still carries the original category labels.
t1 = pd.crosstab(sub2['water_quality'], sub2['source_type'])

# Stacked bar chart: one bar per water-quality category, segmented by source
# type. kind='hist' would instead bin the count values themselves, which does
# not show the relationship named in the title.
t1.plot(
    kind='bar',
    stacked=True,
    grid=False,
    legend=True,
    title="Water quality based on type of water source",
)









    Out[18]:





<matplotlib.axes._subplots.AxesSubplot at 0xbf7aef0>



In [60]:

    
# Cross-tabulate water source type (rows) against payment type (columns),
# using the un-encoded copy so the axis shows category labels.
t2 = pd.crosstab(sub2['source_type'], sub2['payment_type'])

# Stacked bar chart: one bar per source type, segmented by payment type
# (kind='hist' would bin the count values rather than compare categories).
# The title names the variables actually tabulated above.
# NOTE(review): if water quality vs. payment type was the intended comparison,
# change the first crosstab argument to sub2['water_quality'] instead.
t2.plot(
    kind='bar',
    stacked=True,
    grid=False,
    legend=True,
    title="Water source type and types of payment",
)









    Out[60]:





<matplotlib.axes._subplots.AxesSubplot at 0x18644438>



In [20]:

    
%matplotlib inline
sns.violinplot(x=sub1.extraction_type_class, y=sub1.source_type)









    Out[20]:





<matplotlib.axes._subplots.AxesSubplot at 0xc229b70>



In [21]:

    
%matplotlib inline
sns.pointplot(x="extraction_type_class", y="water_quality", data=sub1)









    Out[21]:





<matplotlib.axes._subplots.AxesSubplot at 0x12c22390>



In [41]:

    
%matplotlib inline
sns.violinplot(x="waterpoint_type_group", y="source_type", hue="water_quality", data=sub1)









    Out[41]:





<matplotlib.axes._subplots.AxesSubplot at 0x1667edd8>

Distributions of observations within categories

At a certain point, the categorical scatterplot approach becomes limited in the information it can provide about the distribution of values within each category. There are several ways to summarize this information in ways that facilitate easy comparisons across the category levels.

Boxplots

This kind of plot shows the three quartile values of the distribution along with extreme values. The “whiskers” extend to points that lie within 1.5 IQRs of the lower and upper quartile, and then observations that fall outside this range are displayed independently. Importantly, this means that each value in the boxplot corresponds to an actual observation in the data.



In [27]:

    
# Box plot of encoded payment_type for each encoded source_type
sns.boxplot(
    data=sub1,
    x="source_type",
    y="payment_type",
)









    Out[27]:





<matplotlib.axes._subplots.AxesSubplot at 0x145d5e48>

Violinplots

A different approach is a violinplot(), which combines a boxplot with the kernel density estimation procedure.



In [36]:

    
# Violin plot of encoded source_type per extraction_type_class, drawn with scale="count"
sns.violinplot(
    data=sub1,
    x="extraction_type_class",
    y="source_type",
    scale="count",
)









    Out[36]:





<matplotlib.axes._subplots.AxesSubplot at 0x15b92ba8>



In [39]:

    
# Violin plot of encoded source_type per extraction_type_class with `split` enabled.
# `split` is a boolean flag, so pass True rather than the string "True" (which
# only behaves as "on" because a non-empty string is truthy).
# NOTE(review): seaborn documents `split` in connection with `hue`; no hue is
# given here — confirm whether a hue variable was intended.
sns.violinplot(x="extraction_type_class", y="source_type", split=True, data=sub1)









    Out[39]:





<matplotlib.axes._subplots.AxesSubplot at 0x15f23b00>

Statistical distribution within categories

Often, rather than showing the distribution within each category, you might want to show the central tendency of the values.

Barplots



In [43]:

    
# Bar plot of encoded source_type for each encoded extraction_type_class
sns.barplot(
    data=sub1,
    x="extraction_type_class",
    y="source_type",
)









    Out[43]:





<matplotlib.axes._subplots.AxesSubplot at 0x16aae3c8>



In [48]:

    
# Bar plot of encoded quality_group for each encoded quantity_group
sns.barplot(
    data=sub1,
    x="quantity_group",
    y="quality_group",
)









    Out[48]:





<matplotlib.axes._subplots.AxesSubplot at 0x1643de80>

A special case for the bar plot is when you want to show the number of observations in each category rather than computing a statistic for a second variable. This is similar to a histogram over a categorical, rather than quantitative, variable. In seaborn, it’s easy to do so with the countplot() function.



In [61]:

    
# Count of rows per quality_group, taken from the raw frame `data`
sns.countplot(
    data=data,
    x="quality_group",
)









    Out[61]:





<matplotlib.axes._subplots.AxesSubplot at 0x18a54d30>



In [55]:

    
# Horizontal count plot of the encoded payment_type values in sub1
sns.countplot(
    data=sub1,
    y="payment_type",
)









    Out[55]:





<matplotlib.axes._subplots.AxesSubplot at 0x17dccc50>

Point plots

An alternative style for visualizing the same information is offered by the pointplot() function. This function also encodes the value of the estimate with height on the other axis, but rather than show a full bar it just plots the point estimate and confidence interval. Additionally, pointplot connects points from the same hue category. This makes it easy to see how the main relationship is changing as a function of a second variable.



In [56]:

    
# Point plot of encoded waterpoint_type_group against encoded extraction_type_class
sns.pointplot(
    data=sub1,
    x="extraction_type_class",
    y="waterpoint_type_group",
)









    Out[56]:





<matplotlib.axes._subplots.AxesSubplot at 0x18019e10>



In [58]:

    
# Point plot of encoded water_quality against encoded extraction_type_class.
# No `hue` variable is given, so there is a single series: pass one marker and
# one line style. List values are meant for styling each hue level separately.
sns.pointplot(x="extraction_type_class", y="water_quality", markers="^", linestyles="-", data=sub1)









    Out[58]:





<matplotlib.axes._subplots.AxesSubplot at 0x16b79eb8>



In [ ]:

	id	amount_tsh	date_recorded	funder	gps_height	installer	longitude	latitude	wpt_name	...	payment_type	water_quality	quality_group	quantity	quantity_group	source	source_type	source_class	waterpoint_type	waterpoint_type_group
0	69572	6000.0	14-03-11	Roman	1390	Roman	34.938093	-9.856322	none	...	annually	soft	good	enough	enough	spring	spring	groundwater	communal standpipe	communal standpipe
1	8776	0.0	06-03-13	Grumeti	1399	GRUMETI	34.698766	-2.147466	Zahanati	...	never pay	soft	good	insufficient	insufficient	rainwater harvesting	rainwater harvesting	surface	communal standpipe	communal standpipe
2	34310	25.0	25-02-13	Lottery Club	686	World vision	37.460664	-3.821329	Kwa Mahundi	...	per bucket	soft	good	enough	enough	dam	dam	surface	communal standpipe multiple	communal standpipe
3	67743	0.0	28-01-13	Unicef	263	UNICEF	38.486161	-11.155298	Zahanati Ya Nanyumbu	...	never pay	soft	good	dry	dry	machine dbh	borehole	groundwater	communal standpipe multiple	communal standpipe
4	19728	0.0	13-07-11	Action In A	0	Artisan	31.130847	-1.825359	Shuleni	...	never pay	soft	good	seasonal	seasonal	rainwater harvesting	rainwater harvesting	surface	communal standpipe	communal standpipe
5	9944	20.0	13-03-11	Mkinga Distric Coun	0	DWE	39.172796	-4.765587	Tajiri	...	per bucket	salty	salty	enough	enough	other	other	unknown	communal standpipe multiple	communal standpipe

	id	amount_tsh	gps_height	longitude	latitude	num_private	region_code	district_code	population	construction_year
count	59400.000000	59400.000000	59400.000000	59400.000000	5.940000e+04	59400.000000	59400.000000	59400.000000	59400.000000	59400.000000
mean	37115.131768	317.650385	668.297239	34.077427	-5.706033e+00	0.474141	15.297003	5.629747	179.909983	1300.652475
std	21453.128371	2997.574558	693.116350	6.567432	2.946019e+00	12.236230	17.587406	9.633649	471.482176	951.620547
min	0.000000	0.000000	-90.000000	0.000000	-1.164944e+01	0.000000	1.000000	0.000000	0.000000	0.000000
25%	18519.750000	0.000000	0.000000	33.090347	-8.540621e+00	0.000000	5.000000	2.000000	0.000000	0.000000
50%	37061.500000	0.000000	369.000000	34.908743	-5.021597e+00	0.000000	12.000000	3.000000	25.000000	1986.000000
75%	55656.500000	20.000000	1319.250000	37.178387	-3.326156e+00	0.000000	17.000000	5.000000	215.000000	2004.000000
max	74247.000000	350000.000000	2770.000000	40.345193	-2.000000e-08	1776.000000	99.000000	80.000000	30500.000000	2013.000000