Load the relevant libraries


In [1]:
import pandas as pd # for data import and dissection
import numpy as np # for data analysis
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

set the stage for data visualization


In [2]:
plt.interactive(False)
sns.set(style="whitegrid",color_codes=True)

Load the dataset and make a copy of it


In [3]:
# Reading the data where low_memory=False increases the program efficiency
data= pd.read_csv("data-taarifa.csv", low_memory=False)
sub1=data.copy()

List all the variables in the dataset


In [4]:
list(sub1.columns.values)


Out[4]:
['id',
 'amount_tsh',
 'date_recorded',
 'funder',
 'gps_height',
 'installer',
 'longitude',
 'latitude',
 'wpt_name',
 'num_private',
 'basin',
 'subvillage',
 'region',
 'region_code',
 'district_code',
 'lga',
 'ward',
 'population',
 'public_meeting',
 'recorded_by',
 'scheme_management',
 'scheme_name',
 'permit',
 'construction_year',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'quantity_group',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group',
 'status_group']

Data standardisation- Lowercase all the variable names


In [5]:
#lowercase all variables
sub1.columns = [x.lower() for x in sub1.columns]

A quick peek at the dataset


In [6]:
sub1.head(5)


Out[6]:
id amount_tsh date_recorded funder gps_height installer longitude latitude wpt_name num_private ... water_quality quality_group quantity quantity_group source source_type source_class waterpoint_type waterpoint_type_group status_group
0 69572 6000.0 14-03-11 Roman 1390 Roman 34.938093 -9.856322 none 0 ... soft good enough enough spring spring groundwater communal standpipe communal standpipe functional
1 8776 0.0 06-03-13 Grumeti 1399 GRUMETI 34.698766 -2.147466 Zahanati 0 ... soft good insufficient insufficient rainwater harvesting rainwater harvesting surface communal standpipe communal standpipe functional
2 34310 25.0 25-02-13 Lottery Club 686 World vision 37.460664 -3.821329 Kwa Mahundi 0 ... soft good enough enough dam dam surface communal standpipe multiple communal standpipe functional
3 67743 0.0 28-01-13 Unicef 263 UNICEF 38.486161 -11.155298 Zahanati Ya Nanyumbu 0 ... soft good dry dry machine dbh borehole groundwater communal standpipe multiple communal standpipe non functional
4 19728 0.0 13-07-11 Action In A 0 Artisan 31.130847 -1.825359 Shuleni 0 ... soft good seasonal seasonal rainwater harvesting rainwater harvesting surface communal standpipe communal standpipe functional

5 rows × 41 columns

Problem challenge # 1

Missing value treatment

A majority of the variables in this dataset are categorical. Therefore, I treat the missing values to the mode (replacement of missing values by most frequently occuring values)


In [7]:
## To fill every column with its own most frequent value you can use
sub1 = sub1.apply(lambda x:x.fillna(x.value_counts().index[0]))

Problem challenge # 2

Using one hot encoding to convert categorical variables of interest to dummy continuous numbers

For reference see this Kaggle post by Mark, https://www.kaggle.com/c/titanic/forums/t/5379/handling-categorical-data-with-sklearn


In [8]:
from sklearn import preprocessing
le_enc = preprocessing.LabelEncoder()
#to convert into numbers
sub1.permit = le_enc.fit_transform(sub1.permit)
sub1.extraction_type_class=le_enc.fit_transform(sub1.extraction_type_class)
sub1.payment_type=le_enc.fit_transform(sub1.payment_type)
sub1.quality_group=le_enc.fit_transform(sub1.quality_group)
sub1.quantity_group=le_enc.fit_transform(sub1.quantity_group)
sub1.waterpoint_type_group=le_enc.fit_transform(sub1.waterpoint_type_group)
sub1.water_quality=le_enc.fit_transform(sub1.water_quality)
sub1.source_class=le_enc.fit_transform(sub1.source_class)
sub1.status_group=le_enc.fit_transform(sub1.status_group)

Another quick peek at the dataset. Notice, variables of interest like 'permit', 'extraction_type', 'payment_type', 'quality_group' have now been assigned dummy codes as required


In [9]:
sub1.head(5)


Out[9]:
id amount_tsh date_recorded funder gps_height installer longitude latitude wpt_name num_private ... water_quality quality_group quantity quantity_group source source_type source_class waterpoint_type waterpoint_type_group status_group
0 69572 6000.0 14-03-11 Roman 1390 Roman 34.938093 -9.856322 none 0 ... 6 2 enough 1 spring spring 0 communal standpipe 1 0
1 8776 0.0 06-03-13 Grumeti 1399 GRUMETI 34.698766 -2.147466 Zahanati 0 ... 6 2 insufficient 2 rainwater harvesting rainwater harvesting 1 communal standpipe 1 0
2 34310 25.0 25-02-13 Lottery Club 686 World vision 37.460664 -3.821329 Kwa Mahundi 0 ... 6 2 enough 1 dam dam 1 communal standpipe multiple 1 0
3 67743 0.0 28-01-13 Unicef 263 UNICEF 38.486161 -11.155298 Zahanati Ya Nanyumbu 0 ... 6 2 dry 0 machine dbh borehole 0 communal standpipe multiple 1 2
4 19728 0.0 13-07-11 Action In A 0 Artisan 31.130847 -1.825359 Shuleni 0 ... 6 2 seasonal 3 rainwater harvesting rainwater harvesting 1 communal standpipe 1 0

5 rows × 41 columns

explanatory variable is also known as the 'Independent variable' and response variable is also known as the 'dependent variable'
Variables of interest for this study are;

status_group,extraction_type_class,payment_type,quality_group,quantity_group,waterpoint_type_group,source_class,permit,water_quality

Correlational Analysis for variables of interest

From a previous post(), I want to determine if there is any relationship between water quality, water quantity and water resource characterstics. So I use the regression methods as shown below to determine this.


In [10]:
print ("OLS regresssion model for the association between water pump condition status and quality of water in it")
reg1=smf.ols('status_group~permit',data=sub1).fit()
print (reg1.summary())


OLS regresssion model for the association between water pump condition status and quality of water in it
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           status_group   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     66.57
Date:                Sun, 19 Jun 2016   Prob (F-statistic):           3.44e-16
Time:                        12:43:01   Log-Likelihood:                -81191.
No. Observations:               59400   AIC:                         1.624e+05
Df Residuals:                   59398   BIC:                         1.624e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      0.8903      0.007    124.048      0.000         0.876     0.904
permit        -0.0697      0.009     -8.159      0.000        -0.086    -0.053
==============================================================================
Omnibus:                      969.918   Durbin-Watson:                   1.992
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             9127.334
Skew:                           0.321   Prob(JB):                         0.00
Kurtosis:                       1.190   Cond. No.                         3.45
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Status group is the response or dependent variable and permit is the independent variable.

The number of observations show the no. of observations that had valid data and thus were included in the analysis. The F-statistic is 66.57 and the p value is very small (Prob (F-statistic))= 3.44e-16 considerably less than our alpha level of 0.05 which tell us that we can reject the null hypothesis and conclude that permit is significantly associated with water pump status group.

The linear regression equation Y = b0 + b1X where X is the explanatory variable or the independent variable and Y is the response or the dependent variable.--(EQN 1)

Note: EQN 1 is significant because it can also help us in prediction of Y. Next, we look at the parameter estimates or the coeffecients or beta weights . Thus the coeffecient for permit is -0.0697 and the intercept is 0.8903.

Than the best fit line for permit is; status_group=0.89+0.06*permit -- (EQN 2)

In the above example, lets say we are told that a country has 80% people with valid water permits than can we predict the status of the water pump device?

Yes, we plug the value of 80 in EQN 2 as given b0 = 0.89, b1 = 0.06 permit= 80

Than, y(hat) = 0.89+0.06*80 y(hat)= 5.69 or we can say that for 80% people with valid permits there will be approximately 6% water pumps that are functional

Also note the P>|t| value is very small for permit. It is 0.0 and that the R-squared value is 0.001.

We now know that this model accounts for 0.001% variability that we see in our response variable permit.


In [11]:
# Now, I continue to add the variables to this model to check for any loss of significance
print ("OLS regresssion model for the association between status_group and other variables of interest")
reg1=smf.ols('status_group~quantity_group+extraction_type_class+waterpoint_type_group',data=sub1).fit()
print (reg1.summary())


OLS regresssion model for the association between status_group and other variables of interest
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           status_group   R-squared:                       0.078
Model:                            OLS   Adj. R-squared:                  0.078
Method:                 Least Squares   F-statistic:                     1674.
Date:                Sun, 19 Jun 2016   Prob (F-statistic):               0.00
Time:                        12:43:12   Log-Likelihood:                -78815.
No. Observations:               59400   AIC:                         1.576e+05
Df Residuals:                   59396   BIC:                         1.577e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 0.6823      0.009     72.988      0.000         0.664     0.701
quantity_group           -0.1499      0.005    -32.163      0.000        -0.159    -0.141
extraction_type_class     0.0788      0.002     33.578      0.000         0.074     0.083
waterpoint_type_group     0.1255      0.003     45.401      0.000         0.120     0.131
==============================================================================
Omnibus:                      972.587   Durbin-Watson:                   1.991
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             6805.233
Skew:                           0.321   Prob(JB):                         0.00
Kurtosis:                       1.471   Cond. No.                         8.68
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [12]:
print ("OLS regresssion model for the association between water pump status group and all variables of interest")
reg1=smf.ols('status_group~extraction_type_class+payment_type+quality_group+quantity_group+waterpoint_type_group+source_class+permit+water_quality',data=sub1).fit()
print (reg1.summary())


OLS regresssion model for the association between water pump status group and all variables of interest
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           status_group   R-squared:                       0.094
Model:                            OLS   Adj. R-squared:                  0.094
Method:                 Least Squares   F-statistic:                     772.1
Date:                Sun, 19 Jun 2016   Prob (F-statistic):               0.00
Time:                        12:43:18   Log-Likelihood:                -78286.
No. Observations:               59400   AIC:                         1.566e+05
Df Residuals:                   59391   BIC:                         1.567e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 0.0689      0.031      2.223      0.026         0.008     0.130
extraction_type_class     0.0703      0.002     28.934      0.000         0.066     0.075
payment_type              0.0044      0.002      2.102      0.036         0.000     0.009
quality_group             0.1315      0.005     26.456      0.000         0.122     0.141
quantity_group           -0.1609      0.005    -34.481      0.000        -0.170    -0.152
waterpoint_type_group     0.1339      0.003     45.544      0.000         0.128     0.140
source_class              0.1596      0.009     17.440      0.000         0.142     0.178
permit                   -0.0045      0.008     -0.551      0.582        -0.021     0.012
water_quality             0.0482      0.004     11.444      0.000         0.040     0.056
==============================================================================
Omnibus:                     1059.760   Durbin-Watson:                   1.993
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             6512.760
Skew:                           0.336   Prob(JB):                         0.00
Kurtosis:                       1.524   Cond. No.                         63.7
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [13]:
scat1 = sns.regplot(x="status_group", y="quality_group", order=2, scatter=True, data=sub1)
plt.xlabel('Water pump status')
plt.ylabel ('quality of the water')
plt.title ('Scatterplot for the association between water pump status and water quality')
#print scat1
plt.show()



In [ ]: