In [1]:
import pandas as pd # for data import and dissection
import numpy as np # for data analysis
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
# Plot configuration for the whole notebook.
# Interactive mode off: figures are rendered when plt.show() is called.
plt.interactive(False)
# White-grid seaborn style; color_codes=True lets single-letter colour
# shorthands ("b", "g", "r", ...) refer to the seaborn palette.
sns.set(color_codes=True, style="whitegrid")
In [3]:
# Load the raw CSV. low_memory=False makes pandas parse the file in a single
# pass instead of in chunks, so each column gets one consistently inferred
# dtype (at the cost of higher memory use while parsing).
data = pd.read_csv(filepath_or_buffer="data-taarifa.csv", low_memory=False)
# Work on an independent copy so the frame as loaded stays available in `data`.
sub1 = data.copy(deep=True)
In [4]:
# Show every column name of the working frame as a plain Python list.
sub1.columns.tolist()
Out[4]:
In [5]:
# Normalise all column names to lower case so later cells can refer to them
# without worrying about the capitalisation used in the CSV header.
sub1.columns = sub1.columns.map(str.lower)
A quick peek at the dataset
In [6]:
# First five rows (head() defaults to n=5) of the frame before any cleaning.
sub1.head()
Out[6]:
In [7]:
## Fill the missing values of every column with that column's own most
## frequent value.
def _fill_with_most_frequent(column):
    """Return `column` with its NaNs replaced by its most frequent value.

    value_counts() ignores NaN, so a column that is entirely missing has no
    most frequent value; such a column is returned unchanged instead of
    raising IndexError on the empty index.
    """
    counts = column.value_counts()
    if counts.empty:
        return column
    # value_counts() sorts by descending frequency, so index[0] is the mode.
    return column.fillna(counts.index[0])


sub1 = sub1.apply(_fill_with_most_frequent)
In [8]:
from sklearn import preprocessing

# Categorical columns used by the regression models below. Each one is
# replaced in place by integer label codes, in this order.
categorical_columns = [
    "permit",
    "extraction_type_class",
    "payment_type",
    "quality_group",
    "quantity_group",
    "waterpoint_type_group",
    "water_quality",
    "source_class",
    "status_group",
]

# One fitted encoder per column, so every integer code can be mapped back to
# its original label (label_encoders[col].classes_ / .inverse_transform).
# Refitting a single shared encoder would keep only the last column's mapping.
label_encoders = {}
for column_name in categorical_columns:
    encoder = preprocessing.LabelEncoder()
    sub1[column_name] = encoder.fit_transform(sub1[column_name])
    label_encoders[column_name] = encoder

# Keep the original name bound as before: after this cell it refers to the
# encoder fitted last, i.e. the one for status_group.
le_enc = label_encoders["status_group"]
Another quick peek at the dataset. Notice that the variables of interest, such as 'permit', 'extraction_type_class', 'payment_type' and 'quality_group', have now been replaced by integer label codes (one integer per category), as required for the regression models below.
In [9]:
# First five rows (head() defaults to n=5) after label encoding.
sub1.head()
Out[9]:
status_group,extraction_type_class,payment_type,quality_group,quantity_group,waterpoint_type_group,source_class,permit,water_quality
In [10]:
# Simple OLS model: status_group (response) regressed on permit (predictor).
# Both columns hold the integer label codes assigned in the encoding cell.
# NOTE(review): status_group is a categorical code treated as numeric here;
# confirm that a linear model on the codes is the intended analysis.
print("OLS regression model for the association between water pump status group and permit")
reg1 = smf.ols('status_group~permit', data=sub1).fit()
print(reg1.summary())
Status group is the response or dependent variable and permit is the independent variable.
The number of observations shows how many observations had valid data and were therefore included in the analysis. The F-statistic is 66.57 and its p-value (Prob (F-statistic)) is very small, 3.44e-16, considerably less than our alpha level of 0.05, which tells us that we can reject the null hypothesis and conclude that permit is significantly associated with water pump status group.
The linear regression equation is $Y = b_0 + b_1 X$, where X is the explanatory (independent) variable and Y is the response (dependent) variable. -- (EQN 1)
Note: EQN 1 is important because it can also help us predict Y. Next, we look at the parameter estimates (the coefficients, or beta weights). The coefficient for permit is -0.0697 and the intercept is 0.8903.
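Then the best-fit line for permit is: status_group = 0.89 - 0.07*permit -- (EQN 2)
In the above example, let's say we are told that 80% of the water pumps in a country have valid water permits; can we then predict the status of the water pump devices?
Yes. permit is coded 0/1, so 80% of pumps having a permit corresponds to an average permit value of 0.8, and we plug this into EQN 2 with b0 = 0.89, b1 = -0.07 and permit = 0.8.
Then y(hat) = 0.89 - 0.07*0.8, so y(hat) = 0.83; that is, where 80% of the water pumps have valid permits, the predicted average status_group code is approximately 0.83. Note that y(hat) is on the scale of the integer status_group codes; it is not a percentage of functional pumps.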
Also note that the P>|t| value for permit is very small (it rounds to 0.0), and that the R-squared value is 0.001.
We now know that this model accounts for only about 0.1% (R-squared = 0.001) of the variability that we see in our response variable, status_group.
In [11]:
# Multiple regression of status_group on three further predictors, fitted to
# check for any loss of significance.
# NOTE(review): permit from the previous model is not part of this formula;
# confirm whether it was meant to be kept alongside the added variables.
three_predictor_formula = 'status_group~quantity_group+extraction_type_class+waterpoint_type_group'
print("OLS regresssion model for the association between status_group and other variables of interest")
reg1 = smf.ols(formula=three_predictor_formula, data=sub1).fit()
print(reg1.summary())
In [12]:
# Full model: status_group regressed on all eight encoded predictors.
print("OLS regresssion model for the association between water pump status group and all variables of interest")
all_predictors = [
    'extraction_type_class',
    'payment_type',
    'quality_group',
    'quantity_group',
    'waterpoint_type_group',
    'source_class',
    'permit',
    'water_quality',
]
# Joined with '+' and no spaces, giving the formula 'status_group~a+b+...'.
full_formula = 'status_group~' + '+'.join(all_predictors)
reg1 = smf.ols(formula=full_formula, data=sub1).fit()
print(reg1.summary())
In [13]:
# Scatter of the quality_group codes against the status_group codes with a
# second-order (quadratic) polynomial fit overlaid.
scat1 = sns.regplot(
    data=sub1,
    x="status_group",
    y="quality_group",
    order=2,
    scatter=True,
)
# regplot returns the Axes it drew on, so label that Axes directly.
scat1.set_xlabel('Water pump status')
scat1.set_ylabel('quality of the water')
scat1.set_title('Scatterplot for the association between water pump status and water quality')
plt.show()
In [ ]: