In [3]:
# Load the relevant libraries
import pandas as pd  # data import and dissection
import seaborn as sns  # data visualization

# Seaborn theme: white grid background; color_codes=True remaps matplotlib's
# single-letter colour shorthands to the seaborn palette.
sns.set(style="whitegrid", color_codes=True)

# low_memory=False makes pandas parse the file in one pass instead of in chunks,
# which avoids mixed-dtype column inference (it uses more memory, not less).
data = pd.read_csv("data-taarifa.csv", low_memory=False)

# Two independent copies of the raw frame
sub1, sub2 = data.copy(), data.copy()
In [5]:
# Checking the data type of the variables
# (sub1 is still an unmodified copy of the frame read from the CSV at this point)
sub1.dtypes
Out[5]:
In [6]:
# Preview the first six rows of the working copy
sub1.head(n=6)
Out[6]:
In [7]:
# Summary statistics of sub1, using describe()'s default settings
sub1.describe()
Out[7]:
In [8]:
# Frequency of each extraction_type_class value (before label encoding)
sub1.extraction_type_class.value_counts()
Out[8]:
In [9]:
# Frequency of each payment_type value (before label encoding)
sub1.payment_type.value_counts()
Out[9]:
In [10]:
# Frequency of each quality_group value (before label encoding)
sub1.quality_group.value_counts()
Out[10]:
In [11]:
# Frequency of each quantity_group value (before label encoding)
sub1.quantity_group.value_counts()
Out[11]:
In [12]:
# Frequency of each waterpoint_type_group value (before label encoding)
sub1.waterpoint_type_group.value_counts()
Out[12]:
In [13]:
# Frequency of each water_quality value (before label encoding)
sub1.water_quality.value_counts()
Out[13]:
In [14]:
# Frequency of each source_type value (before label encoding)
sub1.source_type.value_counts()
Out[14]:
In [15]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns of sub1 to replace with integer codes
var_mod = ['extraction_type_class','payment_type','quality_group','quantity_group','waterpoint_type_group','water_quality','source_type']

# Keep one fitted encoder per column so every code -> label mapping stays
# recoverable (e.g. encoders['payment_type'].classes_). Refitting a single
# shared encoder would discard all mappings except the last column's.
encoders = {}
for col in var_mod:
    le = LabelEncoder()
    # Overwrites the column in sub1 only; data and sub2 keep the original values
    sub1[col] = le.fit_transform(sub1[col])
    encoders[col] = le

# Show the dtypes again to confirm the encoded columns are now integer-typed
sub1.dtypes
Out[15]:
In [17]:
%matplotlib inline
sub1['permit'].hist(bins=10)
Out[17]:
In [18]:
# Count table from the un-encoded copy sub2: rows = water_quality, columns = source_type
t1 = pd.crosstab(sub2['water_quality'], sub2['source_type'])
# kind='bar' with stacked=True draws one bar per water_quality level, segmented by
# source_type. kind='hist' would instead histogram the count values themselves,
# which does not show the category breakdown the title describes.
t1.plot(kind='bar', stacked=True, grid=False, legend=True, title="Water quality based on type of water source")
Out[18]:
In [60]:
# Count table from the un-encoded copy sub2: rows = source_type, columns = payment_type
t2 = pd.crosstab(sub2['source_type'], sub2['payment_type'])
# kind='bar' with stacked=True draws one bar per source_type, segmented by payment_type
# (kind='hist' would histogram the count values instead of showing the categories).
# The title names the two variables that are actually cross-tabulated here.
t2.plot(kind='bar', stacked=True, grid=False, legend=True, title="Source type and types of payment")
Out[60]:
In [20]:
%matplotlib inline
sns.violinplot(x=sub1.extraction_type_class, y=sub1.source_type)
Out[20]:
In [21]:
%matplotlib inline
sns.pointplot(x="extraction_type_class", y="water_quality", data=sub1)
Out[21]:
In [41]:
%matplotlib inline
sns.violinplot(x="waterpoint_type_group", y="source_type", hue="water_quality", data=sub1)
Out[41]:
At a certain point, the categorical scatterplot approach becomes limited in the information it can provide about the distribution of values within each category. There are several ways to summarize this information in ways that facilitate easy comparisons across the category levels.
The first is the familiar boxplot(). This kind of plot shows the three quartile values of the distribution along with extreme values. The “whiskers” extend to points that lie within 1.5 IQRs of the lower and upper quartile, and then observations that fall outside this range are displayed independently. Importantly, this means that each value in the boxplot corresponds to an actual observation in the data.
In [27]:
# Box plot of payment_type for each source_type (label-encoded columns of sub1)
sns.boxplot(
    data=sub1,
    x="source_type",
    y="payment_type",
)
Out[27]:
A different approach is a violinplot(), which combines a boxplot with the kernel density estimation procedure.
In [36]:
# Violin plot with scale="count", which scales each violin by its observation count.
# NOTE(review): newer seaborn releases appear to rename `scale` to
# `density_norm` — confirm against the installed seaborn version.
sns.violinplot(
    data=sub1,
    x="extraction_type_class",
    y="source_type",
    scale="count",
)
Out[36]:
In [39]:
# `split` is a boolean flag; pass True rather than the string "True"
# (the string only worked because any non-empty string is truthy).
# NOTE(review): no `hue` is given, and splitting is documented in terms of hue
# levels — confirm whether a two-level hue variable was intended here.
sns.violinplot(
    data=sub1,
    x="extraction_type_class",
    y="source_type",
    split=True,
)
Out[39]:
In [43]:
# Bar plot of source_type aggregated per extraction_type_class.
# NOTE(review): source_type holds LabelEncoder integer codes in sub1, so the
# bar height is a statistic over arbitrary codes — confirm this is intended.
sns.barplot(
    data=sub1,
    x="extraction_type_class",
    y="source_type",
)
Out[43]:
In [48]:
# Bar plot of quality_group aggregated per quantity_group
# (both are label-encoded integer columns of sub1).
sns.barplot(
    data=sub1,
    x="quantity_group",
    y="quality_group",
)
Out[48]:
A special case for the bar plot is when you want to show the number of observations in each category rather than computing a statistic for a second variable. This is similar to a histogram over a categorical, rather than quantitative, variable. In seaborn, it’s easy to do so with the countplot() function.
In [61]:
# Number of rows per quality_group. This reads the raw `data` frame rather than
# sub1, so the categories are the original, un-encoded values.
sns.countplot(
    data=data,
    x="quality_group",
)
Out[61]:
In [55]:
# Horizontal count plot: number of rows per payment_type code in sub1
# (payment_type is label-encoded there, so the categories are integer codes).
sns.countplot(
    data=sub1,
    y="payment_type",
)
Out[55]:
An alternative style for visualizing the same information is offered by the pointplot() function. This function also encodes the value of the estimate with height on the other axis, but rather than showing a full bar, it plots only the point estimate and confidence interval. Additionally, pointplot connects points from the same hue category. This makes it easy to see how the main relationship is changing as a function of a second variable.
In [56]:
# Point plot of waterpoint_type_group against extraction_type_class
# (both are label-encoded integer columns of sub1).
sns.pointplot(
    data=sub1,
    x="extraction_type_class",
    y="waterpoint_type_group",
)
Out[56]:
In [58]:
# Point plot of water_quality against extraction_type_class with a custom
# marker and line style. No `hue` variable is used, so there is a single series:
# pass one marker and one line style (the first entries of the former
# two-element lists) instead of per-hue-level lists.
sns.pointplot(
    data=sub1,
    x="extraction_type_class",
    y="water_quality",
    markers="^",
    linestyles="-",
)
Out[58]:
In [ ]: