This dataset contains ranking information of primary schools according to performance in primary school leaving certificate examinations.
In [65]:
%matplotlib inline
from collections import defaultdict
import json
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl
import seaborn as sns
# Global seaborn look for the figures in this notebook: "talk" plotting
# context and the "white" axes style. The context is switched to "notebook"
# in a later plotting cell.
sns.set_context("talk")
sns.set_style("white")
In [3]:
# Read the school performance/location CSV into the working frame, then show
# its (rows, columns) shape as the cell output.
csv_path = 'PrimarySchoolsPerfomanceAndLocation-2014.csv'
df = pd.read_csv(csv_path)
df.shape
Out[3]:
In [8]:
# List the column labels of the loaded frame.
df.columns
Out[8]:
In [9]:
# Columns used in the rest of this analysis; everything else is discarded.
col = ['NAME', 'REGION', 'DISTRICT', 'OWNERSHIP', 'PASS_RATE', 'AVG_MARK',
       'CHANGE_PREVIOUS_YEAR', 'RANK']
# Drop all unwanted columns in a single call instead of rebuilding the frame
# once per column inside a loop. The surviving columns keep their original
# order, and an empty `unwanted` list simply leaves the frame unchanged.
unwanted = [c for c in df.columns if c not in col]
df = df.drop(unwanted, axis=1)
In [10]:
# Show the frame's (rows, columns) shape again, after the column filter above.
df.shape
Out[10]:
In [11]:
# Distinct values present in the OWNERSHIP column (missing values included).
df['OWNERSHIP'].unique()
Out[11]:
In [12]:
# Preview the first ten rows of the filtered frame.
df.head(10)
Out[12]:
In [13]:
# Total number of rows (schools) in the frame.
len(df)
Out[13]:
In [14]:
# Print the distinct OWNERSHIP values. The parenthesised single-argument form
# behaves the same under the Python 2 print statement and the Python 3 print
# function, so the cell no longer fails to parse on Python 3.
print(df.OWNERSHIP.unique())
How many schools have no ownership type recorded (i.e. the field is empty)?
In [15]:
# Number of schools whose OWNERSHIP value is missing: count the True entries
# of the null mask rather than measuring a filtered copy of the frame.
int(df.OWNERSHIP.isnull().sum())
Out[15]:
A sample of schools whose ownership is empty:
In [16]:
# First ten schools with no ownership recorded.
ownership_missing = df.OWNERSHIP.isnull()
df[ownership_missing].head(10)
Out[16]:
In [17]:
# First ten schools that do have an ownership value.
ownership_known = df.OWNERSHIP.notnull()
df[ownership_known].head(10)
Out[17]:
The schools that do have an ownership type recorded number 13033:
In [18]:
# Number of schools with an ownership value present: count the True entries
# of the not-null mask.
int(df.OWNERSHIP.notnull().sum())
Out[18]:
Let's plot a pie chart to visualize the data.
In [19]:
# Tally schools per ownership category. Series.sum() on a boolean mask counts
# the True entries in vectorised form; the built-in sum() used previously
# iterated the Series one element at a time in Python. int() keeps the
# results plain Python integers, as before.
government_schools = int((df.OWNERSHIP == 'GOVERNMENT').sum())         # government schools
nongovernment_schools = int((df.OWNERSHIP == 'NON GOVERNMENT').sum())  # non-government schools
unknown = int(df.OWNERSHIP.isnull().sum())                             # schools with no ownership recorded
schl = df.shape[0]                                                     # total number of schools
In [20]:
# Pie chart of the share of schools in each ownership category.
# Wedge labels ("Govenment" was a typo in the rendered chart).
Labels = ['Government', 'Non Government', 'Unknown']
# Each entry is a fraction of all schools (0-1), not a percentage; the
# autopct format below is what renders it as a percentage on the chart.
fractions = [float(government_schools) / schl,
             float(nongovernment_schools) / schl,
             float(unknown) / schl]
colors = ['yellowgreen', 'gold', 'lightskyblue']  # one colour per wedge
explode = (0, 0, 0)  # zero offset for every wedge: no slice is pulled out
plt.pie(fractions, explode=explode, labels=Labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
# Set aspect ratio to be equal so that pie is drawn as a circle.
plt.axis('equal')
plt.show()
Let's look at a summary of the pass rate, the average mark, and the change compared to the previous year.
In [22]:
# Summary statistics for the three numeric performance columns.
summary_columns = ['PASS_RATE', 'AVG_MARK', 'CHANGE_PREVIOUS_YEAR']
df[summary_columns].describe()
Out[22]:
The average pass rate was 55.93%. The maximum pass rate was 100% (all students passed the examination), while the lowest was 0% (nobody passed).
The average mark was 108.7; the maximum average mark per school was 234.7 and the minimum was 46.62.
NB: These marks are totals over 5 subjects, hence out of a total of 250 marks.
In [41]:
# Rows for schools whose pass rate is exactly 100.
df.loc[df.PASS_RATE == 100]
Out[41]:
In [42]:
# Count the schools with a pass rate of exactly 100 using the vectorised
# Series.sum() on the boolean mask, then report it. The parenthesised print
# works under both the Python 2 print statement and the Python 3 function.
n_perfect = int((df.PASS_RATE == 100).sum())
print("There were %s schools which had 100 pass rate" % n_perfect)
In [50]:
# Summary statistics for government schools whose pass rate is exactly 100.
all_passed = df.PASS_RATE == 100
government_owned = df.OWNERSHIP == "GOVERNMENT"
df[all_passed & government_owned].describe()
Out[50]:
In [52]:
# Announce and show the first ten government schools with a pass rate of
# exactly 100. The parenthesised single-argument print is valid under both
# Python 2 and Python 3.
print("Sample of Government school which have 100% passing rate")
all_passed = df.PASS_RATE == 100
government_owned = df.OWNERSHIP == "GOVERNMENT"
df[all_passed & government_owned].head(10)
Out[52]:
In [56]:
# Per region, count the schools whose pass rate is exactly 100.
perfect_pass = df[df.PASS_RATE == 100]
grouped = perfect_pass.groupby(df['REGION'])
# count() tallies the non-null entries of every column per region; the NAME
# column of that table is shown as the per-region number of schools.
passed_per_region = grouped.count()
passed_per_region['NAME']
Out[56]:
Dar es Salaam leads with 121 schools that achieved a 100% pass rate, and Katavi is last with 7.
In [63]:
# Number of schools (non-null NAME entries) in each region. Selecting NAME
# before counting avoids counting every other column only to discard it.
df.groupby('REGION')['NAME'].count()
Out[63]:
MBEYA has the largest number of schools overall, with 1046.
In [76]:
# Switch seaborn to the "notebook" plotting context from here on.
sns.set_context("notebook")
# Mean pass rate across all schools; it is marked on the histogram below.
mean_pass = df.PASS_RATE.mean()
# Print mean and median separated by a space. An explicit format string keeps
# the output identical under Python 2 and Python 3.
print("%s %s" % (mean_pass, df.PASS_RATE.median()))
with sns.axes_style("whitegrid"):
    df.PASS_RATE.hist(bins=30, alpha=0.4)
    # Vertical line at the mean, spanning the lower 75% of the axes height.
    plt.axvline(mean_pass, 0, 0.75, color='r', label='Mean')
    plt.xlabel("Pass Rate")
    plt.ylabel("Counts")
    plt.title("Passing Rate Histogram")  # was misspelled "Hisyogram"
    plt.legend()  # shows the 'Mean' entry from the axvline label
sns.despine()
In [81]:
# Histogram of each school's change in pass rate relative to the previous year.
with sns.axes_style("whitegrid"):
    df.CHANGE_PREVIOUS_YEAR.hist(bins=15, alpha=0.6, color='r')
    plt.xlabel("change of passing rate compared to 2013")
    plt.ylabel("school number")
    plt.title("Change of passing rate Histogram")
# No plt.legend() here: nothing in this figure carries a label, so the call
# only produced a "no labelled artists" warning and drew no legend.
In [84]:
# Histogram of the average mark per school.
with sns.axes_style("whitegrid"):
    df.AVG_MARK.hist(bins=40, alpha=0.6, color='g')
    plt.xlabel("Average mark per school")
    plt.ylabel("school number")
    plt.title("Average Marks Histogram")
# No plt.legend() here: nothing in this figure carries a label, so the call
# only produced a "no labelled artists" warning and drew no legend.