In [1]:
# loading dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
# suppress all warnings
import warnings
warnings.filterwarnings('ignore')
# Directory that holds the NAS use-case CSV files; edit this one constant if the data moves.
DATA_DIR = 'gramener-usecase-nas'
# Pupil-level records (questionnaire answers and subject scores).
marks = pd.read_csv(DATA_DIR + '/nas-pupil-marks.csv')
# Lookup table, queried later through its "Column" field to describe each questionnaire column.
labels = pd.read_csv(DATA_DIR + '/nas-labels.csv')
In [2]:
# Preview the first three pupil records to check that the CSV parsed as expected
marks.head(3)
Out[2]:
In [3]:
# Column names of the dataset (every column available in the marks frame)
marks.columns
Out[3]:
In [4]:
# The shape of the data as a (number of rows, number of columns) tuple
marks.shape
Out[4]:
In [5]:
# Split the columns into independent (explanatory) categories and performance scores.
# The order of `category` is kept as-is because later cells plot in this order.
category = [
    'State', 'District', 'Gender', 'Age', 'Category',
    'Same language', 'Siblings', 'Handicap',
    'Father edu', 'Mother edu',
    'Father occupation', 'Mother occupation', 'Below poverty',
    'Use calculator', 'Use computer', 'Use Internet', 'Use dictionary',
    'Read other books', '# Books', 'Distance',
    'Computer use', 'Library use', 'Like school', 'Subjects',
    'Give Lang HW', 'Give Math HW', 'Give Scie HW', 'Give SoSc HW',
    'Correct Lang HW', 'Correct Math HW', 'Correct Scie HW', 'Correct SocS HW',
    'Help in Study', 'Private tuition',
    'English is difficult', 'Read English', 'Dictionary to learn',
    'Answer English WB', 'Answer English aloud',
    'Maths is difficult', 'Solve Maths', 'Solve Maths in groups',
    'Draw geometry', 'Explain answers',
    'SocSci is difficult', 'Historical excursions', 'Participate in SocSci',
    'Small groups in SocSci', 'Express SocSci views',
    'Science is difficult', 'Observe experiments', 'Conduct experiments',
    'Solve science problems', 'Express science views',
    'Watch TV', 'Read magazine', 'Read a book', 'Play games', 'Help in household',
]
performance = ['Maths %', 'Reading %', 'Science %', 'Social %']

# Print the distinct values found in each category column
for col in category:
    print(col, ":", marks[col].unique())
We defined a performance metric, 'performance', as the average of ('Maths %', 'Reading %', 'Science %', 'Social %'). Feature selection based on SelectKBest was performed
to evaluate the relative importance of each feature in predicting performance. The top feature was found for each subject as well as for the average of all subjects.
Parameter | Best Feature |
---|---|
Overall Performance | 'Father edu' |
Maths | 'Help in household' |
Reading | 'Mother edu' |
Science | 'Father edu' |
Social | 'Help in household' |
This suggests that the education of the parents was the most decisive predictor of a student's performance. Among the top features, 'Father edu' has a distinctly higher score for overall performance, almost 33% higher than the second-ranked feature, indicating very high relevance.
In [6]:
# Add a "performance" column: the row-wise average of the subject scores listed in `performance`.
# DataFrame.mean skips NaN by default (skipna=True), so a pupil with some missing subjects is
# averaged over the available ones and a pupil with no scores at all gets NaN. This matches
# applying np.nanmean to each row, but is vectorised and avoids the per-row
# "Mean of empty slice" RuntimeWarning.
marks["performance"] = marks[performance].mean(axis=1)
marks["performance"].describe()
Out[6]:
In [7]:
# Number of null values per column, keeping only the columns that have at least one
null_counts = marks.isnull().sum()
null_counts[null_counts != 0]
Out[7]:
In [8]:
# For every category column, draw the distribution of "performance" per category value.
# The matching rows of the labels table are printed first so the codes can be read.
for col in category:
    marks.boxplot(column="performance", by=col, figsize=(8, 4))
    label_rows = labels[labels["Column"] == col]
    print(label_rows)
    # pandas adds an automatic "grouped by" super-title; blank it out
    plt.suptitle("")
    plt.show()
In [9]:
def clean_data(marks, feature_labels, y_col_name="performance"):
    """
    Split the data into a feature frame and a target series, ready for scoring.

    Rows whose target value is NaN are dropped, then the columns "State",
    "Subjects" and "Use computer" are integer-encoded with scikit-learn's
    LabelEncoder. Only those of the three columns that are actually part of
    `feature_labels` are encoded, so the function also works on a subset of
    the features. The input frame is not modified.

    Parameters
    ----------
    marks : pandas DataFrame
        tidy data with features and target variable
    feature_labels : list of str
        labels of the feature columns to keep
    y_col_name : str
        target variable label

    Returns
    -------
    X : features (pandas DataFrame)
    y : target variable (pandas Series)
    """
    # Remove all rows where the target is undefined, i.e. NaN
    marks_nona = marks.dropna(subset=[y_col_name])
    # Deep copy so that the encoding below never touches the caller's frame
    X = marks_nona[feature_labels].copy(deep=True)

    # Columns to label-encode, mapped to the placeholder that replaces missing
    # values before encoding (None means the column is encoded as-is).
    encoded_columns = {"State": None, "Subjects": None, "Use computer": "0"}
    present = [col for col in encoded_columns if col in X.columns]
    if present:
        # scikit-learn is only needed when there is something to encode
        from sklearn.preprocessing import LabelEncoder
        for col in present:
            values = X[col]
            placeholder = encoded_columns[col]
            if placeholder is not None:
                values = values.fillna(value=placeholder)
            X[col] = LabelEncoder().fit_transform(values)
    print("Shape of X\t:", X.shape)

    # target variable y
    y = marks_nona[y_col_name]
    print("Shape of y\t:", y.shape)
    return X, y
In [10]:
def best_features(X,y):
    """
    Score every feature against the target with "SelectKBest" and plot the scores.

    The features are standardised, then scored with `f_regression`; k="all"
    keeps every feature so that each one receives a score. The five best
    features are printed and a bar chart of all scores is shown, with a
    horizontal line at 60% of the highest score.

    Parameters
    ----------
    X: features with headers
    y: Target variable

    Returns
    -------
    sorted_scores: list of tuples in this format --> (feature label, score),
    sorted in descending order of score
    """
    from sklearn.feature_selection import SelectKBest,f_regression,mutual_info_regression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    # Scaling followed by univariate scoring, fitted on plain float arrays
    steps = [('scaler', StandardScaler()),
             ('selK', SelectKBest(k="all", score_func=f_regression))]
    pipe = Pipeline(steps=steps)
    pipe.fit(X.astype(float).values, y.astype(float).values)
    k_scores = pipe.named_steps["selK"].scores_

    # Pair each column with its score and rank from best to worst
    ranked = sorted(zip(X.columns, k_scores), key=lambda pair: pair[1], reverse=True)
    print ("Best 5 Features:\n", ranked[:5])

    # Bar chart of the scores in the original column order
    positions = np.arange(len(k_scores))
    plt.figure(figsize=(12,5))
    plt.bar(positions, k_scores, label=X.columns)
    plt.axhline(y=max(k_scores)*0.6, color='b', linewidth=.5)
    plt.xticks(positions, X.columns, rotation="vertical")
    plt.ylabel("K-Scores")
    plt.show()
    return ranked
def tuple_to_dict(scores_tuple):
    """
    Convert (key, value) pairs into a dictionary {key: value}.

    Parameters
    ----------
    scores_tuple : any iterable of 2-item pairs (list, tuple, zip, generator...)

    Returns
    -------
    dict mapping each key to its value, in the order the pairs were given.
    If a key occurs more than once, the last value wins.
    """
    # A comprehension iterates directly, so the input does not need len() or indexing
    return {key: value for key, value in scores_tuple}
In [11]:
# Targets to analyse: the overall average first, then each subject score
target = ["performance", "Maths %", "Reading %", "Science %", "Social %"]
# target label -> {feature label: score}
final_stat = {}
for t in target:
    print("======\tTarget Variable\t:",t,"======")
    X, y = clean_data(marks, category, y_col_name=t)
    sorted_scores = best_features(X, y)
    final_stat[t] = tuple_to_dict(sorted_scores)
    # the first entry of the ranking is the highest-scoring feature
    best_f = sorted_scores[0][0]

    # plotting the target against its best feature
    marks.boxplot(column=t, by=best_f, figsize=(6,5))
    plt.title(t)
    plt.tight_layout()
    plt.suptitle("")
    print(labels[labels["Column"] == best_f])
    plt.show()
In [31]:
# K-scores are normalized by maximum score (along each row) to show the strongest predictor for each row
import seaborn as sns
# One row per target, one column per feature
s = pd.DataFrame.from_dict(final_stat, orient='index')
# Divide every row by its own maximum so the strongest predictor of each target equals 1
row_max = s.max(axis=1)
final = s.div(row_max, axis=0)
# plotting heat map
plt.figure(figsize=(15,2))
f = sns.heatmap(
    final,
    vmin=0,
    vmax=1,
    cmap="YlGnBu",
    cbar=False,
    square=False,
    linewidths=1,
)
plt.suptitle("Coloring based on K-Score (Darker color indicates stronger influence)")
plt.show()
Across most states, girls tend to have a higher median performance than boys.
Notable exceptions to this rule are Jharkhand (JH) and Bihar (BH). Since these states have high gender inequality, this trend could be due to girls having less access to, or support for, educational resources than boys. There could be other underlying reasons, but confirming them would need additional supporting data.
Of the states where girls perform better, Kerala and Delhi stand out: in both states the median performance of girls is almost 4% higher than that of boys.
In [30]:
# Keep only pupils that have a performance value and a Gender code of 1 or 2.
# Filtering with isin([1, 2]) (rather than only dropping code 0) guarantees that exactly
# two gender groups remain, which the later Boy/Girl pivot relies on.
marks_gender = marks.dropna(subset=["performance"])
marks_gender = marks_gender[marks_gender["Gender"].isin([1, 2])]
marks_gender.boxplot(column="performance", by=["State","Gender"], figsize=(12, 3))
plt.xticks(rotation="vertical")
plt.title("")
plt.show()
In [14]:
# Median performance for every (State, Gender) pair.
# The "performance" column is selected BEFORE aggregating so that the median is not
# attempted on the string-valued columns of the frame (which raises on recent pandas).
G_perfomance = (
    marks_gender.groupby(["State", "Gender"])["performance"]
    .median()
    .reset_index()
)
# Pivot so each State is a row and each Gender code a column, then rename the two
# gender columns (in ascending code order) to "Boy" and "Girl"
G_perfomance = G_perfomance.pivot(index='State', columns='Gender', values='performance')
G_perfomance.columns = ["Boy","Girl"]
# "diff" is the difference in median performance, boys minus girls
G_perfomance["diff"] = G_perfomance["Boy"] - G_perfomance["Girl"]
G_perfomance.head()
Out[14]:
In [15]:
# Number of pupils (non-null STUID) for every (State, Gender) pair
G_count = (
    marks_gender.groupby(["State", "Gender"])["STUID"]
    .count()
    .reset_index()
)
# Pivot so each State is a row and each Gender code a column, then rename the two
# gender columns (in ascending code order) to "Boy" and "Girl"
G_count = G_count.pivot(index='State', columns='Gender', values='STUID')
G_count.columns = ["Boy","Girl"]
# Boys per girl in the sample (the previous Boy/Boy expression was always 1)
G_count["ratio"] = G_count["Boy"] / G_count["Girl"]
G_count.head()
Out[15]:
In [16]:
import matplotlib.colors as colors
from matplotlib.cm import bwr as cmap
import matplotlib.patches as mpatches
plt.figure(figsize=(12,5))
# Colour scale symmetric about zero: a diff of 0 maps to the middle of the bwr colormap,
# positive diffs (boys ahead) to its red half and negative diffs (girls ahead) to its
# blue half, so the bar colours agree with the legend below.
diff_limit = G_perfomance["diff"].abs().max()
c_normal = colors.Normalize(vmin=-diff_limit, vmax=diff_limit)
_COLORS = cmap(c_normal(G_perfomance["diff"]))
state_positions = np.arange(len(G_perfomance.index))
plt.bar(state_positions,
        height=G_perfomance["diff"], width=0.75, align="center",
        color=_COLORS)
plt.xticks(state_positions, list(G_perfomance.index))
plt.axhline(0, color='k', linewidth = 0.5)
plt.xlabel("State")
plt.ylabel("Median Performance Difference\n(Boys - Girls)")
# creating legend patches
red_patch = mpatches.Patch(color='red', label='Boys Perform Better')
blue_patch = mpatches.Patch(color='blue', label='Girls Perform Better')
plt.legend(handles=[red_patch, blue_patch], loc=4)
plt.show()
In [17]:
fig = plt.figure(figsize=(5, 14), dpi=75)
ax = fig.add_subplot(111)
ax2 = ax.twiny()
# one y position per state
y_pos = np.arange(len(G_perfomance.index))

# thin bars on the first axis: boys' medians to the right, girls' medians mirrored to the left
for medians in (G_perfomance["Boy"], -G_perfomance["Girl"]):
    ax.barh(y_pos, width=medians, height=0.1, color="k",
            align="center", alpha=0.25, linewidth=0)

# markers at the end of each thin bar, with marker size mapped to sample size
for medians, sample_sizes in ((G_perfomance["Boy"], G_count["Boy"]),
                              (-G_perfomance["Girl"], G_count["Girl"])):
    ax.scatter(x=medians, y=y_pos, s=sample_sizes * 0.1, color="k", alpha=0.5)

# First x-axis
ax.set_xlim(-60, 60)
# show absolute values so the mirrored (girls) side carries no "-" sign
ax.set_xticklabels([str(abs(tick)) for tick in ax.get_xticks()])
ax.set_xlabel("Median performance")
# empty scatters that only create legend entries for two reference marker sizes
for marker_area in [100, 500]:
    ax.scatter([], [], c='k', alpha=0.5, s=marker_area, label=str(marker_area * 10))

# Second x-axis: coloured bars for the boys-minus-girls difference
ax2.barh(y_pos, width=G_perfomance["diff"], height=0.75, align="center",
         color=_COLORS)
ax2.set_xlim(-10, 10)
ax2.grid(False)
ax2.set_xlabel("Median performance difference (Boys - Girls)")

# y-axis: extra head-room above the last state leaves space for the legends
ax.set_ylim(-1, len(G_perfomance.index) + 2)
plt.yticks(y_pos, list(G_perfomance.index))
plt.axvline(x=0, color='k', linewidth=0.75, ymax=0.94)

# legend
red_patch = mpatches.Patch(color='red', label='Boys Perform Better')
blue_patch = mpatches.Patch(color='blue', label='Girls Perform Better')
plt.legend(handles=[blue_patch, red_patch], loc=2, ncol=1, frameon=False)
ax.legend(loc=1, ncol=2, frameon=False)

# annotation patches: arrow-shaped boxes naming each side of the mirrored axis
tboy = ax.text(50, -2.2, "Boys", ha="center", va="center", rotation=0,
               size=10, color="w",
               bbox=dict(boxstyle="rarrow,pad=0.3", fc="grey", ec="b", lw=0))
tgirl = ax.text(-50, -2.2, "Girls", ha="center", va="center", rotation=0,
                size=10, color="w",
                bbox=dict(boxstyle="larrow,pad=0.3", fc="grey", ec="b", lw=0))
plt.show()
In [18]:
# States where boys have the higher median performance, largest gap first
G_perfomance.loc[G_perfomance["diff"] > 0].sort_values(by="diff", ascending=False)
Out[18]:
In [19]:
# States where girls have the higher median performance, largest gap first
G_perfomance.loc[G_perfomance["diff"] < 0].sort_values(by="diff", ascending=True)
Out[19]:
For this analysis, we considered the southern states to be "Andhra Pradesh", "Kerala", "Karnataka" and "Tamil Nadu"; all other states are referred to as "the rest of the country". The performance score for 'Science and Math' is defined as the mean of the 'Science' and 'Math' scores.
We found the central tendencies of the southern states
to be slightly lower than those of the rest of the country. It should be noted, however, that the number of samples from the southern states
is far smaller. It should also be understood that the enrollment rate of the southern states is usually higher than that of the rest of the country, which could be driving down the median values.
To identify whether all southern states follow this pattern, we split the data into the individual southern states. We found "Kerala" to be a notable exception to the trend of the southern states: it tends to have a higher median score than the other southern states, the rest of the country and the overall median of the country. Another exception is the distribution of marks from "Tamil Nadu", which has longer tails. "Tamil Nadu" followed the trend of the rest of the country, with longer tails at the highest end, but has a lower median score than all the others.
In [20]:
# Combined Math & Science score: row-wise mean of the two subjects. DataFrame.mean skips
# NaN by default, so a pupil with only one of the two scores keeps that score and a pupil
# with neither gets NaN. This matches applying np.nanmean to each row, but is vectorised
# and avoids the per-row "Mean of empty slice" RuntimeWarning.
marks['math_sci'] = marks[['Maths %','Science %']].mean(axis=1)
In [21]:
# Build "south": the State plus the Maths, Science and combined math_sci scores,
# restricted to pupils that have BOTH a Maths and a Science score.
score_cols = ['Maths %', 'Science %']
south = marks[['State'] + score_cols + ['math_sci']].dropna(subset=score_cols)
# sanity checks: remaining nulls per column, and the columns kept
print(south.isnull().sum())
print(south.columns)
In [22]:
# Separate the southern states from the rest of the country
SOUTH_STATES = ["KL", "AP","TN","KA"]
STATES = south["State"].unique().tolist()
REST = [state for state in STATES if state not in SOUTH_STATES]
# Boolean flag: True for rows that belong to a southern state
south["is_south"] = south["State"].isin(SOUTH_STATES)
# function to build the values of a new column "south_vs_rest"
def add_col_south_vs_rest(south,SOUTH_STATES):
    """
    Build the values for a "south_vs_rest" column.

    Parameters
    ----------
    south : pandas DataFrame with a "State" column
    SOUTH_STATES : collection of state codes that count as southern

    Returns
    -------
    lst : list with one entry per row of `south`, in row order; the entry is
    the row's own "State" value if that state is in SOUTH_STATES, else "Rest"
    """
    states = south["State"]
    # Vectorised replacement for a per-row iloc loop: keep the state where it is
    # southern, substitute "Rest" everywhere else
    return states.where(states.isin(SOUTH_STATES), "Rest").tolist()
# Label each row with its own state code if it is a southern state, otherwise "Rest"
south["south_vs_rest"] = add_col_south_vs_rest(south,SOUTH_STATES)
# Show the last two rows to confirm the new column is present
south.tail(2)
Out[22]:
In [23]:
# Summary statistics for the whole frame (printed), then split by southern / non-southern
print(south.describe())
south.groupby("is_south").describe()
Out[23]:
In [24]:
# Box plot of each score for southern states vs. the rest of the country, with the
# median over all rows drawn as a dashed reference line
for factor in ["math_sci","Maths %","Science %"]:
    box_ax = south.boxplot(column=factor, by="is_south")
    country_median = south[factor].median()
    plt.axhline(country_median, color='k', linewidth=0.5, linestyle="--",
                label="Median Score of Country")
    plt.ylabel(" %")
    # replace the False/True group labels with readable names
    box_ax.set_xticklabels(["rest of the country","southern states"])
    plt.suptitle("")
    plt.xlabel("")
    plt.legend()
    plt.show()
In [25]:
# Sample sizes: figure 1 counts rows by the southern flag, figure 2 by individual group
for fig_num, column in enumerate(["is_south", "south_vs_rest"], start=1):
    plt.figure(fig_num)
    sns.countplot(x=column, data=south)
plt.show()
In [26]:
# Violin plot of the combined Math & Science score: southern states vs. the rest of the country
plt.figure(figsize = (10,6))
# `fliersize` and `notch` are box-plot options, not violin-plot options, so they are
# not passed here (recent seaborn forwards unknown keywords to matplotlib, which rejects them).
f = sns.violinplot(x="is_south", y="math_sci", data=south, width=.3, linewidth=1)
plt.axhline(south["math_sci"].median(), color='r', linewidth=1, linestyle="--",
            label="Median Score of Country")
f.set_xticklabels(["rest of the country","southern states"])
# plotting the count data: one marker per group, drawn below the violins at y = -10,
# with marker area proportional to the number of rows in the group
group_sizes = south.groupby("is_south")["State"].count()
plt.scatter(x=np.arange(len(group_sizes)),
            y=[-10.0] * len(group_sizes),
            s=group_sizes.values / 25,
            c="k", linewidth=1, alpha=0.4, label="Sample size")
plt.suptitle("Math and Science Score")
plt.xlabel("")
plt.ylabel("%")
plt.legend(loc =8)
plt.show()
In [27]:
# Splitting out each southern state to compare it with the rest of the country;
# the dashed line is the median over all rows
for factor in ["math_sci","Maths %","Science %"]:
    box_ax = south.boxplot(column=factor, by="south_vs_rest")
    country_median = south[factor].median()
    plt.axhline(country_median, color='k', linewidth=0.5, linestyle="--",
                label="Median Score of Country")
    plt.ylabel(" %")
    plt.suptitle("")
    plt.xlabel("")
    plt.legend()
    plt.show()
In [28]:
# Rows per group, arranged in the same order as the violin plot that uses it
group_order = ["Rest","AP","KA","KL","TN"]
count = south.groupby("south_vs_rest")["State"].count().reindex(index=group_order)
In [29]:
# Math and science score for each of the southern states vs. the rest of the country
plt.figure(figsize = (10,6))
# `fliersize` and `notch` are box-plot options, not violin-plot options, so they are
# not passed here (recent seaborn forwards unknown keywords to matplotlib, which rejects them).
f = sns.violinplot(x="south_vs_rest", y="math_sci", data=south,
                   width=.3, linewidth=1,
                   order=["Rest","AP","KA","KL","TN"])
plt.axhline(south["math_sci"].median(), color='r', linewidth=1, linestyle="--",
            label="Median Score of Country")
f.set_xticklabels(["Rest of the\ncountry","Andhra\nPradesh","Karnataka","Kerala", "Tamil\nNadu"])
# one marker per group below the violins (y = -10), area proportional to `count`,
# which is already ordered like the violins
plt.scatter(x=[0,1,2,3,4], y=[-10,-10,-10,-10,-10], s=count/15,
            label="Sample size", alpha=0.3, color="k", linewidth=1,
            linestyle="solid")
plt.suptitle("Math and Science Score")
plt.xlabel("")
plt.ylabel("%")
plt.legend()
plt.show()