In [54]:
import pandas as pd
import numpy as np

In [55]:
data = pd.read_csv("student-alcohol-consumption/student-mat.csv")

In [60]:
data.head()


Out[60]:
school sex age address famsize Pstatus Medu Fedu Mjob Fjob ... internet romantic famrel freetime goout health absences G1 G2 G3
0 GP F 18 U GT3 A 4 4 at_home teacher ... no no 4 3 4 3 6 5 6 6
1 GP F 17 U GT3 T 1 1 at_home other ... yes no 5 3 3 3 4 5 5 6
2 GP F 15 U LE3 T 1 1 at_home other ... yes no 4 3 2 3 10 7 8 10
3 GP F 15 U GT3 T 4 2 health services ... yes yes 3 2 2 5 2 15 14 15
4 GP F 16 U GT3 T 3 3 other other ... no no 4 3 2 5 4 6 10 10

5 rows × 31 columns


In [57]:
y = np.array(data[["Dalc","Walc"]])

In [58]:
labels = 2*y[:,0] + y[:,1]

In [59]:
data.drop(["Dalc","Walc"], inplace=True, axis = 1)

In [62]:
data.drop(["school", "age", "reason","guardian", "schoolsup", "famsup", "nursery", "higher","internet", "romantic", "freetime","health", "absences"], inplace = True, axis = 1)

In [63]:
data


Out[63]:
sex address famsize Pstatus Medu Fedu Mjob Fjob traveltime studytime failures paid activities famrel goout G1 G2 G3
0 F U GT3 A 4 4 at_home teacher 2 2 0 no no 4 4 5 6 6
1 F U GT3 T 1 1 at_home other 1 2 0 no no 5 3 5 5 6
2 F U LE3 T 1 1 at_home other 1 2 3 yes no 4 2 7 8 10
3 F U GT3 T 4 2 health services 1 3 0 yes yes 3 2 15 14 15
4 F U GT3 T 3 3 other other 1 2 0 yes no 4 2 6 10 10
5 M U LE3 T 4 3 services other 1 2 0 yes yes 5 2 15 15 15
6 M U LE3 T 2 2 other other 1 2 0 no no 4 4 12 12 11
7 F U GT3 A 4 4 other teacher 2 2 0 no no 4 4 6 5 6
8 M U LE3 A 3 2 services other 1 2 0 yes no 4 2 16 18 19
9 M U GT3 T 3 4 other other 1 2 0 yes yes 5 1 14 15 15
10 F U GT3 T 4 4 teacher health 1 2 0 yes no 3 3 10 8 9
11 F U GT3 T 2 1 services other 3 3 0 no yes 5 2 10 12 12
12 M U LE3 T 4 4 health services 1 1 0 yes yes 4 3 14 14 14
13 M U GT3 T 4 3 teacher other 2 2 0 yes no 5 3 10 10 11
14 M U GT3 A 2 2 other other 1 3 0 no no 4 2 14 16 16
15 F U GT3 T 4 4 health other 1 1 0 no no 4 4 14 14 14
16 F U GT3 T 4 4 services services 1 3 0 yes yes 3 3 13 14 14
17 F U GT3 T 3 3 other other 3 2 0 no yes 5 2 8 10 10
18 M U GT3 T 3 2 services services 1 1 3 no yes 5 5 6 5 5
19 M U LE3 T 4 3 health other 1 1 0 yes yes 3 3 8 10 10
20 M U GT3 T 4 3 teacher other 1 2 0 no no 4 1 13 14 15
21 M U GT3 T 4 4 health health 1 1 0 yes no 5 2 12 15 15
22 M U LE3 T 4 2 teacher other 1 2 0 no yes 4 1 15 15 16
23 M U LE3 T 2 2 other other 2 2 0 no yes 5 4 13 13 12
24 F R GT3 T 2 4 services health 1 3 0 yes yes 4 2 10 9 8
25 F U GT3 T 2 2 services services 1 1 2 yes no 1 2 6 9 8
26 M U GT3 T 2 2 other other 1 1 0 yes no 4 2 12 12 11
27 M U GT3 T 4 2 health services 1 1 0 yes no 2 4 15 16 15
28 M U LE3 A 3 4 services other 1 2 0 no yes 5 3 11 11 11
29 M U GT3 T 4 4 teacher teacher 1 2 0 yes yes 4 5 10 12 11
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
365 M R GT3 T 1 3 at_home other 2 2 0 yes no 3 4 10 10 10
366 M U LE3 T 4 4 teacher services 2 3 0 yes no 4 2 13 13 13
367 F R GT3 T 1 1 other services 3 1 1 yes no 5 1 7 6 0
368 F U GT3 T 2 3 at_home services 2 1 0 yes no 5 3 11 10 10
369 F R GT3 T 4 4 other teacher 3 2 0 yes no 3 2 14 12 11
370 F U LE3 T 3 2 services services 2 2 2 no yes 3 2 7 7 9
371 M R LE3 T 1 2 at_home services 3 1 0 yes yes 4 3 14 12 12
372 F U GT3 T 2 2 other at_home 1 3 0 no yes 3 3 13 11 11
373 F R GT3 T 1 2 other other 1 1 0 no yes 3 5 6 5 5
374 F R LE3 T 4 4 other other 2 3 0 no no 5 4 19 18 19
375 F R GT3 T 1 1 other other 4 3 0 no no 4 2 8 8 10
376 F U GT3 T 4 2 health other 2 3 2 yes no 5 3 15 14 15
377 F R LE3 T 4 4 teacher services 1 2 0 yes yes 5 3 8 9 10
378 F U GT3 T 3 3 other other 1 2 0 yes no 4 3 15 15 15
379 F R GT3 T 3 1 at_home other 1 2 0 yes yes 4 4 10 10 10
380 M U GT3 T 4 4 teacher teacher 1 2 0 yes yes 3 4 15 14 14
381 M R GT3 T 2 1 other other 2 1 0 no yes 4 3 7 6 7
382 M U GT3 T 2 3 other services 2 2 0 no yes 4 3 11 11 10
383 M R GT3 T 1 1 other services 2 1 1 no no 4 2 6 5 0
384 M R GT3 T 4 2 other other 2 1 1 yes no 5 3 6 5 5
385 F R GT3 T 2 2 at_home other 2 3 0 yes no 5 3 10 9 10
386 F R GT3 T 4 4 teacher at_home 3 1 0 yes yes 4 3 6 5 6
387 F R GT3 T 2 3 services other 1 3 1 no yes 5 2 7 5 0
388 F U LE3 T 3 1 teacher services 1 2 0 yes no 4 4 7 9 8
389 F U GT3 T 1 1 other other 2 2 1 no yes 1 1 6 5 0
390 M U LE3 A 2 2 services services 1 2 2 yes no 5 4 9 9 9
391 M U LE3 T 3 1 services services 2 1 0 no no 2 5 14 16 16
392 M R GT3 T 1 1 other other 1 1 3 no no 5 3 10 8 7
393 M R LE3 T 3 2 services other 3 1 0 no no 4 1 11 12 10
394 M U LE3 T 1 1 other at_home 1 1 0 no no 3 3 8 9 9

395 rows × 18 columns


In [64]:
grades = np.array(data[["G1","G2","G3"]])

In [65]:
per = grades[:,0] + grades[:,1] + grades[:, 2]

In [67]:
per = per*5/3

In [118]:
per.shape


Out[118]:
(395,)

In [69]:
data.drop(["G1", "G2", "G3"], inplace = True, axis = 1)

In [79]:
data.head(10)


Out[79]:
sex address famsize Pstatus Medu Fedu Mjob Fjob traveltime studytime failures paid activities famrel goout
0 0 0 GT3 A 4 4 at_home teacher 2 2 0 no no 4 4
1 0 0 GT3 T 1 1 at_home other 1 2 0 no no 5 3
2 0 0 LE3 T 1 1 at_home other 1 2 3 yes no 4 2
3 0 0 GT3 T 4 2 health services 1 3 0 yes yes 3 2
4 0 0 GT3 T 3 3 other other 1 2 0 yes no 4 2
5 1 0 LE3 T 4 3 services other 1 2 0 yes yes 5 2
6 1 0 LE3 T 2 2 other other 1 2 0 no no 4 4
7 0 0 GT3 A 4 4 other teacher 2 2 0 no no 4 4
8 1 0 LE3 A 3 2 services other 1 2 0 yes no 4 2
9 1 0 GT3 T 3 4 other other 1 2 0 yes yes 5 1

In [77]:
data['address'].value_counts()


Out[77]:
U    307
R     88
Name: address, dtype: int64

In [ ]:


In [78]:
di = { 'U' : 0, 'R' : 1}
data.replace({'address':di},inplace=True)

In [ ]:


In [80]:
data['famsize'].value_counts()


Out[80]:
GT3    281
LE3    114
Name: famsize, dtype: int64

In [81]:
di = {'LE3' : 0,'GT3' : 1}
data.replace({'famsize':di},inplace = True)

In [82]:
data.head(6)


Out[82]:
sex address famsize Pstatus Medu Fedu Mjob Fjob traveltime studytime failures paid activities famrel goout
0 0 0 1 A 4 4 at_home teacher 2 2 0 no no 4 4
1 0 0 1 T 1 1 at_home other 1 2 0 no no 5 3
2 0 0 0 T 1 1 at_home other 1 2 3 yes no 4 2
3 0 0 1 T 4 2 health services 1 3 0 yes yes 3 2
4 0 0 1 T 3 3 other other 1 2 0 yes no 4 2
5 1 0 0 T 4 3 services other 1 2 0 yes yes 5 2

In [83]:
di = { 'A' : 0, 'T' : 1}
data.replace({'Pstatus':di},inplace=True)

In [84]:
data.head(6)


Out[84]:
sex address famsize Pstatus Medu Fedu Mjob Fjob traveltime studytime failures paid activities famrel goout
0 0 0 1 0 4 4 at_home teacher 2 2 0 no no 4 4
1 0 0 1 1 1 1 at_home other 1 2 0 no no 5 3
2 0 0 0 1 1 1 at_home other 1 2 3 yes no 4 2
3 0 0 1 1 4 2 health services 1 3 0 yes yes 3 2
4 0 0 1 1 3 3 other other 1 2 0 yes no 4 2
5 1 0 0 1 4 3 services other 1 2 0 yes yes 5 2

In [87]:
data['Mjob'].value_counts()


Out[87]:
other       141
services    103
at_home      59
teacher      58
health       34
Name: Mjob, dtype: int64

In [93]:
di = { 'teacher' : 0, 'health' : 1, 'services' : 2, 'at_home' : 3, 'other' : 4}
data.replace({'Mjob':di},inplace=True)

In [94]:
data.head(6)


Out[94]:
sex address famsize Pstatus Medu Fedu Mjob Fjob traveltime studytime failures paid activities famrel goout
0 0 0 1 0 4 4 3 0 2 2 0 no no 4 4
1 0 0 1 1 1 1 3 4 1 2 0 no no 5 3
2 0 0 0 1 1 1 3 4 1 2 3 yes no 4 2
3 0 0 1 1 4 2 1 services 1 3 0 yes yes 3 2
4 0 0 1 1 3 3 4 4 1 2 0 yes no 4 2
5 1 0 0 1 4 3 2 4 1 2 0 yes yes 5 2

In [90]:
data['Fjob'].value_counts()


Out[90]:
other       217
services    111
teacher      29
at_home      20
health       18
Name: Fjob, dtype: int64

In [95]:
di = { 'teacher' : 0, 'health' : 1, 'services' : 2, 'at_home' : 3, 'other' : 4}
data.replace({'Fjob':di},inplace=True)

In [96]:
data.head(6)


Out[96]:
sex address famsize Pstatus Medu Fedu Mjob Fjob traveltime studytime failures paid activities famrel goout
0 0 0 1 0 4 4 3 0 2 2 0 no no 4 4
1 0 0 1 1 1 1 3 4 1 2 0 no no 5 3
2 0 0 0 1 1 1 3 4 1 2 3 yes no 4 2
3 0 0 1 1 4 2 1 2 1 3 0 yes yes 3 2
4 0 0 1 1 3 3 4 4 1 2 0 yes no 4 2
5 1 0 0 1 4 3 2 4 1 2 0 yes yes 5 2

In [98]:
data['paid'].value_counts()


Out[98]:
no     214
yes    181
Name: paid, dtype: int64

In [102]:
di = { 'no' : 0, 'yes' : 1}
data.replace({'paid':di},inplace=True)

In [103]:
di = { 'no' : 0, 'yes' : 1}
data.replace({'activities':di},inplace=True)

In [106]:
data.shape


Out[106]:
(395, 15)

In [105]:
test = np.array(data)

In [109]:
test.shape


Out[109]:
(395, 15)

In [121]:
train = np.zeros((395,16))

In [122]:
train[:,:15] = test[:,:]

In [123]:
train[:,15] = per

In [130]:
train.shape


Out[130]:
(395, 16)

In [132]:
labels = labels // 10

In [134]:
labels.shape


Out[134]:
(395,)

In [156]:
from sklearn.ensemble import RandomForestClassifier

In [157]:
clf = RandomForestClassifier(n_estimators=60)

In [158]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, labels, test_size=0.2)

In [159]:
X_test.shape


Out[159]:
(79, 16)

In [160]:
y_test.shape


Out[160]:
(79,)

In [161]:
clf.fit(X_train, y_train)


Out[161]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=60, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [162]:
y_A = clf.predict(X_test)

In [171]:
y_A


Out[171]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [172]:
y_test


Out[172]:
array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [177]:
100*float((y_A==y_test).sum())/y_test.shape[0]


Out[177]:
92.40506329113924