In [54]:
import pandas as pd
import numpy as np
In [55]:
# Read the student records CSV into a DataFrame.
csv_path = "student-alcohol-consumption/student-mat.csv"
data = pd.read_csv(csv_path)
In [60]:
# Peek at the first rows of `data`.
data.head()
Out[60]:
In [57]:
# Pull the two alcohol-consumption columns out as a 2-column array
# (column 0 = Dalc, column 1 = Walc) before they are dropped from `data`.
alcohol_columns = ["Dalc", "Walc"]
y = data[alcohol_columns].to_numpy()
In [58]:
# Combine both alcohol columns into one score, counting Dalc twice as much as Walc.
dalc = y[:, 0]
walc = y[:, 1]
labels = 2 * dalc + walc
In [59]:
# Dalc/Walc have already been copied into `y`; remove them from the feature table.
data.drop(columns=["Dalc", "Walc"], inplace=True)
In [62]:
# Remove the columns that are not kept as model inputs.
unused_columns = [
    "school",
    "age",
    "reason",
    "guardian",
    "schoolsup",
    "famsup",
    "nursery",
    "higher",
    "internet",
    "romantic",
    "freetime",
    "health",
    "absences",
]
data.drop(columns=unused_columns, inplace=True)
In [63]:
# Display the frame after the column drops (the bare expression renders as the cell output).
data
Out[63]:
In [64]:
# Extract the three grade columns as an array (columns in G1, G2, G3 order).
grade_columns = ["G1", "G2", "G3"]
grades = data[grade_columns].to_numpy()
In [65]:
# Per-row total of the three grade columns.
per = grades.sum(axis=1)
In [67]:
# Rescale the three-grade total by 5/3.
# The score is recomputed from `grades` instead of rescaling `per` in place, so
# the cell is idempotent: re-running it no longer multiplies the score by 5/3 a
# second time.
# NOTE(review): 5/3 == 100/60, so this presumably maps a 0-60 total onto a
# 0-100 percentage -- confirm that each grade is on a 0-20 scale.
per = (grades[:, 0] + grades[:, 1] + grades[:, 2]) * 5 / 3
In [118]:
# Check the length of the per-row score vector.
per.shape
Out[118]:
In [69]:
# The grades have already been captured in `grades`; remove them from the feature table.
data.drop(columns=["G1", "G2", "G3"], inplace=True)
In [79]:
# Show the first ten rows of the remaining columns.
data.head(10)
Out[79]:
In [77]:
# Count how often each distinct 'address' value occurs.
data['address'].value_counts()
Out[77]:
In [ ]:
In [78]:
# Integer-encode the 'address' column: 'U' becomes 0 and 'R' becomes 1.
address_codes = {"U": 0, "R": 1}
data.replace(to_replace={"address": address_codes}, inplace=True)
In [ ]:
In [80]:
# Count how often each distinct 'famsize' value occurs.
data['famsize'].value_counts()
Out[80]:
In [81]:
# Integer-encode the 'famsize' column: 'LE3' becomes 0 and 'GT3' becomes 1.
famsize_codes = {"LE3": 0, "GT3": 1}
data.replace(to_replace={"famsize": famsize_codes}, inplace=True)
In [82]:
# Show the first six rows to eyeball the encoded columns.
data.head(6)
Out[82]:
In [83]:
# Integer-encode the 'Pstatus' column: 'A' becomes 0 and 'T' becomes 1.
pstatus_codes = {"A": 0, "T": 1}
data.replace(to_replace={"Pstatus": pstatus_codes}, inplace=True)
In [84]:
# Show the first six rows to eyeball the encoded columns.
data.head(6)
Out[84]:
In [87]:
# Count how often each distinct 'Mjob' value occurs.
data['Mjob'].value_counts()
Out[87]:
In [93]:
# Integer-encode the 'Mjob' categories. The codes are plain identifiers:
# their numeric order is just the order in which they are listed here.
mjob_codes = {
    "teacher": 0,
    "health": 1,
    "services": 2,
    "at_home": 3,
    "other": 4,
}
data.replace(to_replace={"Mjob": mjob_codes}, inplace=True)
In [94]:
# Show the first six rows to eyeball the encoded columns.
data.head(6)
Out[94]:
In [90]:
# Count how often each distinct 'Fjob' value occurs.
data['Fjob'].value_counts()
Out[90]:
In [95]:
# Integer-encode the 'Fjob' categories with the same code table used for 'Mjob'.
fjob_codes = {
    "teacher": 0,
    "health": 1,
    "services": 2,
    "at_home": 3,
    "other": 4,
}
data.replace(to_replace={"Fjob": fjob_codes}, inplace=True)
In [96]:
# Show the first six rows to eyeball the encoded columns.
data.head(6)
Out[96]:
In [98]:
# Count how often each distinct 'paid' value occurs.
data['paid'].value_counts()
Out[98]:
In [102]:
# Integer-encode the 'paid' column: 'no' becomes 0 and 'yes' becomes 1.
paid_codes = {"no": 0, "yes": 1}
data.replace(to_replace={"paid": paid_codes}, inplace=True)
In [103]:
# Integer-encode the 'activities' column: 'no' becomes 0 and 'yes' becomes 1.
activities_codes = {"no": 0, "yes": 1}
data.replace(to_replace={"activities": activities_codes}, inplace=True)
In [106]:
# (rows, columns) of the feature table after dropping and encoding.
data.shape
Out[106]:
In [105]:
# Convert the whole feature table to a NumPy array. Despite its name, `test`
# holds every row's features, not a held-out split.
# NOTE(review): no visible cell encodes a 'sex' column and execution counts
# 85-86 are missing. If that column is present and still holds strings, the
# later float assignment into `train` will fail on a fresh kernel -- confirm.
test = data.to_numpy()
In [109]:
# Shape of the feature array built from `data`.
test.shape
Out[109]:
In [121]:
# Allocate the model matrix: one row per row of `test`, and one column per
# column of `test` plus a trailing column for the grade score.
# The size is derived from `test` instead of the hard-coded (395, 16), so the
# cell still works if the row count or the set of retained columns changes.
# Despite its name, `train` holds all rows; the actual split happens later.
train = np.zeros((test.shape[0], test.shape[1] + 1))
In [122]:
# Copy the feature array into the leading columns of `train`. The column count
# comes from `test` itself rather than the literal 15, so it stays correct if
# the number of retained feature columns changes.
train[:, :test.shape[1]] = test
In [123]:
# Store the grade score in the last column of `train`. Indexing with -1 instead
# of the literal 15 keeps this correct whatever the number of feature columns.
train[:, -1] = per
In [130]:
# Shape of the assembled model matrix (features plus the grade-score column).
train.shape
Out[130]:
In [132]:
# Collapse the drinking score 2*Dalc + Walc into a coarse class by
# floor-dividing by 10.
# The score is rebuilt from `y` instead of reading `labels` back, so the cell
# is idempotent: re-running it no longer floor-divides an already reduced
# vector (which would turn every label into 0).
# NOTE(review): assuming Dalc and Walc are on a 1-5 scale the score is 3..15,
# so this yields 0 (score < 10) or 1 (score >= 10) -- confirm the value range.
labels = (2 * y[:, 0] + y[:, 1]) // 10
In [134]:
# Check the length of the label vector.
labels.shape
Out[134]:
In [156]:
from sklearn.ensemble import RandomForestClassifier
In [157]:
# Random forest with 60 trees. `random_state` is pinned so that, for a given
# training set, re-running the notebook grows the same forest instead of a
# different random one each time.
clf = RandomForestClassifier(n_estimators=60, random_state=42)
In [158]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The seed is fixed so the same rows
# land in each split on every run; without it the split changes on every
# execution of this cell.
X_train, X_test, y_train, y_test = train_test_split(
    train, labels, test_size=0.2, random_state=42
)
In [159]:
# Shape of the held-out feature matrix.
X_test.shape
Out[159]:
In [160]:
# Length of the held-out label vector.
y_test.shape
Out[160]:
In [161]:
# Fit the random forest on the training split.
clf.fit(X_train, y_train)
Out[161]:
In [162]:
# Predict labels for the held-out rows.
y_A = clf.predict(X_test)
In [171]:
# Predicted labels for the held-out rows.
y_A
Out[171]:
In [172]:
# True labels for the same held-out rows, for side-by-side comparison with `y_A`.
y_test
Out[172]:
In [177]:
# Accuracy on the held-out rows, as a percentage: correct predictions / total.
n_correct = np.count_nonzero(y_A == y_test)
100 * float(n_correct) / len(y_test)
Out[177]: