In [1]:
import pandas as pd
path='/Users/vahidpartovinia/Desktop/'
# path is your local file address
# in MS Windows you must write 'C:\\Users\\..'
filename = path+'spamdata.csv'
spam = pd.read_csv(filename)
In [2]:
# see the data
spam.head()
Out[2]:
In [3]:
#features against spam, spam=1, ham=0
import matplotlib.pyplot as plt
%matplotlib inline
#plot column number 12, versus ('ham or spam')
plt.scatter(spam.values[:,11], spam.values[:,-1]);
In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
# you may play with max_depth
dt = DecisionTreeClassifier(max_depth=2)
# you may play with depth and prune the tree in different levels
X = spam.values[:, :57]
y = spam.values[:, -1]
dt.fit(X,y)
spamnames = spam.columns.tolist()[:57]
In [5]:
dot_data = export_graphviz(dt, out_file=None,
feature_names=spamnames,
filled=True, rounded=True,
special_characters=True)
In [6]:
dot_data = export_graphviz(dt, out_file=None)
In [7]:
# you need to install graphviz-python from Anaconda,
# graphviz-python it is not installed by default.
import graphviz
In [8]:
graph = graphviz.Source(dot_data)
In [9]:
dot_data = export_graphviz(dt, out_file=None,
feature_names=spamnames,
class_names=['ham', 'spam'],
filled=True, rounded=True,
special_characters=True)
graph = graphviz.Source(dot_data)
graph
Out[9]:
In [10]:
from sklearn.model_selection import train_test_split
In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)
In [12]:
dt10 = DecisionTreeClassifier(max_depth=10)
dt11 = DecisionTreeClassifier(max_depth=11)
dt12 = DecisionTreeClassifier(max_depth=12)
# you may play with depth and prune the tree in different levels
dt10.fit(X_train,y_train)
dt11.fit(X_train,y_train)
dt12.fit(X_train,y_train)
y10_pred = dt10.predict(X_test)
y11_pred = dt11.predict(X_test)
y12_pred = dt12.predict(X_test)
from sklearn.metrics import accuracy_score
In [13]:
accuracy_score(y_test, y10_pred)
Out[13]:
In [14]:
accuracy_score(y_test, y11_pred)
Out[14]:
In [15]:
accuracy_score(y_test, y12_pred)
Out[15]: