by Alejandro Correa Bahnsen & Iván Torroledo
version 1.4, February 2019
This notebook is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.
Phishing, by definition, is the act of defrauding an online user in order to obtain personal information by posing as a trustworthy institution or entity. Users usually have a hard time differentiating between legitimate and malicious sites because they are made to look exactly the same. Therefore, there is a need to create better tools to combat attackers.
In [2]:
import pandas as pd
# Load the phishing URL dataset (columns used below: `url`, `phishing`) from GitHub.
DATASET_URL = 'https://raw.githubusercontent.com/albahnsen/PracticalMachineLearningClass/master/datasets/phishing.csv'
data = pd.read_csv(DATASET_URL)
In [3]:
# Peek at the first five rows of the raw dataset.
data.head(5)
Out[3]:
In [4]:
# Peek at the last five rows of the raw dataset.
data.tail(5)
Out[4]:
In [5]:
# Class balance: number of rows per value of the `phishing` label.
data['phishing'].value_counts()
Out[5]:
In [6]:
# Show a fixed (seeded) sample of 50 URLs labelled as phishing.
phishing_urls = data.loc[data.phishing == 1, 'url']
phishing_urls.sample(n=50, random_state=1).tolist()
Out[6]:
Create binary features that indicate whether the URL contains any of the following keywords:
In [7]:
# Substrings whose presence in a URL is turned into a binary feature.
keywords = [
    'https',
    'login',
    '.php',
    '.html',
    '@',
    'sign',
]
In [8]:
# One binary indicator column per keyword: 1 when the URL contains it, else 0.
# regex=False makes this a literal substring test. With the default
# (regex=True) the leading '.' in '.php' / '.html' is a regex wildcard that
# matches any character, so a URL containing e.g. 'xphp' would wrongly be
# flagged as containing '.php'.
# NOTE(review): any serving-side feature extraction must use the same matching
# rule, otherwise training and prediction features will disagree.
for keyword in keywords:
    data['keyword_' + keyword] = data.url.str.contains(keyword, regex=False).astype(int)
In [9]:
# URL length feature (the column keeps its original spelling, 'lenght').
# NOTE(review): the reason for subtracting 2 is not visible here -- confirm.
data['lenght'] = data['url'].str.len() - 2
In [10]:
# Split every URL on '/'. For 'scheme://host/path' the pieces are
# 'scheme:', '', 'host', 'path', ... so the third piece (position 2) is the host.
url_parts = data.url.str.split('/', expand=True)
domain = url_parts.iloc[:, 2]
In [11]:
# Character count of the extracted domain, stored as a feature column.
domain_length = domain.str.len()
data['lenght_domain'] = domain_length
In [12]:
# Inspect the first 12 extracted domains.
domain.iloc[:12]
Out[12]:
In [13]:
# Flag domains that are dotted numeric addresses: remove the dots and check
# that only numeric characters remain (1 = looks like an IP, 0 = otherwise).
# regex=False pins '.' to a literal dot; the default of `regex` in
# Series.str.replace is not the same across pandas versions, and as a regex
# '.' would match every character.
# The original `* 1` (string repetition by one) was a no-op and is dropped.
data['isIP'] = domain.str.replace('.', '', regex=False).str.isnumeric().astype(int)
In [14]:
# Number of occurrences of the substring 'com' anywhere in the URL.
data['count_com'] = data['url'].str.count('com')
In [15]:
# Seeded sample of 15 rows to eyeball the engineered feature columns.
data.sample(n=15, random_state=4)
Out[15]:
In [16]:
# Feature matrix: every column except the raw URL text and the target label.
X = data.drop(columns=['url', 'phishing'])
In [17]:
# Target vector: the `phishing` label column.
y = data['phishing']
In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
In [19]:
# Shallow random forest (100 trees, depth 3) trained on all available cores.
# random_state is fixed so that the cross-validation scores and the model
# persisted later are reproducible across kernel restarts.
clf = RandomForestClassifier(
    n_jobs=-1,
    n_estimators=100,
    max_depth=3,
    random_state=42,
)
In [20]:
# 10-fold cross-validation scores for the classifier on the engineered features.
cross_val_score(estimator=clf, X=X, y=y, cv=10)
Out[20]:
In [21]:
# Fit the classifier on the full dataset before persisting it.
clf.fit(X=X, y=y)
Out[21]:
In [22]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; joblib is a direct dependency of scikit-learn, so import it directly
# and fall back to the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
In [23]:
# Persist the fitted classifier (compression level 3) into the deployment folder.
model_path = 'model_deployment/phishing_clf.pkl'
joblib.dump(clf, model_path, compress=3)
Out[23]:
In [1]:
from model_deployment.m09_model_deployment import predict_proba
In [2]:
# Try the imported prediction helper on a single example URL.
sample_url = 'http://www.vipturismolondres.com/com.br/?atendimento=Cliente&/LgSgkszm64/B8aNzHa8Aj.php'
predict_proba(sample_url)
Out[2]:
First we need to install some libraries
pip install flask-restplus
Load Flask
In [4]:
from flask import Flask
from flask_restplus import Api, Resource, fields
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; import joblib directly and keep the vendored copy only as a fallback.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
Create api
In [5]:
# Flask application wrapped in a flask-restplus Api object.
app = Flask(__name__)
api = Api(app, version='1.0', title='Phishing Prediction API',
          description='Phishing Prediction API')

# Namespace 'predict' groups the classifier endpoint(s).
ns = api.namespace('predict', description='Phishing Classifier')

# Request parser with a single required string argument, 'URL',
# looked up in the request's `args` location.
parser = api.parser()
parser.add_argument('URL', type=str, required=True,
                    help='URL to be analyzed', location='args')

# Response model used for marshalling: one string field named 'result'.
resource_fields = api.model('Resource', {'result': fields.String})
Load model and create function that predicts a URL
In [6]:
from model_deployment.m09_model_deployment import predict_proba
In [7]:
# Resource mounted at the root of the 'predict' namespace.
# (Plain comments are used instead of docstrings so the generated API
# documentation is not altered.)
@ns.route('/')
class PhishingApi(Resource):

    @api.doc(parser=parser)
    @api.marshal_with(resource_fields)
    def get(self):
        # Read the 'URL' argument, score it with predict_proba, and return
        # the result under the "result" key with HTTP status 200.
        url = parser.parse_args()['URL']
        payload = {"result": predict_proba(url)}
        return payload, 200
Run API
In [ ]:
# Start the development server on all interfaces, port 5000.
# debug is disabled on purpose: with host='0.0.0.0' the server is reachable
# from other machines, and Flask's debug mode enables the Werkzeug interactive
# debugger, which allows arbitrary code execution by anyone who can reach it.
# use_reloader=False is kept so the server runs inside the notebook kernel
# without spawning a reloader process.
app.run(debug=False, use_reloader=False, host='0.0.0.0', port=5000)