by Alejandro Correa Bahnsen & Iván Torroledo
version 1.2, Feb 2018
This notebook is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.
Phishing, by definition, is the act of defrauding an online user in order to obtain personal information by posing as a trustworthy institution or entity. Users usually have a hard time differentiating between legitimate and malicious sites because they are made to look exactly the same. Therefore, there is a need to create better tools to combat attackers.
In [1]:
import pandas as pd
import zipfile
# Read phishing.csv directly out of the zip archive into a DataFrame.
# Both the archive and the member file are opened as context managers so the
# member handle is closed as soon as pandas has finished reading it.
with zipfile.ZipFile('../datasets/model_deployment/phishing.csv.zip', 'r') as z:
    with z.open('phishing.csv') as f:
        data = pd.read_csv(f, index_col=False)
In [2]:
# Preview the first rows of the loaded dataset.
data.head()
Out[2]:
In [3]:
# Preview the last rows of the loaded dataset.
data.tail()
Out[3]:
In [4]:
# Class balance: number of rows for each value of the `phishing` label.
data['phishing'].value_counts()
Out[4]:
In [5]:
# Show 50 randomly chosen URLs labelled as phishing (fixed seed, so the same
# 50 are shown on every run).
data.loc[data['phishing'] == 1, 'url'].sample(n=50, random_state=1).tolist()
Out[5]:
Create binary features that indicate whether the URL contains any of the following keywords:
In [6]:
# Substrings looked for in every URL; each entry becomes one indicator feature.
keywords = ['https', 'login', '.php', '.html', '@', 'sign']
In [7]:
# Add one 0/1 column per keyword, named 'keyword_<keyword>'.
# NOTE(review): str.contains interprets the pattern as a regular expression by
# default, so the leading '.' in '.php' / '.html' matches any character.
for kw in keywords:
    url_has_kw = data['url'].str.contains(kw)
    data['keyword_' + kw] = url_has_kw.astype(int)
In [8]:
# URL length in characters, minus 2. The column name keeps the original
# 'lenght' spelling because later cells and the serving code use it.
# NOTE(review): the reason for the offset of 2 is not stated in the notebook.
data['lenght'] = data['url'].str.len() - 2
In [9]:
# Split every URL on '/' and keep the third piece (index 2). For URLs shaped
# like 'scheme://host/path' that piece is the host.
url_pieces = data['url'].str.split('/', expand=True)
domain = url_pieces.iloc[:, 2]
In [10]:
# Number of characters in the extracted domain.
data['lenght_domain'] = domain.str.len()
In [11]:
# Inspect the first 12 extracted domains.
domain.head(12)
Out[11]:
In [12]:
# Flag domains that consist only of digits once the dots are removed
# (e.g. '192.168.0.1' -> '19216801'); stored as 0/1.
# Multiplying a string by 1 leaves it unchanged; it is kept from the original.
domain_without_dots = domain.str.replace('.', '') * 1
data['isIP'] = domain_without_dots.str.isnumeric().astype(int)
In [13]:
# Number of occurrences of the substring 'com' anywhere in the URL.
data['count_com'] = data['url'].str.count('com')
In [14]:
# Show 15 random rows (fixed seed) to check the engineered feature columns.
data.sample(15, random_state=4)
Out[14]:
In [15]:
# Feature matrix: every column except the raw URL text and the label.
non_feature_columns = ['url', 'phishing']
X = data.drop(non_feature_columns, axis='columns')
In [16]:
# Target vector: the `phishing` label column.
y = data['phishing']
In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
In [18]:
# Random forest with 100 trees, trained on all available cores.
# random_state is fixed so the cross-validation scores and the persisted model
# are reproducible across Restart & Run All.
clf = RandomForestClassifier(n_jobs=-1, n_estimators=100, random_state=42)
In [19]:
# Evaluate the classifier with 10-fold cross-validation on the engineered features.
cross_val_score(clf, X, y, cv=10)
Out[19]:
In [20]:
# Fit on the full dataset; this fitted estimator is the one saved to disk below.
clf.fit(X, y)
Out[20]:
In [21]:
from sklearn.externals import joblib
In [22]:
# Persist the fitted classifier (compression level 3) so the API can load it later.
joblib.dump(clf, '../datasets/model_deployment/07_phishing_clf.pkl', compress=3)
Out[22]:
In [23]:
from m07_model_deployment import predict_proba
In [24]:
# Score one example URL with the predict_proba helper imported from m07_model_deployment.
predict_proba('http://www.vipturismolondres.com/com.br/?atendimento=Cliente&/LgSgkszm64/B8aNzHa8Aj.php')
Out[24]:
First, we need to install the required library:
pip install flask-restplus
Load Flask
In [25]:
from flask import Flask
from flask_restplus import Api, Resource, fields
from sklearn.externals import joblib
import pandas as pd
Create api
In [26]:
# Flask application and the flask-restplus Api wrapper around it.
app = Flask(__name__)
api = Api(app, version='1.0', title='Phishing Prediction API',
          description='Phishing Prediction API')

# Namespace under which the prediction resource is registered.
ns = api.namespace('predict', description='Phishing Classifier')

# Request parser: one mandatory string argument, 'URL', read from the query string.
parser = api.parser()
parser.add_argument('URL', type=str, required=True,
                    help='URL to be analyzed', location='args')

# Response schema: a single string field named 'result'.
resource_fields = api.model('Resource', {'result': fields.String})
Load the model and create a function that predicts the probability that a URL is phishing
In [27]:
# Load the classifier from the path the notebook wrote with joblib.dump.
# joblib.load unpickles the file, so only load model files from a trusted source.
clf = joblib.load('../datasets/model_deployment/07_phishing_clf.pkl')
@ns.route('/')
class PhishingApi(Resource):
    """REST resource that scores a URL with the classifier held in `clf`."""

    @api.doc(parser=parser)
    @api.marshal_with(resource_fields)
    def get(self):
        """Handle a GET request carrying a `URL` query argument.

        Returns:
            tuple: ``(payload, 200)`` where ``payload`` is the dict produced by
            `predict_proba`, marshalled through `resource_fields`.
        """
        args = parser.parse_args()
        result = self.predict_proba(args)
        return result, 200

    def predict_proba(self, args):
        """Build the model features for one URL and return its phishing score.

        The feature columns are created with the same names and in the same
        order as in the notebook's training cells.

        Args:
            args: mapping with a ``'URL'`` key holding the URL string.

        Returns:
            dict: ``{"result": p1}`` where ``p1`` is the value in column 1 of
            ``clf.predict_proba`` for the URL.

        Raises:
            IndexError: when the URL contains fewer than two '/' characters,
                because the split then has no third piece to use as domain.
        """
        url = args['URL']
        url_ = pd.DataFrame([url], columns=['url'])

        # Create features
        keywords = ['https', 'login', '.php', '.html', '@', 'sign']
        for keyword in keywords:
            url_['keyword_' + keyword] = url_.url.str.contains(keyword).astype(int)

        url_['lenght'] = url_.url.str.len() - 2
        domain = url_.url.str.split('/', expand=True).iloc[:, 2]
        url_['lenght_domain'] = domain.str.len()
        # Computed on the domain, exactly as in the training cell. The previous
        # code used the full URL, which is never purely numeric, so isIP was
        # always 0 at serving time.
        url_['isIP'] = (domain.str.replace('.', '') * 1).str.isnumeric().astype(int)
        url_['count_com'] = url_.url.str.count('com')

        # Make prediction
        p1 = clf.predict_proba(url_.drop('url', axis=1))[0,1]

        print('url=', url,'| p1=', p1)

        return {
            "result": p1
        }
Run API
In [28]:
# Serve the API on all network interfaces, port 5000.
# Debug mode is off: with host='0.0.0.0' the Werkzeug interactive debugger would
# be reachable from the network and lets a client execute arbitrary Python code.
app.run(debug=False, use_reloader=False, host='0.0.0.0', port=5000)