version 0.1, May 2016
This notebook is licensed under a [Creative Commons Attribution-ShareAlike 3.0 Unported License]
Phishing is the act of defrauding an online user to obtain personal information by posing as a trustworthy institution or entity. Users often have a hard time telling legitimate and malicious sites apart because phishing sites are made to look exactly like the originals, so there is a need for better tools to combat attackers.
In [1]:
import pandas as pd
import zipfile
with zipfile.ZipFile('../datasets/phishing.csv.zip', 'r') as z:
    f = z.open('phishing.csv')
    data = pd.read_csv(f, index_col=False)
In [2]:
data.head()
Out[2]:
In [4]:
data.phishing.value_counts()
Out[4]:
In [92]:
data.url[data.phishing==1].sample(50, random_state=1).tolist()
Out[92]:
Create binary features indicating whether the URL contains any of the following keywords:
In [26]:
keywords = ['https', 'login', '.php', '.html', '@', 'sign']
In [31]:
for keyword in keywords:
    # regex=False so that '.php' and '.html' are matched literally
    data['keyword_' + keyword] = data.url.str.contains(keyword, regex=False).astype(int)
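As a quick sanity check (an extra inspection step, not part of the original cells), we can peek at the new binary columns on a few rows:
# Inspect the keyword indicator columns alongside the raw URL
data[['url'] + ['keyword_' + k for k in keywords]].head()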
In [35]:
data['length'] = data.url.str.len() - 2
In [38]:
# The domain is the third element when splitting on '/', e.g. 'http:', '', 'domain.com', ...
domain = data.url.str.split('/', expand=True).iloc[:, 2]
In [41]:
data['length_domain'] = domain.str.len()
In [44]:
domain.head(12)
Out[44]:
In [67]:
# Flag URLs whose domain is a bare IP address (dot escaped because str.replace treats the pattern as a regex)
data['isIP'] = domain.str.replace(r'\.', '').str.isnumeric().astype(int)
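To see what the isIP flag captures, here is a minimal check on two made-up values (hypothetical examples, not rows from the dataset):
# '192.168.0.1' becomes purely numeric once the dots are stripped; 'www.google.com' does not
pd.Series(['www.google.com', '192.168.0.1']).str.replace(r'\.', '').str.isnumeric()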
In [68]:
data['count_com'] = data.url.str.count('com')
In [69]:
data.sample(15, random_state=4)
Out[69]:
In [70]:
X = data.drop(['url', 'phishing'], axis=1)
In [71]:
y = data.phishing
In [72]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
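Note that sklearn.cross_validation was deprecated in scikit-learn 0.18 and later removed; on newer versions the equivalent import is:
# Equivalent import for scikit-learn >= 0.18
from sklearn.model_selection import cross_val_score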
In [73]:
clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
In [74]:
cross_val_score(clf, X, y, cv=10)
Out[74]:
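A convenient way to summarize the ten fold scores (an extra step, not in the original cells):
import numpy as np

scores = cross_val_score(clf, X, y, cv=10)
print('accuracy: %0.3f (+/- %0.3f)' % (np.mean(scores), np.std(scores)))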
In [75]:
clf.fit(X, y)
Out[75]:
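Once the forest is fitted, it can be useful to see which engineered features it relies on most (an optional inspection step, not in the original notebook):
# Rank the features by importance in the fitted random forest
pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)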
In [76]:
from sklearn.externals import joblib
In [79]:
joblib.dump(clf, '22_clf_rf.pkl', compress=3)
Out[79]:
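sklearn.externals.joblib has since been removed from scikit-learn (as of 0.23); with newer versions the standalone joblib package is used instead:
# Equivalent persistence with the standalone joblib package (scikit-learn >= 0.23)
import joblib
joblib.dump(clf, '22_clf_rf.pkl', compress=3)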
In [131]:
from m22_model_deployment import predict_proba
In [132]:
predict_proba('http://www.vipturismolondres.com/com.br/?atendimento=Cliente&/LgSgkszm64/B8aNzHa8Aj.php')
Out[132]:
First, we need to install some libraries:
pip install flask-restplus
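flask-restplus is no longer maintained; its drop-in fork flask-restx exposes the same API, so an alternative (if the imports below are adapted accordingly) would be:
pip install flask-restx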
Load Flask
In [87]:
from flask import Flask
from flask_restplus import Api, Resource, fields
from sklearn.externals import joblib
import pandas as pd
Create the API
In [128]:
app = Flask(__name__)

api = Api(
    app,
    version='1.0',
    title='Phishing Prediction API',
    description='Phishing Prediction API')

ns = api.namespace('predict',
                   description='Phishing Classifier')

parser = api.parser()
parser.add_argument(
    'URL',
    type=str,
    required=True,
    help='URL to be analyzed',
    location='args')

resource_fields = api.model('Resource', {
    'result': fields.String,
})
Load the model and create a resource that predicts whether a URL is phishing
In [129]:
clf = joblib.load('22_clf_rf.pkl')

@ns.route('/')
class PhishingApi(Resource):

    @api.doc(parser=parser)
    @api.marshal_with(resource_fields)
    def get(self):
        args = parser.parse_args()
        result = self.predict_proba(args)
        return result, 200

    def predict_proba(self, args):
        url = args['URL']
        url_ = pd.DataFrame([url], columns=['url'])

        # Create features (same names and order as in training)
        keywords = ['https', 'login', '.php', '.html', '@', 'sign']
        for keyword in keywords:
            url_['keyword_' + keyword] = url_.url.str.contains(keyword, regex=False).astype(int)

        url_['length'] = url_.url.str.len() - 2

        domain = url_.url.str.split('/', expand=True).iloc[:, 2]
        url_['length_domain'] = domain.str.len()

        # Use the domain (as in training), with the dot escaped for the regex replace
        url_['isIP'] = domain.str.replace(r'\.', '').str.isnumeric().astype(int)

        url_['count_com'] = url_.url.str.count('com')

        # Make prediction
        p1 = clf.predict_proba(url_.drop('url', axis=1))[0, 1]
        print('url=', url, '| p1=', p1)

        return {
            "result": p1
        }
Run the API
In [ ]:
app.run(debug=True, use_reloader=False, host='0.0.0.0', port=5000)
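With the server running, the endpoint can be queried over HTTP; a minimal client-side sketch, assuming the API runs on localhost and using a hypothetical URL to score:
import requests

# 'localhost:5000' assumes the server started in the cell above; the URL below is a made-up example
r = requests.get('http://localhost:5000/predict/',
                 params={'URL': 'http://www.example.com/login.php'})
print(r.json())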