In [17]:
# Core scientific stack and scikit-learn components used throughout the
# notebook (classification, grid search, one-hot encoding).
import os
import sys
import time

import numpy as np
import pandas as pd
import scipy
from scipy import stats
import matplotlib.pyplot as plt

from sklearn.ensemble import IsolationForest, RandomForestClassifier
# sklearn.grid_search was deprecated in 0.18 and removed in 0.20;
# GridSearchCV lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.feature_extraction import DictVectorizer
In [18]:
# Location of the labeled training data.
# NOTE(review): absolute local path — consider an env var or a relative
# Path for portability.
# Earlier dataset (kept for provenance):
#   /Users/Gabriel/Dropbox/Research/DICE/Anomaly Detection/Usecases/POSIDONIA/cep_man_labeled.csv
dataDir = '/Users/Gabriel/Documents/workspaces/diceWorkspace/dmon-adp/data'
data = os.path.join(dataDir, 'CEP_Complete_Labeled_Extended.csv')
In [19]:
# Load the labeled dataset and use the 'key' column as the index.
df = pd.read_csv(data)
df.set_index('key', inplace=True)
# 'host' is an identifier column with no predictive value.
dropList = ['host']
# print() with a single argument behaves identically on Python 2 and 3;
# the original py2-only print statements are a syntax error on Python 3.
print("Dropped columns are: %s" % dropList)
df = df.drop(dropList, axis=1)
print("Index Name: %s" % df.index.name)
In [20]:
# Quick sanity check on dataset dimensions after dropping columns.
print("Dataframe shape (row, col): %s" % str(df.shape))
In [21]:
# Detect categorical (object-dtype) columns and one-hot encode them.
col = []
for el, v in df.dtypes.items():  # .iteritems() was removed in pandas 2.0
    if v == 'object':
        col.append(el)


def ohEncoding(data, cols, replace=False):
    """One-hot encode the given columns of a dataframe.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame.
    cols : list of str
        Names of the categorical columns to encode.
    replace : bool
        If True, drop the original columns and join the encoded ones.

    Returns
    -------
    tuple
        (data, vecData, vec): the (possibly modified) frame, the encoded
        columns as a frame, and the fitted DictVectorizer.
    """
    vec = DictVectorizer()
    # Use a distinct loop variable so the lambda does not shadow the
    # module-level `col` list.
    mkdict = lambda row: dict((c, row[c]) for c in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    # NOTE(review): get_feature_names() is removed in scikit-learn >= 1.2;
    # switch to get_feature_names_out() when upgrading.
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec


# DictVectorizer is already imported at the top of the notebook.
df, t, v = ohEncoding(df, col, replace=True)
print(df.shape)
In [22]:
# Split into features (all but the last column) and target (last column).
features = df.columns[:-1]
print("Detected Features are: %s" % features)
X = df[features]
# Target is always the last column of the dataframe
y = df.iloc[:, -1].values
print(y)
In [24]:
# Grid-search Random Forest hyperparameters with 5-fold cross-validation.
rfc = RandomForestClassifier(n_jobs=-1, max_features='sqrt', n_estimators=50, oob_score=True)
param_grid = {
    'n_estimators': [200, 700],
    # NOTE(review): 'auto' for max_features was removed in scikit-learn 1.3;
    # drop it when upgrading.
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [5, 15, 25]
}
start_time_g = time.time()
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(X, y)
bestParam = CV_rfc.best_params_
# (the original printed best_params_ twice — once raw, once formatted)
print("Best params for Random Forest: %s" % str(bestParam))
elapsed_time_g = time.time() - start_time_g
print("Grid Search for Random Forest took: %s" % str(elapsed_time_g))
In [26]:
# 10-fold cross-validation of the tuned Random Forest.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
# fix random seed for reproducibility
seed = 7
start_time = time.time()
clfKV = RandomForestClassifier(max_depth=bestParam['max_depth'],
                               n_estimators=bestParam['n_estimators'],
                               max_features=bestParam['max_features'],
                               n_jobs=-1)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(clfKV, X, y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))
elapsed_time = time.time() - start_time
print("Cross validation took: %s" % str(elapsed_time))
In [29]:
# Train the final Random Forest with the grid-search winners, then rank
# feature importances and persist them as CSV.
start_time = time.time()
clf = RandomForestClassifier(max_depth=bestParam['max_depth'],
                             n_estimators=bestParam['n_estimators'],
                             max_features=bestParam['max_features'],
                             n_jobs=-1)
clf.fit(X, y)
predict = clf.predict(X)
predProb = clf.predict_proba(X)
# NOTE(review): this scores on the training set — it is a training score,
# not an unbiased estimate of generalization.
score = clf.score(X, y)
print("Training Score Random Forest: %s" % score)
expDir = '/Users/Gabriel/Documents/workspaces/diceWorkspace/dmon-adp/experiments'
# zip(X, ...) iterates the column names of the feature frame.
fimp = list(zip(X, clf.feature_importances_))
print("Feature importance Random Forest Training: ")
print(fimp)
elapsed_time = time.time() - start_time
print("Training Random Forest Took: %s" % str(elapsed_time))
dfimp = dict(fimp)
# list() is required on Python 3, where dict.items() returns a view.
dfimp = pd.DataFrame(list(dfimp.items()), columns=['Metric', 'Importance'])
# DataFrame.sort() was removed in pandas 0.20; sort_values replaces it.
sdfimp = dfimp.sort_values('Importance', ascending=False)
dfimpCsv = 'Feature_Importance_RF_%s.csv' % 'CEP'
sdfimp.to_csv(os.path.join(expDir, dfimpCsv))
In [16]:
# Grid-search Decision Tree hyperparameters with 5-fold cross-validation.
dt_g = DecisionTreeClassifier()
param_grid_dt = {
    # BUG FIX: 'random' is a splitter value, not a criterion; valid
    # criteria are 'gini' and 'entropy'. The original grid would fail.
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [5, 15, 25, 50, 100],
    # BUG FIX: the parameter is named min_samples_split; the misspelled
    # 'min_sample_split' key makes GridSearchCV raise an invalid-parameter
    # error.
    'min_samples_split': [2, 5, 10]
}
start_time_d = time.time()
CV_dt = GridSearchCV(estimator=dt_g, param_grid=param_grid_dt, cv=5)
CV_dt.fit(X, y)
bestParam_dt = CV_dt.best_params_
print("Best params for Decision Tree: %s" % str(bestParam_dt))
elapsed_time_dt = time.time() - start_time_d
print("Grid Search for Decision Tree took: %s" % str(elapsed_time_dt))
In [ ]:
# Train a Decision Tree with the grid-search winners.
# NOTE(review): the original cell referenced undefined names (settings,
# max_features, max_depth, mname, self.modelDir) copied from class code
# and could not run; rewired to use bestParam_dt / expDir defined in
# earlier cells of this notebook.
dt = DecisionTreeClassifier(criterion=bestParam_dt['criterion'],
                            splitter=bestParam_dt['splitter'],
                            max_features=bestParam_dt['max_features'],
                            max_depth=bestParam_dt['max_depth'],
                            min_samples_split=bestParam_dt['min_samples_split'])
dt.fit(X, y)
predict = dt.predict(X)
print("Prediction for Decision Tree Training:")
print(predict)
predProb = dt.predict_proba(X)
print("Prediction probabilities for Decision Tree Training:")
print(predProb)
score = dt.score(X, y)
print("Decision Tree Training Score: %s" % str(score))
fimp = list(zip(X, dt.feature_importances_))
# BUG FIX: the original message said "Random Forest" in the Decision Tree cell.
print("Feature importance Decision Tree Training: ")
print(fimp)
dfimp = dict(fimp)
dfimp = pd.DataFrame(list(dfimp.items()), columns=['Metric', 'Importance'])
# DataFrame.sort() was removed in pandas 0.20; sort_values replaces it.
sdfimp = dfimp.sort_values('Importance', ascending=False)
dfimpCsv = 'Feature_Importance_%s.csv' % 'DT_CEP'
sdfimp.to_csv(os.path.join(expDir, dfimpCsv))
In [45]:
# Load the unlabeled CEP data and apply the same preprocessing as training.
dataDir2 = '/Users/Gabriel/Documents/workspaces/diceWorkspace/dmon-adp/data'
# BUG FIX: the original joined against dataDir instead of dataDir2 (the two
# happen to hold the same path today, but the latent bug would bite as soon
# as they diverge).
cep = os.path.join(dataDir2, 'cep.csv')
dfCep = pd.read_csv(cep)
dfCep = dfCep.drop(dropList, axis=1)
dfCep.set_index('key', inplace=True)
# Re-detect categorical columns; reuse the ohEncoding helper defined earlier
# instead of redefining it (the original duplicated the whole function).
col = []
for el, v in dfCep.dtypes.items():  # .iteritems() was removed in pandas 2.0
    if v == 'object':
        col.append(el)
dfCep, t, v = ohEncoding(dfCep, col, replace=True)
# Show only the first rows instead of dumping the entire frame.
dfCep.head()
Out[45]:
In [47]:
# Apply the trained Random Forest to the unlabeled CEP data.
# NOTE(review): assumes dfCep's encoded columns match the training features
# in name and order — verify, since the DictVectorizer was refit on dfCep.
cep_predict = clf.predict(dfCep)
cep_predict
# print predict
# View the predicted probabilities of the first 10 observations
# print predProb
Out[47]:
In [48]:
# Attach predicted labels to the frame, persist the results as CSV, and
# dump the serialized model.
dfCep['Target'] = cep_predict
dfCep.to_csv(os.path.join(expDir, 'CEP_Anomaly_exp.csv'))
# cPickle is Python 2 only; on Python 3 the plain pickle module already
# uses the C implementation.
try:
    import cPickle as pickle
except ImportError:
    import pickle
fname = os.path.join(expDir, 'CEP_Model_EXP')
# BUG FIX: use a context manager so the file handle is closed even on
# error (the original left the file object open).
with open(fname, "wb") as f:
    pickle.dump(clf, f)
In [ ]: