This notebook takes the preprocessed data as input, evaluates each variable as a candidate feature, and begins building learning models on it.
In [1]:
%pylab --no-import-all inline
In [2]:
import pandas as pd
import numpy as np
## Load the preprocessed file
allDf = pd.read_csv('../data/vcdb_fully_processed.csv')
In [3]:
allDf.head()
Out[3]:
In [4]:
"""
Get all the columns for the dataframe
"""
for i, col in enumerate(allDf.columns):
print i, col, allDf[col].dtype
In [5]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# setting the palette
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (10, 10)})
for i, col in enumerate(allDf.columns):
    ## plot histograms for the integer-typed columns only
    if allDf[col].dtype == 'int64':
        print i, col
        f = plt.figure(figsize=(10, 10))
        plt.hist(allDf[col].values)
        sns.axlabel(col, "values")
Histograms
In [6]:
"""
Instead of calculating joint probability distribution taking
all possible combinations of features taken 2 at a time, I chose
to calculate distribution for some key columns
KeyCols :
- actor.external int64
- actor.internal int64
- actor.partner int64
- actor.unknown int64
- action.malware int64
- action.hacking int64
- action.social int64
- action.misuse int64
- action.physical int64
- action.error int64
- action.environmental int64
- action.unknown int64
- timeline.discovery.day_count float64
- timeline.incident.day float64
- timeline.incident.month float64
- timeline.incident.year int64
"""
keyCols = [
'actor.external', # int64
'actor.internal', # int64
'actor.partner', # int64
'actor.unknown', # int64
'action.malware', # int64
'action.hacking', # int64
'action.social', # int64
'action.misuse', # int64
'action.physical', # int64
'action.error', # int64
'action.environmental', # int64
'action.unknown', # int64
# 'timeline.discovery.day_count', #float64
# 'timeline.incident.day', #float64
# 'timeline.incident.month', # float64
'timeline.incident.year' #int64
]
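For context on why the list is restricted: the number of pairwise plots grows as n*(n-1)/2, so plotting every pair of columns quickly becomes unwieldy. A quick count of both options (a minimal sketch; the exact numbers depend on the loaded CSV):
In [ ]:
from itertools import combinations
## pairwise plot counts: all columns vs. the key columns only
## (illustrative; exact numbers depend on the loaded CSV)
print len(list(combinations(allDf.columns, 2))), len(list(combinations(keyCols, 2)))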
In [7]:
for i, col in enumerate(keyCols):
    print i, col
    f = plt.figure(figsize=(10, 10))
    plt.hist(allDf[col].values)
    sns.axlabel(col, "values")
In [8]:
for i, col in enumerate(keyCols):
    print i, col
    f = plt.figure(figsize=(10, 10))
    sns.distplot(allDf[col].values, rug=True)
    sns.axlabel(col, "values")
In [15]:
"""
To get 2 features at a time
"""
from itertools import combinations
In [16]:
for col1, col2 in combinations(keyCols, r=2):
    f = plt.figure(figsize=(10, 10))
    plt.hexbin(allDf[col1], allDf[col2], gridsize=100, cmap="BuGn")
    sns.axlabel(col1, col2)
In [17]:
for col1, col2 in combinations(keyCols, r=2):
    print col1, col2
    f = plt.figure(figsize=(10, 10))
    sns.kdeplot(allDf[col1], allDf[col2], shade=True)
    sns.axlabel(col1, col2)
In [18]:
"""
Combined Bivariate and Univariate Plots
@Ashley: review the bug
"""
for col1, col2 in combinations(keyCols, r=2):
print col1, col2
jointDf = pd.concat([allDf[col1], allDf[col2]])
f = plt.figure(figsize=(10, 10))
sns.jointplot(col1, col2, jointDf.values, kind="kde")
sns.axlabel(col1, col2)
In [9]:
## remove year from keyCols
keyCols.remove('timeline.incident.year')
sns.boxplot(allDf[keyCols], names=keyCols, whis=np.inf, color="PaleGreen")
Out[9]:
In [10]:
sns.violinplot(allDf[keyCols], names=keyCols, color="pastel")
Out[10]:
In [13]:
sns.corrplot(allDf[keyCols], names=keyCols, cmap="RdBu_r")
Out[13]:
In [11]:
numericCols = list()
for col in allDf.columns:
    if allDf[col].dtype == 'int64':
        numericCols.append(col)
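Equivalently, assuming the installed pandas provides select_dtypes, the loop above collapses to a one-liner:
In [ ]:
## equivalent to the loop above (assumes pandas provides select_dtypes)
numericCols = list(allDf.select_dtypes(include=['int64']).columns)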
In [12]:
sns.corrplot(allDf[numericCols])
Out[12]:
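The corrplot gives a visual overview; to rank feature pairs by correlation strength numerically, one option is to unstack the correlation matrix and sort (a minimal sketch using pandas only; the top-10 cutoff is arbitrary):
In [ ]:
## Rank feature pairs by absolute correlation (sketch; cutoff is arbitrary)
corr = allDf[numericCols].corr()
pairs = corr.unstack().dropna()
## keep each unordered pair once and drop the diagonal
pairs = pairs[pairs.index.get_level_values(0) < pairs.index.get_level_values(1)]
for (c1, c2), r in sorted(pairs.abs().iteritems(), key=lambda kv: -kv[1])[:10]:
    print c1, c2, r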
In [ ]:
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
In [ ]:
from time import time
(sample_size, features) = allDf[keyCols].shape
def bench_k_means(estimator, name, data):
    ## VCDB has no ground-truth cluster labels, so only the
    ## unsupervised diagnostics (inertia, silhouette) apply here
    t0 = time()
    estimator.fit(data)
    print('% 9s   %.2fs   %i   %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))
In [ ]:
## standardize features before clustering (scale was imported above)
data = scale(allDf[keyCols].values)
bench_k_means(KMeans(init='k-means++', n_clusters=10, n_init=10), name="k-means++", data=data)
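A single run at n_clusters=10 is arbitrary; a common follow-up is the elbow method, scanning a range of k and plotting inertia (a sketch; the 2..15 range is an assumption, not part of the original analysis):
In [ ]:
## Elbow-method sketch: inertia vs. k (the 2..15 range is arbitrary)
ks = range(2, 16)
inertias = [KMeans(init='k-means++', n_clusters=k, n_init=10).fit(data).inertia_
            for k in ks]
f = plt.figure(figsize=(10, 10))
plt.plot(ks, inertias, marker='o')
sns.axlabel("n_clusters", "inertia")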
In [ ]:
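The PCA and scale imports above suggest a dimensionality-reduction step; one way to eyeball the clusters is to project the scaled data onto its first two principal components and color points by k-means assignment (a sketch under that assumption, not the original analysis):
In [ ]:
## Project onto the first two principal components and color by
## k-means assignment (visual sanity check only)
reduced = PCA(n_components=2).fit_transform(data)
km = KMeans(init='k-means++', n_clusters=10, n_init=10).fit(data)
f = plt.figure(figsize=(10, 10))
plt.scatter(reduced[:, 0], reduced[:, 1], c=km.labels_, cmap="Set1")
sns.axlabel("PC 1", "PC 2")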