In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sns
from sklearn import svm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2
In [3]:
# Locations of the three CSV tables read by this cell.
ENTROPY_CSV = 'data/sorted-entropy-features-vs251.csv'
FILE_ID_CSV = 'data/sorted-file-id-features-vs251.csv'
TRAIN_LABELS_CSV = 'data/sorted-train-labels-vs251.csv'

# Read each table into its own DataFrame.
pentropy = pd.read_csv(ENTROPY_CSV)
pfileid = pd.read_csv(FILE_ID_CSV)
plabels = pd.read_csv(TRAIN_LABELS_CSV)

# Cell output: the first five rows of the entropy table.
pentropy.head(n=5)
Out[3]:
In [4]:
# Preview the first five rows of the file-id table.
pfileid.head(n=5)
Out[4]:
In [5]:
# Preview the first five rows of the training-label table.
plabels.head(n=5)
Out[5]:
In [8]:
# Collect the first-column value of every row whose 'file_type' text contains
# the substring 'Port'. The notebook treats that substring as the marker for
# PDF files (presumably "Portable Document Format" -- TODO confirm that no
# other file type description also contains 'Port').
ftypes = pfileid['file_type']

# Vectorised literal substring test: regex=False matches the same way the
# `in` operator does, and na=False makes rows with a missing file type count
# as "not a PDF" instead of raising a TypeError.
is_pdf = ftypes.str.contains('Port', regex=False, na=False)

# Positional boolean selection of column 0 for the matching rows.
pdf_files = pfileid.iloc[is_pdf.values, 0].tolist()

print("Found {:d} PDF files.".format(len(pdf_files)))
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [2]:
# Location of the PDF feature table read by this cell.
PDF_FEATURES_CSV = 'data/3589-pdf-features-legit.csv'

# Read the table and show its first five rows as the cell output.
pfeat = pd.read_csv(PDF_FEATURES_CSV)
pfeat.head(n=5)
Out[2]:
In [3]:
# Show the (rows, columns) dimensions of the PDF feature table.
pfeat.shape
Out[3]:
In [ ]:
# Read the label table for the PDF features.
# NOTE(review): the file name suggests real labels do not exist yet -- confirm
# before relying on `plabel`.
PDF_LABELS_CSV = 'data/no-labels-yet.csv'
plabel = pd.read_csv(PDF_LABELS_CSV)
In [ ]:
In [ ]:
In [ ]:
In [5]:
# Feature matrix: every column of `pfeat` except the first one
# (presumably an identifier column -- TODO confirm).
X = pfeat.iloc[:, 1:]

# Placeholder targets: one random integer class in {0, 1, 2, 3} per row of X.
# The sample count is taken from X instead of being hard-coded, so X and y
# always have matching lengths, and a dedicated seeded generator makes the
# labels identical on every Restart & Run All without touching NumPy's
# global random state.
y = np.random.RandomState(42).randint(0, 4, len(X))
In [ ]:
# Keep the top 10 percent of features ranked by their chi-squared score
# against y (original note: 1006 -> 101 features; not verified here).
# chi2 requires non-negative feature values.
# `percentile` is passed by keyword because recent scikit-learn releases
# accept it as a keyword-only argument.
fsp = SelectPercentile(chi2, percentile=10)
X_new_10 = fsp.fit_transform(X, y)

# Cell output: shape of the reduced feature matrix.
X_new_10.shape
In [6]:
# Display the target array.
y
Out[6]:
In [7]:
# Normalised histogram of the values in y using 11 bins.
# `density=True` replaces the `normed=True` keyword, which newer matplotlib
# releases no longer accept.
plt.hist(y, bins=11, density=True)
plt.xlabel('label value')
plt.ylabel('density')
plt.show()
In [ ]:
In [ ]: