In [2]:
    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2
import matplotlib.mlab as mlab
    
In [3]:
    
# Read the three sorted vs251 tables (entropy features, file-id features,
# training labels) in one pass, then preview the entropy table.
pentropy, pfileid, plabels = map(
    pd.read_csv,
    (
        'data/sorted-entropy-features-vs251.csv',
        'data/sorted-file-id-features-vs251.csv',
        'data/sorted-train-labels-vs251.csv',
    ),
)
pentropy.head()
    
    Out[3]:
In [4]:
    
# Preview the first rows of the file-id feature table.
pfileid.head()
    
    Out[4]:
In [5]:
    
# Preview the first rows of the training-label table.
plabels.head()
    
    Out[5]:
In [8]:
    
# Collect the first-column value of every `pfileid` row whose `file_type`
# contains the substring 'Port'.
# NOTE(review): 'Port' presumably targets "Portable Document Format", but it
# also matches any other type string containing "Port" (e.g. a
# "Portable Executable" description) -- confirm against the actual data.
ftypes = pfileid['file_type']

# Vectorised literal substring match. `na=False` counts missing or non-string
# file types as "no match" instead of raising TypeError, which the
# element-wise `'Port' in ftype` test does on NaN.
is_pdf = ftypes.str.contains('Port', regex=False, na=False)
pdf_files = pfileid.loc[is_pdf].iloc[:, 0].tolist()

print("Found {:d} PDF files.".format(len(pdf_files)))
    
    
In [ ]:
    
    
In [ ]:
    
    
In [ ]:
    
    
In [ ]:
    
    
In [2]:
    
# Load the PDF feature table and preview its first rows.
# NOTE(review): this cell's execution count (In [2]) is lower than the cells
# above it, so it was presumably run in a later kernel session -- confirm the
# notebook still works with Restart & Run All.
pfeat = pd.read_csv('data/3589-pdf-features-legit.csv')
pfeat.head()
    
    Out[2]:
In [3]:
    
# Show (n_rows, n_columns) of the PDF feature table.
pfeat.shape
    
    Out[3]:
In [ ]:
    
# Load the label table for the PDF features.
# NOTE(review): judging by the file name this is a placeholder path, and the
# cell has no execution count, so it has presumably never been run -- confirm
# the file exists before relying on `plabel`.
plabel = pd.read_csv('data/no-labels-yet.csv')
    
In [ ]:
    
    
In [ ]:
    
    
In [ ]:
    
    
In [5]:
    
# Feature matrix: every column of `pfeat` except the first
# (presumably an identifier column -- TODO confirm).
X = pfeat.iloc[:, 1:]

# Placeholder class labels in {0, 1, 2, 3}, one per row of X.
# The count is taken from X rather than hard-coded so the labels always line
# up with the feature rows, and the generator is seeded so re-running the
# notebook reproduces the same labels.
# NOTE(review): these are random stand-ins, not real labels -- replace them
# once the true labels are available.
y = np.random.RandomState(0).randint(0, 4, size=len(X))
    
In [ ]:
    
# Keep the 10% of features with the highest chi-squared score against y.
# `percentile` is passed by keyword because current scikit-learn releases
# reject it as a positional argument.
# NOTE(review): chi2 expects non-negative feature values -- confirm that
# holds for every column of X.
fsp = SelectPercentile(chi2, percentile=10)
X_new_10 = fsp.fit_transform(X, y)
X_new_10.shape
    
In [6]:
    
# Display the generated label array.
y
    
    Out[6]:
In [7]:
    
# Normalised histogram of the labels over 11 bins.
# `density=True` replaces the `normed=True` keyword, which has been removed
# from matplotlib and now raises an error.
plt.hist(y, bins=11, density=True)
plt.show()
    
    
In [ ]:
    
    
In [ ]: