Feature Selection

Use chi-squared tests to determine the best features for classifying malware samples.



In [1]:

    
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score, KFold, train_test_split
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
import seaborn as sns
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [ ]:



In [ ]:

3. Graphs



In [17]:

    
# Load the APT sample set: the entropy/file-size feature table and its label table.
train_data = pd.read_csv('data/sorted-entropy-features-apt.csv')
train_labels = pd.read_csv('data/sorted-train-labels-apt.csv')
# Keep every feature column except the first one
# (presumably the file-name key -- verify against the CSV).
X = train_data.iloc[:, 1:]
# Pick the target by column name instead of position 4, so a reordered or
# extended label file fails loudly instead of silently selecting another column.
y = train_labels['family_label']



In [4]:

    
# Preview the first five rows of the label table to check its columns.
train_labels.head(n=5)









    Out[4]:






  
    
      
      file_name
      malware_type_x
      sample_label
      family_name
      family_label
    
  
  
    
      0
      001dd76872d80801692ff942308c64e6
      Trojan:Win32/Sluegot.D
      5992
      Win32.Trojan.Sluegot
      2057
    
    
      1
      002325a0a67fded0381b5648d7fe9b8e
      Trojan:Win32/Sluegot.C
      5993
      Win32.Trojan.Sluegot
      2057
    
    
      2
      00dbb9e1c09dbdafb360f3163ba5a3de
      Backdoor:Win32/Stradatu
      6005
      Win32.Backdoor.Stradatu
      2064
    
    
      3
      0149b7bd7218aab4e257d28469fddb0d
      Trojan:Win32/Sluegot.A
      6017
      Win32.Trojan.Sluegot
      2057
    
    
      4
      01e0dc079d4e33d8edd050c4900818da
      Backdoor:Win32/Stradatu
      6005
      Win32.Backdoor.Stradatu
      2064



In [18]:

    
# Preview the first five target labels.
y.head(n=5)









    Out[18]:





0    2057
1    2057
2    2064
3    2057
4    2064
Name: family_label, dtype: int64



In [19]:

    
# Scatter plot of entropy against file size for the currently loaded samples,
# one point per sample, coloured by family label.
plt.figure(figsize=(15,15))
plt.xlabel("Entropy")
plt.ylabel("File Size")
xa = np.array(X['entropy'])
xb = np.array(X['file_size'])
ya = np.array(y)
# The integer labels are mapped to colours by the 'brg' colormap directly,
# so no separate hand-scaled colour list is needed.
plt.scatter(xa,xb,c=ya,cmap='brg')









    Out[19]:





<matplotlib.collections.PathCollection at 0x7f2b8a0a70d0>



In [20]:

    
# Load the vs251 sample set: the entropy/file-size feature table and its label table.
train_data = pd.read_csv('data/sorted-entropy-features-vs251.csv')
train_labels = pd.read_csv('data/sorted-train-labels-vs251.csv')
# Keep every feature column except the first one
# (presumably the file-name key -- verify against the CSV).
X = train_data.iloc[:, 1:]
# Pick the target by column name instead of position 4, so a reordered or
# extended label file fails loudly instead of silently selecting another column.
y = train_labels['family_label']
y.head()









    Out[20]:





0     4
1     0
2     6
3     9
4    10
Name: family_label, dtype: int64



In [21]:

    
# Pull the two plotted features and the class labels out as plain NumPy arrays.
xa, xb, ya = (np.array(col) for col in (X['entropy'], X['file_size'], y))

# Large square canvas; each sample is one point, coloured by its family label
# through the 'brg' colormap.
plt.figure(figsize=(15, 15))
plt.xlabel("Entropy")
plt.ylabel("File Size")
plt.scatter(xa, xb, c=ya, cmap='brg')









    Out[21]:





<matplotlib.collections.PathCollection at 0x7f2b83f86790>



In [22]:

    
# Load the vs252 sample set: the entropy/file-size feature table and its label table.
train_data = pd.read_csv('data/sorted-entropy-features-vs252.csv')
train_labels = pd.read_csv('data/sorted-train-labels-vs252.csv')
# Keep every feature column except the first one
# (presumably the file-name key -- verify against the CSV).
X = train_data.iloc[:, 1:]
# Pick the target by column name instead of position 4, so a reordered or
# extended label file fails loudly instead of silently selecting another column.
y = train_labels['family_label']
y.head()









    Out[22]:





0    1
1    2
2    3
3    5
4    7
Name: family_label, dtype: int64



In [23]:

    
# Pull the two plotted features and the class labels out as plain NumPy arrays.
xa, xb, ya = (np.array(col) for col in (X['entropy'], X['file_size'], y))

# Large square canvas; each sample is one point, coloured by its family label
# through the 'brg' colormap.
plt.figure(figsize=(15, 15))
plt.xlabel("Entropy")
plt.ylabel("File Size")
plt.scatter(xa, xb, c=ya, cmap='brg')









    Out[23]:





<matplotlib.collections.PathCollection at 0x7f2b89ff3250>



In [24]:

    
# Load the vs263 sample set: the entropy/file-size feature table and its label table.
train_data = pd.read_csv('data/sorted-entropy-features-vs263.csv')
train_labels = pd.read_csv('data/sorted-train-labels-vs263.csv')
# Keep every feature column except the first one
# (presumably the file-name key -- verify against the CSV).
X = train_data.iloc[:, 1:]
# Pick the target by column name instead of position 4, so a reordered or
# extended label file fails loudly instead of silently selecting another column.
y = train_labels['family_label']
y.head()









    Out[24]:





0    23
1    29
2    93
3    29
4     0
Name: family_label, dtype: int64



In [25]:

    
# Pull the two plotted features and the class labels out as plain NumPy arrays.
xa, xb, ya = (np.array(col) for col in (X['entropy'], X['file_size'], y))

# Large square canvas; each sample is one point, coloured by its family label
# through the 'brg' colormap.
plt.figure(figsize=(15, 15))
plt.xlabel("Entropy")
plt.ylabel("File Size")
plt.scatter(xa, xb, c=ya, cmap='brg')









    Out[25]:





<matplotlib.collections.PathCollection at 0x7f2b83869150>



In [26]:

    
# Load the vs264 sample set: the entropy/file-size feature table and its label table.
train_data = pd.read_csv('data/sorted-entropy-features-vs264.csv')
train_labels = pd.read_csv('data/sorted-train-labels-vs264.csv')
# Keep every feature column except the first one
# (presumably the file-name key -- verify against the CSV).
X = train_data.iloc[:, 1:]
# Pick the target by column name instead of position 4, so a reordered or
# extended label file fails loudly instead of silently selecting another column.
y = train_labels['family_label']
y.head()









    Out[26]:





0       4
1    2055
2      28
3       4
4      28
Name: family_label, dtype: int64



In [27]:

    
# Pull the two plotted features and the class labels out as plain NumPy arrays.
xa, xb, ya = (np.array(col) for col in (X['entropy'], X['file_size'], y))

# Large square canvas; each sample is one point, coloured by its family label
# through the 'brg' colormap.
plt.figure(figsize=(15, 15))
plt.xlabel("Entropy")
plt.ylabel("File Size")
plt.scatter(xa, xb, c=ya, cmap='brg')









    Out[27]:





<matplotlib.collections.PathCollection at 0x7f2b837a1b10>



In [29]:

    
# Count how many samples carry each family label. The counts describe whichever
# label set `y` currently holds (the vs264 labels when the cells run top to bottom).
vcounts = y.value_counts()



In [33]:

    
# Family labels present in the counted label set (the index of the counts).
family_ids = vcounts.index

# One point per family label: x = label value, y = number of samples with that
# label, coloured by the label value through the 'brg' colormap.
plt.figure(figsize=(15, 15))
plt.xlabel("Family Label")
plt.ylabel("Label Count")
plt.scatter(family_ids, vcounts, c=family_ids, cmap='brg')









    Out[33]:





<matplotlib.collections.PathCollection at 0x7f2b836e3090>



In [ ]:



In [ ]:



In [ ]:



In [1]:

    
# NOTE(review): urllib2 exists only on Python 2, and these imports sit outside the
# notebook's main import cell -- the scraping cells below depend on this cell having run.
import urllib2
from BeautifulSoup import BeautifulSoup

# Open the URL and hand the response object to BeautifulSoup for parsing.
page = urllib2.urlopen('http://www.google.com/')
soup = BeautifulSoup(page)



In [13]:

    
# Re-point `page` and `soup` at a different site, then collect the <img> tags
# found under the document body.
page = urllib2.urlopen('http://www.prehnite.com.au/')
soup = BeautifulSoup(page)
x = soup.body.findAll('img')



In [17]:

    
# Collect the <img> tags under the body of the current `soup`
# (the same lookup the fetch cell already performs).
x = soup.body.findAll('img')



In [20]:

    
# Walk the response object line by line and echo each line containing an <img tag.
# NOTE(review): `page` was already handed to BeautifulSoup when it was created, so
# the stream is presumably exhausted by the time this runs -- confirm.
for line in page:
    if "<img" not in line:
        continue
    print("img tag -> {:s}".format(line))



In [24]:

    
# Read what is left of the response body in one call. NOTE(review): despite the
# name, read() presumably yields one string rather than a list of lines -- confirm.
htmllines = page.read()



In [43]:

    
import urllib2
import re

# Download an HTML page and save every .jpg referenced on an "img" line into data/.
response = urllib2.urlopen('http://adskeeper.co.uk/ad10300.html?site_id=1187204')
# For debugging, the response object also offers geturl() (the real URL), .code
# (the status code) and info() (the headers, e.g. info()['date'], info()['server']).

# Read the whole body, releasing the connection even if read() fails.
try:
    html = response.read()
finally:
    response.close()
lines = html.split('\n')

# Raw strings keep the regex escapes (\. and \w) away from Python's own
# string-escape processing; the compiled patterns are unchanged.
purl = re.compile(r'.+src="(.+\.jpg)".*')   # group 1: value of the src attribute
pjpg = re.compile(r'.+/(\w+\.jpg)".*')      # group 1: file name following a '/'

for line in lines:
    if "img" not in line:
        continue
    print(line.rstrip())
    jpgurl = purl.match(line)
    jpgname = pjpg.match(line)

    if jpgurl is None:
        # No src URL on this line: there is nothing to fetch. Downloading anyway
        # would reuse image_url from an earlier line or raise NameError.
        continue
    image_url = jpgurl.group(1)
    print(image_url)

    if jpgname is None:
        # A URL without a recognisable file name cannot be saved.
        continue
    image_name = jpgname.group(1)
    print(image_name)

    jpgresponse = urllib2.urlopen(image_url)
    try:
        jpgdata = jpgresponse.read()
    finally:
        jpgresponse.close()
    # The with-block closes the output file even if the write fails.
    with open('data/' + image_name, 'wb') as of:
        of.write(jpgdata)









    



        .logo img{
        .logo img:focus{



In [42]:

    
# Save every .jpg referenced on an "img" line of `lines` into data/, resolving the
# src value against the prehnite.com.au site root.
# NOTE(review): `lines`, `re` and `urllib2` come from other cells -- this cell only
# works after those have run, and `lines` must hold that site's HTML.

# Raw strings keep the regex escapes (\. and \w) away from Python's own
# string-escape processing; the compiled patterns are unchanged.
purl = re.compile(r'.+src="(.+\.jpg)".*')   # group 1: value of the src attribute
pjpg = re.compile(r'.+/(\w+\.jpg)".*')      # group 1: file name following a '/'

for line in lines:
    if "img" not in line:
        continue
    print(line.rstrip())
    jpgurl = purl.match(line)
    jpgname = pjpg.match(line)

    if jpgurl is None:
        # No src URL on this line: there is nothing to fetch. Downloading anyway
        # would reuse image_url from an earlier line or raise NameError.
        continue
    image_url = jpgurl.group(1)
    print(image_url)

    if jpgname is None:
        # A URL without a recognisable file name cannot be saved.
        continue
    image_name = jpgname.group(1)
    print(image_name)

    jpgresponse = urllib2.urlopen("http://www.prehnite.com.au/"+image_url)
    try:
        jpgdata = jpgresponse.read()
    finally:
        jpgresponse.close()
    # The with-block closes the output file even if the write fails.
    with open('data/' + image_name, 'wb') as of:
        of.write(jpgdata)









    



                                <p><br /><br /><img class=specimendisplay src="images/pf00064.JPG"><br /><br /></p>
images/pf00064.JPG
pf00064.JPG



In [ ]:



In [ ]:

    
# Dump the raw HTML fetched by the scraping cell.
print(html)



In [ ]:

    
# Interactive exploration aid: show the built-in help for the `soup` object.
# NOTE(review): leftover scratch cell; it is not needed for any result in the notebook.
help(soup)

	file_name	malware_type_x	sample_label	family_name	family_label
0	001dd76872d80801692ff942308c64e6	Trojan:Win32/Sluegot.D	5992	Win32.Trojan.Sluegot	2057
1	002325a0a67fded0381b5648d7fe9b8e	Trojan:Win32/Sluegot.C	5993	Win32.Trojan.Sluegot	2057
2	00dbb9e1c09dbdafb360f3163ba5a3de	Backdoor:Win32/Stradatu	6005	Win32.Backdoor.Stradatu	2064
3	0149b7bd7218aab4e257d28469fddb0d	Trojan:Win32/Sluegot.A	6017	Win32.Trojan.Sluegot	2057
4	01e0dc079d4e33d8edd050c4900818da	Backdoor:Win32/Stradatu	6005	Win32.Backdoor.Stradatu	2064