In [1]:
## This file provides starter code for extracting features from the xml files and
## for doing some learning.
##
## The basic set-up:
## ----------------
## main() will run code to extract features, learn, and make predictions.
##
## extract_feats() is called by main(), and it will iterate through the
## train/test directories and parse each xml file into an xml.etree.ElementTree,
## which is a standard python object used to represent an xml file in memory.
## (More information about xml.etree.ElementTree objects can be found here:
## http://docs.python.org/2/library/xml.etree.elementtree.html
## and here: http://eli.thegreenplace.net/2012/03/15/processing-xml-in-python-with-elementtree/)
## It will then use a series of "feature-functions" that you will write/modify
## in order to extract dictionaries of features from each ElementTree object.
## Finally, it will produce an N x D sparse design matrix containing the union
## of the features contained in the dictionaries produced by your "feature-functions."
## This matrix can then be plugged into your learning algorithm.
##
## The learning and prediction parts of main() are largely left to you, though
## it does contain code that randomly picks class-specific weights and predicts
## the class with the weights that give the highest score. If your prediction
## algorithm involves class-specific weights, you should, of course, learn
## these class-specific weights in a more intelligent way.
##
## Feature-functions:
## --------------------
## "feature-functions" are functions that take an ElementTree object representing
## an xml file (which contains, among other things, the sequence of system calls a
## piece of potential malware has made), and returns a dictionary mapping feature names to
## their respective numeric values.
## For instance, a simple feature-function might map a system call history to the
## dictionary {'first_call-load_image': 1}. This is a boolean feature indicating
## whether the first system call made by the executable was 'load_image'.
## Real-valued or count-based features can of course also be defined in this way.
## Because this feature-function will be run over ElementTree objects for each
## software execution history instance, we will have the (different)
## feature values of this feature for each history, and these values will make up
## one of the columns in our final design matrix.
## Of course, multiple features can be defined within a single dictionary, and in
## the end all the dictionaries returned by feature functions (for a particular
## training example) will be unioned, so we can collect all the feature values
## associated with that particular instance.
##
## Two example feature-functions, first_last_system_call_feats() and
## system_call_count_feats(), are defined below.
## The first of these functions indicates what the first and last system-calls
## made by an executable are, and the second records the total number of system
## calls made by an executable.
##
## What you need to do:
## --------------------
## 1. Write new feature-functions (or modify the example feature-functions) to
## extract useful features for this prediction task.
## 2. Implement an algorithm to learn from the design matrix produced, and to
## make predictions on unseen data. Naive code for these two steps is provided
## below, and marked by TODOs.
##
## Computational Caveat
## --------------------
## Because the biggest of any of the xml files is only around 35MB, the code below
## will parse an entire xml file and store it in memory, compute features, and
## then get rid of it before parsing the next one. Storing the biggest of the files
## in memory should require at most 200MB or so, which should be no problem for
## reasonably modern laptops. If this is too much, however, you can lower the
## memory requirement by using ElementTree.iterparse(), which does parsing in
## a streaming way. See http://eli.thegreenplace.net/2012/03/15/processing-xml-in-python-with-elementtree/
## for an example.
import os
from collections import Counter
try:
import xml.etree.cElementTree as ET
except ImportError:
import xml.etree.ElementTree as ET
import numpy as np
# used for data manipulation
from scipy import sparse
from scipy import stats
import json
# used to create bag of words feature
import sklearn.feature_extraction
# used for dataframe manipulation
import pandas as pd
import util
In [2]:
def extract_feats_single_file(ffs, direc="train", virus=None):
    '''
    arguments:
      ffs are a list of feature-functions.
      direc is a directory containing xml files (expected to be train or test).
      virus is an optional malware class name; if None the first file in the
        directory is used, otherwise the first file of that class is used.
    returns:
      a list of feature dicts (one per feature-function) for that single file.
    '''
for datafile in os.listdir(direc):
# extract id and true class (if available) from filename
id_str,clazz = datafile.split('.')[:2]
if virus is None or virus == clazz:
break
tree = ET.parse(os.path.join(direc,datafile))
return [ff(tree) for ff in ffs]
In [3]:
def extract_feats(ffs, direc="train", global_feat_dict=None):
"""
arguments:
ffs are a list of feature-functions.
direc is a directory containing xml files (expected to be train or test).
global_feat_dict is a dictionary mapping feature_names to column-numbers; it
should only be provided when extracting features from test data, so that
the columns of the test matrix align correctly.
returns:
a sparse design matrix, a dict mapping features to column-numbers,
a vector of target classes, and a list of system-call-history ids in order
of their rows in the design matrix.
Note: the vector of target classes returned will contain the true indices of the
target classes on the training data, but will contain only -1's on the test
data
"""
fds = [] # list of feature dicts
classes = []
ids = []
for datafile in os.listdir(direc):
# extract id and true class (if available) from filename
id_str,clazz = datafile.split('.')[:2]
ids.append(id_str)
# add target class if this is training data
try:
classes.append(util.malware_classes.index(clazz))
except ValueError:
# we should only fail to find the label in our list of malware classes
# if this is test data, which always has an "X" label
assert clazz == "X"
classes.append(-1)
rowfd = {}
# parse file as an xml document
tree = ET.parse(os.path.join(direc,datafile))
# accumulate features
[rowfd.update(ff(tree)) for ff in ffs]
fds.append(rowfd)
X,feat_dict = make_design_mat(fds,global_feat_dict)
return X, feat_dict, np.array(classes), ids
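# Typical usage (sketch, using the example feature-functions defined further down):
# extract training features first, then pass the returned feat_dict back in as
# global_feat_dict so the test matrix's columns line up with the training matrix.
# X_train, feat_dict, t_train, train_ids = extract_feats(
#     [first_last_system_call_feats, system_call_count_feats], "train")
# X_test, _, _, test_ids = extract_feats(
#     [first_last_system_call_feats, system_call_count_feats], "test",
#     global_feat_dict=feat_dict)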
In [4]:
def make_design_mat(fds, global_feat_dict=None):
"""
arguments:
fds is a list of feature dicts (one for each row).
global_feat_dict is a dictionary mapping feature_names to column-numbers; it
should only be provided when extracting features from test data, so that
the columns of the test matrix align correctly.
returns:
a sparse NxD design matrix, where N == len(fds) and D is the number of
the union of features defined in any of the fds
"""
if global_feat_dict is None:
all_feats = set()
[all_feats.update(fd.keys()) for fd in fds]
feat_dict = dict([(feat, i) for i, feat in enumerate(sorted(all_feats))])
else:
feat_dict = global_feat_dict
cols = []
rows = []
data = []
for i in xrange(len(fds)):
temp_cols = []
temp_data = []
for feat,val in fds[i].iteritems():
try:
# update temp_cols iff update temp_data
temp_cols.append(feat_dict[feat])
temp_data.append(val)
except KeyError as ex:
if global_feat_dict is not None:
pass # new feature in test data; nbd
else:
raise ex
# all fd's features in the same row
k = len(temp_cols)
cols.extend(temp_cols)
data.extend(temp_data)
rows.extend([i]*k)
assert len(cols) == len(rows) and len(rows) == len(data)
X = sparse.csr_matrix((np.array(data),
(np.array(rows), np.array(cols))),
shape=(len(fds), len(feat_dict)))
return X, feat_dict
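# A tiny worked example (feature names invented for illustration): two feature
# dicts become a 2x2 sparse matrix whose columns are ordered by sorted feature name.
_toy_fds = [{'num_system_calls': 3, 'first_call-load_image': 1},
            {'num_system_calls': 7}]
_X_toy, _toy_feat_dict = make_design_mat(_toy_fds)
# _toy_feat_dict == {'first_call-load_image': 0, 'num_system_calls': 1}
# _X_toy.toarray() == [[1, 3], [0, 7]]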
In [5]:
## Here are two example feature-functions. They each take an xml.etree.ElementTree object
# (i.e., the result of parsing an xml file) and return a dictionary mapping
# feature-names to numeric values.
def first_last_system_call_feats(tree):
"""
arguments:
tree is an xml.etree.ElementTree object
returns:
a dictionary mapping 'first_call-x' to 1 if x was the first system call
made, and 'last_call-y' to 1 if y was the last system call made.
(in other words, it returns a dictionary indicating what the first and
last system calls made by an executable were.)
"""
c = Counter()
in_all_section = False
first = True # is this the first system call
last_call = None # keep track of last call we've seen
for el in tree.iter():
# ignore everything outside the "all_section" element
if el.tag == "all_section" and not in_all_section:
in_all_section = True
elif el.tag == "all_section" and in_all_section:
in_all_section = False
elif in_all_section:
if first:
c["first_call-"+el.tag] = 1
first = False
last_call = el.tag # update last call seen
# finally, mark last call seen
c["last_call-"+last_call] = 1
return c
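# A quick sanity check on a tiny hand-built tree (the tags below are invented for
# illustration, not taken from the real trace format):
_toy_xml = ("<processes><process><thread><all_section>"
            "<load_image/><open_key/><vm_protect/>"
            "</all_section></thread></process></processes>")
_toy_tree = ET.ElementTree(ET.fromstring(_toy_xml))
first_last_system_call_feats(_toy_tree)
# -> Counter({'first_call-load_image': 1, 'last_call-vm_protect': 1})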
In [6]:
def system_call_count_feats(tree):
"""
arguments:
tree is an xml.etree.ElementTree object
returns:
a dictionary mapping 'num_system_calls' to the number of system_calls
made by an executable (summed over all processes)
"""
c = Counter()
in_all_section = False
for el in tree.iter():
# ignore everything outside the "all_section" element
if el.tag == "all_section" and not in_all_section:
in_all_section = True
elif el.tag == "all_section" and in_all_section:
in_all_section = False
elif in_all_section:
c['num_system_calls'] += 1
return c
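# Using the toy tree built above: three elements sit inside the all_section,
# so we expect Counter({'num_system_calls': 3})
system_call_count_feats(_toy_tree)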
In [7]:
def tag_counts_feats(tree):
"""
arguments:
tree is an xml.etree.ElementTree object
returns:
a dictionary mapping 'tag' to the number of times 'tag' appears in the .xml file
"""
c = Counter()
for el in tree.iter():
# count the tags as you see them
c[el.tag + "_count"] += 1
return c
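# Again on the toy tree above: every tag appears exactly once, so each '<tag>_count'
# feature equals 1 (wrapper tags like 'process_count' are counted too)
tag_counts_feats(_toy_tree)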
In [172]:
def two_sequences_feat(tree):
    '''
    Count tag sequences (bigrams) of length 2 read from a tree.
    '''
    c = Counter()
    previous_el_tag = ''
    first_el = True
    for el in tree.iter():
        if not first_el:
            c[previous_el_tag + '+' + el.tag] += 1
        first_el = False
        previous_el_tag = el.tag
    return c
In [207]:
def lcs_feat(tree):
    '''
    Return the raw list of tags in the tree (material for longest-common-subsequence
    style features; note this returns a list, not a feature dict).
    '''
    tags = []
    for el in tree.iter():
        tags.append(el.tag)
    return tags
In [221]:
def get_tag_list_of_class(clazz):
    '''
    Return a list of tag lists, one for each training file whose class is clazz.
    '''
    list_of_lists = []
    for datafile in os.listdir("train"):
        id_str, file_clazz = datafile.split('.')[:2]
        if file_clazz == clazz:
            tree = ET.parse(os.path.join("train", datafile))
            tags = []
            for el in tree.iter():
                tags.append(el.tag)
            list_of_lists.append(tags)
    return list_of_lists
In [205]:
# scratch work for the lcs_feat idea above: how long a common prefix do the
# tag sequences within a single class share?
tag_lists = get_tag_list_of_class(util.malware_classes[0])
len(os.path.commonprefix(tag_lists))
Out[205]:
In [8]:
extract_feats_single_file([tag_counts_feats])
Out[8]:
In [133]:
def bag_of_words_feat(tree, frequency=False, count=True, subset=10):
    """
    arguments:
        tree is an xml.etree.ElementTree object
        frequency = also include frequency features if True
        count = include raw count features if True
        subset = keep only the :subset: most frequent and least frequent words
    returns:
        a dictionary mapping words to their counts (and/or frequencies) in the file
    """
    # we want to extract words using regular expressions
    import re
    root = tree.getroot()
    text = ET.tostring(root)
    split_text = re.findall(r"[\w']+", text)
    # count words; CountVectorizer's vocabulary_ maps words to column indices (not
    # counts), so we sum its count matrix to get the per-word totals instead
    cv = sklearn.feature_extraction.text.CountVectorizer()
    counts = cv.fit_transform(split_text).sum(axis=0).A1
    vocab = Counter(dict(zip(cv.get_feature_names(), counts))) if count else Counter()
    # subset accordingly (most common and least common words)
    most = vocab.most_common(subset) if subset > 0 else vocab.items()
    least = vocab.most_common()[:-subset - 1:-1] if subset > 0 else vocab.items()
    vocabMost = {key + "_most": value for key, value in most}
    vocabLeast = {key + "_least": value for key, value in least}
    # totals for the frequency features (floats, so the division below isn't truncated)
    totalMFreq = float(sum(vocabMost.values())) or 1.0
    totalLFreq = float(sum(vocabLeast.values())) or 1.0
    # frequency dictionaries
    freq_dict = {key + "_freq": value / totalMFreq for key, value in vocabMost.items()} if frequency else {}
    least_freq_dict = {key + "_freq": value / totalLFreq for key, value in vocabLeast.items()} if frequency else {}
    return dict(vocabMost.items() + vocabLeast.items() + freq_dict.items() + least_freq_dict.items())
In [136]:
r = extract_feats_single_file([bag_of_words_feat])
In [8]:
def longest_sequence_feat(tree):
    '''
    For each system call, record the length of the longest run of consecutive
    occurrences of that call.
    '''
    currentMax = Counter()
    actualMax = Counter()
    prev_tag = None
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element because it's not a system call
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            # we're in a run of the same call, so keep counting for this tag
            if el.tag == prev_tag:
                currentMax[el.tag] += 1
            # we've encountered a new tag, so close out the run for prev_tag
            else:
                # before we reset, check whether the run for prev_tag is a new maximum
                if prev_tag is not None and currentMax[prev_tag] > actualMax[prev_tag]:
                    actualMax[prev_tag] = currentMax[prev_tag]
                currentMax[prev_tag] = 0
                prev_tag = el.tag
                currentMax[el.tag] = 1
    # close out the final run
    if prev_tag is not None and currentMax[prev_tag] > actualMax[prev_tag]:
        actualMax[prev_tag] = currentMax[prev_tag]
    return actualMax
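# Quick check on a hand-built tree (tags invented for illustration): vm_protect
# occurs in a run of 3 and open_key only in runs of 1
_run_xml = ("<process><all_section>"
            "<open_key/><vm_protect/><vm_protect/><vm_protect/><open_key/>"
            "</all_section></process>")
longest_sequence_feat(ET.ElementTree(ET.fromstring(_run_xml)))
# -> Counter({'vm_protect': 3, 'open_key': 1})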
In [12]:
extract_feats_single_file([longest_sequence_feat])
Out[12]:
In [13]:
def extract_url_connections(tree):
    '''
    Count attribute key/value pairs of system calls that look network-related,
    i.e. whose key or value contains one of the keywords below.
    '''
    keywords = ["server", "url", "open"]
    c = Counter()
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element because it's not a system call
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            for key, value in el.attrib.iteritems():
                if any(keyword in key or keyword in value for keyword in keywords):
                    c[key + "-" + value] += 1
    return c
In [14]:
extract_feats_single_file([extract_url_connections])
Out[14]:
In [15]:
def two_sequences_feat(tree):
    '''
    Count tag sequences (bigrams) of length 2 read from a tree.
    '''
    c = Counter()
    previous_el_tag = ''
    first_el = True
    for el in tree.iter():
        if not first_el:
            c[previous_el_tag + '+' + el.tag] += 1
        first_el = False
        previous_el_tag = el.tag
    return c
# generalization of the above
def sequence_feat(tree, n):
    '''
    Counts tag sequences (n-grams) of length n read from a tree.
    '''
    c = Counter()
    previous_k = []
    for el in tree.iter():
        previous_k.append(el.tag)
        if len(previous_k) > n:
            previous_k.pop(0)
        if len(previous_k) == n:
            c["+".join(previous_k)] += 1
    return c
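# Bigram counts over a tiny hand-built tree (tags invented for illustration):
_seq_xml = "<process><all_section><open_key/><open_file/><open_key/></all_section></process>"
sequence_feat(ET.ElementTree(ET.fromstring(_seq_xml)), 2)
# -> Counter({'process+all_section': 1, 'all_section+open_key': 1,
#             'open_key+open_file': 1, 'open_file+open_key': 1})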
In [16]:
r = extract_feats_single_file([lambda x: sequence_feat(x,4)])
In [17]:
def test_feat_extraction(feature_function, setdir = "train"):
    '''
    arguments:
      feature_function: a feature-function to run over the files in setdir
    returns:
      the extracted design matrix
    '''
    print "testing feature extraction"
    X_train, global_feat_dict, t_train, train_ids = extract_feats([feature_function], setdir)
    return X_train
In [18]:
def save_and_load(featurefile, train_dir, ffs = None, global_feat_dict=None):
# load features from textfile if possible (so we don't waste time recalculating this!)
if ffs is None:
print "loading features from file: {}".format(featurefile)
X_train = util.load_sparse_csr(featurefile + "_mat.npz")
global_feat_dict = json.load(open(featurefile + "_dict.save")) if global_feat_dict is None else global_feat_dict
t_train = np.load(featurefile + "_t_train.npy")
train_ids = np.load(featurefile + "_train_ids.npy")
print "loaded features"
return X_train, global_feat_dict, t_train, train_ids
else:
print "generating feature set and saving to file: {}".format(featurefile)
X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir, global_feat_dict)
json.dump(global_feat_dict, open(featurefile + "_dict.save", "w"))
np.save(featurefile + "_train_ids", train_ids)
np.save(featurefile + "_t_train", t_train)
util.save_sparse_csr(featurefile + "_mat", X_train)
print "generated and saved features"
return X_train, global_feat_dict, t_train, train_ids
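# Sketch of the intended caching pattern ("my_features" is a hypothetical prefix):
# the first call with ffs set computes features and writes the *_mat.npz, *_dict.save,
# *_t_train.npy and *_train_ids.npy files; later calls with ffs=None just reload them.
# X_train, feat_dict, t_train, train_ids = save_and_load("my_features", "train",
#                                                         ffs=[system_call_count_feats])
# X_train, feat_dict, t_train, train_ids = save_and_load("my_features", "train")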
In [19]:
def toPandasDataFrame(Xarray, feat_dict, classes = None):
    '''
    arguments:
      Xarray: a sparse scipy matrix of features
      feat_dict: a dictionary mapping feature names to column numbers
      classes: a numpy array with the virus type for each row of Xarray;
        if None, this information is simply not included in the frame
    returns:
      a pandas dataframe with all features and a final column 'class'
      specifying the virus TYPE as discussed in the spec
    '''
    # order the column names by their column number so the labels line up with the data
    columns = sorted(feat_dict, key=feat_dict.get)
    data = pd.DataFrame(data=Xarray.toarray(), columns=columns)
    if classes is not None:
        data['class'] = pd.Series(classes)
    return data
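# Tiny illustration with made-up feature names: a 2x2 sparse matrix plus class labels
_toy_X = sparse.csr_matrix(np.array([[1, 3], [0, 7]]))
_toy_cols = {'first_call-load_image': 0, 'num_system_calls': 1}
toPandasDataFrame(_toy_X, _toy_cols, classes=np.array([2, 5]))
# -> a 2-row DataFrame with those two feature columns and a final 'class' column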
In [20]:
TYPES = 15 # this includes None
def calculateNormalParams(data, category):
subset = data.loc[data['class'] == category, :'class']
subset.drop('class', axis=1, inplace=True)
return (subset.cov(), subset.mean())
def generative_model(ffs = None, featurefile="generative_features", testfeaturefile="test_generative_features", outputfile = "generative_predictions.csv", train_dir="train", test_dir="test"):
# do a quick load of feature data
X_train, global_feat_dict, t_train, train_ids = save_and_load(featurefile, train_dir, ffs)
    # now we need to train our model using generative Bayesian statistics
pdFrame = toPandasDataFrame(X_train, global_feat_dict, t_train)
# subset by class and calculate the covariance matrices and the means
normal_params = [calculateNormalParams(pdFrame, i) for i in xrange(TYPES)]
# calculate the normal distributions
normals = [stats.multivariate_normal(mean,cov, allow_singular=True) for cov, mean in normal_params]
# here's our trained classifier model
model = lambda datum: np.argmax([normal.pdf(datum) for normal in normals])
# extract features from test data
print "extracting test features..."
X_test,_, t_ignore, test_ids = save_and_load(testfeaturefile, test_dir, ffs, global_feat_dict=global_feat_dict)
print "done extracting test features"
print
testData = toPandasDataFrame(X_test, global_feat_dict)
    # make predictions on the test data and write them out
print "making predictions..."
preds = [model(list(el)) for index, el in testData.iterrows()]
print "done making predictions"
print
print "writing predictions..."
util.write_predictions(preds, test_ids, outputfile)
print "done!"
return preds
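# The decision rule used above, in isolation (numbers made up): fit one Gaussian per
# class and predict the class whose density at the point is highest. Note that this
# ignores class priors, i.e. it implicitly assumes all classes are equally likely.
_toy_normals = [stats.multivariate_normal([0., 0.], np.eye(2)),
                stats.multivariate_normal([3., 3.], np.eye(2))]
np.argmax([n.pdf([2.5, 2.9]) for n in _toy_normals])
# -> 1 (the point is much closer to the second class's mean)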
In [24]:
genffs = [system_call_count_feats, tag_counts_feats, first_last_system_call_feats]
In [23]:
generative_model(featurefile="expert_features2", testfeaturefile="expert_features2", outputfile="expert_generative2")
In [21]:
# now we use this section to train a random forest model!
from sklearn.ensemble import RandomForestClassifier
# returns all powers of 2 in interval [m,n]
def powsOf2(m,n):
i = 1
res = []
while i <= n:
if i >= m:
res.append(i)
i *= 2
return res
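# e.g. powsOf2(4, 64) == [4, 8, 16, 32, 64]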
In [22]:
def extractClass(data, column = 'class'):
'''
Extracts the class column from a data frame and returns (new_frame, class)
'''
y = data[column]
data.drop(column, axis=1, inplace=True)
return (data, y)
def splitForCrossValidation(data, ratio = 3):
    '''
    Splits the input dataframe so that 1/ratio of the rows are training data and
    the remaining (1 - 1/ratio) are validation data.
    Returns a 4-tuple:
      (training feature frame, training type vector, validation feature frame, validation type vector)
    '''
split_point = int(1.0/ratio * len(data))
training, training_y = extractClass(data.loc[:split_point])
validation, validation_y = extractClass(data.loc[split_point:])
return (training, training_y, validation, validation_y)
def splitForCrossValidation2(X, y, ratio = 3):
    '''
    Same split as above, but works directly on a sparse matrix X and a label vector y,
    returning dense arrays for the two feature blocks.
    '''
    split_point = int(1.0/ratio * X.shape[0])
    training, training_y = X[:split_point], y[:split_point]
    validation, validation_y = X[split_point:], y[split_point:]
    return (training.toarray(), training_y, validation.toarray(), validation_y)
def random_forest_optimizer(estimators, ffs = None, featurefile="forest_features", train_dir = "train", test_dir = "test"):
    # do a quick load of feature data
    X_train, global_feat_dict, t_train, train_ids = save_and_load(featurefile, train_dir, ffs)
    # split into a training set and a validation set; splitForCrossValidation2 works
    # on the sparse matrix directly, so we avoid building a huge dense DataFrame here
    training, training_y, validation, validation_y = splitForCrossValidation2(X_train, t_train)
    # getAccuracy returns the validation accuracy for a single number of trees n
    def getAccuracy(n):
        # train model
        model = RandomForestClassifier(n_estimators = n, max_features=64)
        model.fit(training, training_y)
        # create predictions
        preds = model.predict(validation)
        return float(np.mean(preds == validation_y))
    # train and evaluate on the split for each candidate number of estimators
    results = [(estimator, getAccuracy(estimator)) for estimator in estimators]
    return results
In [28]:
random_forest_optimizer([512,1024,2048,4096], featurefile="expert_features3")
Out[28]:
In [27]:
bestFeatureFunctions = [system_call_count_feats, tag_counts_feats, first_last_system_call_feats, longest_sequence_feat,
                        extract_url_connections, lambda x: sequence_feat(x, 1),
                        lambda x: sequence_feat(x, 2), lambda x: sequence_feat(x, 3), lambda x: sequence_feat(x, 4),
                        lambda x: sequence_feat(x, 5), lambda x: sequence_feat(x, 6)]
X_train, global_feat_dict, t_train, train_ids = save_and_load("expert_features3", "train", ffs=bestFeatureFunctions)
#pdFrame = toPandasDataFrame(X_train, global_feat_dict, t_train)
X_train_test, _, t_test, test_ids = save_and_load("expert_test_features3", "test", ffs=bestFeatureFunctions, global_feat_dict=global_feat_dict)
In [23]:
import math
# rough guide for the forest's max_features setting: the square root of the feature count
math.sqrt(len(global_feat_dict))
In [24]:
pdFrame = toPandasDataFrame(X_train, global_feat_dict)
del(pdFrame)
In [ ]:
random_forest_optimizer([128,512,1024,2048,4096], featurefile="expert_features2")
In [119]:
%matplotlib
import matplotlib.pyplot as plt
# allResults: (n_estimators, validation accuracy) pairs, e.g. from random_forest_optimizer
x = [a for (a,_) in allResults]
y = [b for (_,b) in allResults]
plt.figure(1)
plt.plot(x,y)
In [158]:
datafile = os.listdir("train")[0]
id_str,clazz = datafile.split('.')[:2]
eg_tree = ET.parse(os.path.join("train", datafile))
In [159]:
tag_list = []
for el in eg_tree.iter():
tag_list.append(el.tag)
In [160]:
eg_root = eg_tree.getroot()
In [174]:
seqffs = [system_call_count_feats, tag_counts_feats, first_last_system_call_feats, two_sequences_feat]
In [197]:
random_forest_optimizer(estimators=powsOf2(1,1040), ffs = None, featurefile="forest_features", train_dir = "train", test_dir = "test")
Out[197]:
In [201]:
def random_forest_model(ffs = None, featurefile="forest_features", testfeaturefile="test_forest_features", outputfile = "forest_predictions.csv", train_dir = "train", test_dir = "test"):
# do a quick load of feature data
X_train, global_feat_dict, t_train, train_ids = save_and_load(featurefile, train_dir, ffs)
pdFrame = toPandasDataFrame(X_train, global_feat_dict, t_train)
# generate random forest model
model = RandomForestClassifier(n_estimators = 1024)
# build a forest of trees from training set (X,y) where X = feature set, Y = target values
y = pdFrame['class']
pdFrame.drop('class', axis=1, inplace=True)
model.fit(pdFrame, y)
# extract features from test data
print "extracting test features..."
X_test,_, t_ignore, test_ids = save_and_load(testfeaturefile, test_dir, ffs, global_feat_dict=global_feat_dict)
print "done extracting test features"
print
testData = toPandasDataFrame(X_test, global_feat_dict)
print(len(testData))
# make predictions on test data and write them out
print "making predictions..."
preds = model.predict(testData)
print "done making predictions"
print
print "writing predictions..."
util.write_predictions(preds, test_ids, outputfile)
print "done!"
In [202]:
random_forest_model()
In [26]:
def SVM_model(ffs = None, featurefile="forest_features", testfeaturefile="test_forest_features", outputfile = "SVM_predictions.csv", train_dir = "train", test_dir = "test"):
# do a quick load of feature data
X_train, global_feat_dict, t_train, train_ids = save_and_load(featurefile, train_dir, ffs)
# pdFrame = toPandasDataFrame(X_train, global_feat_dict)
    # generate the model; NOTE: despite the function name this currently fits a
    # random forest on the dense feature matrix rather than an SVM
model = RandomForestClassifier(n_estimators = 2700, max_features=300)
# build a forest of trees from training set (X,y) where X = feature set, Y = target values
y = t_train
model.fit(X_train.toarray(), y)
del(X_train)
del(y)
# extract features from test data
print "extracting test features..."
X_test,_, t_ignore, test_ids = save_and_load(testfeaturefile, test_dir, ffs, global_feat_dict=global_feat_dict)
print "done extracting test features"
print
# testData = toPandasDataFrame(X_test, global_feat_dict)
print(X_test.shape[0])
# make predictions on test data and write them out
print "making predictions..."
preds = model.predict(X_test.toarray())
print "done making predictions"
print
del(X_test)
print("Predictions" + str(len(preds)))
print "writing predictions..."
util.write_predictions(preds, test_ids, outputfile)
print "done!"
return preds
In [ ]:
# forestffs = [system_call_count_feats, tag_counts_feats, first_last_system_call_feats]
# run with ffs = None and featurefile="generative_features" to use same
# features as generative model (if already generated)
# otherwise, run with ffs as a list of feature functions and specify the output
# name to store the feature files by featurefile param
preds = random_forest_model(featurefile="expert_features3", testfeaturefile="expert_test_features3", outputfile="expert3_features.csv")
In [73]:
from sklearn import linear_model
from sklearn.svm import SVC, LinearSVC
def SVM_optimizer(ffs = None,featurefile="forest_features", train_dir = "train", test_dir = "test"):
# do a quick load of feature data
X_train, global_feat_dict, t_train, train_ids = save_and_load(featurefile, train_dir, ffs)
pdFrame = toPandasDataFrame(X_train, global_feat_dict, t_train)
# split into training set and validation set (training = features dataFrame, validation = features + type dataFrame)
training, training_y, validation, validation_y = splitForCrossValidation(pdFrame)
    # getAccuracy returns the validation accuracy of a single linear SVM fit
    def getAccuracy():
        # train model ('auto' reweights classes inversely to their frequencies)
        model = LinearSVC(C=0.01, class_weight='auto')
        model.fit(training, training_y)
        # create predictions
        preds = model.predict(validation)
        return float(np.mean(preds == validation_y))
    # train once and report the validation accuracy
    return getAccuracy()
Out[73]:
In [59]:
from random import shuffle
def split_data(train_dir):
    '''
    Splits the file list of train_dir into a training set and a validation set,
    where the validation set is 1/5 of the files and the training set is the rest.
    Returns: (train_data, validation_data)
    '''
    train_data = os.listdir(train_dir)
    shuffle(train_data)
    validation_size = len(train_data) / 5
    validation_data = train_data[0:validation_size]
    train_data = train_data[validation_size:]
    return (train_data, validation_data)
In [148]:
forestffs = genffs
# run with ffs = None and featurefile="generative_features" to use same
# features as generative model (if already generated)
# otherwise, run with ffs as a list of feature functions and specify the output
# name to store the feature files by featurefile param
# this uses the features stored in forest_features files
random_forest_model()
In [140]:
# we're going to generate the modified bag_of_words feature dataset and train it on random forests
forestffsWithBag = forestffs + [bag_of_words_feat]
random_forest_model(ffs=forestffsWithBag, featurefile="bag_forest_features", testfeaturefile="bag_test_forest_features", outputfile="bag_forest_predictions.csv")