This answer section is for those that have successfully completed the parser code lab and are looking for one example answer.
Just to re-iterate, this is only one example answer, this can be done in several different ways, some might even be considerably better than the supplied one. With that caveat... please read on:
In [ ]:
#!/usr/bin/python
#
# Copyright 2014 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parser for an example simple log file, used as a code lab example."""
import logging
# A library that contains the EventObject and few convenience EventObjects.
from plaso.events import time_events
# This library contains formatters and timestamp information, such as timestamp
# description "ontology" or shorthands.
from plaso.lib import eventdata
# A library that contains all timestamp manipulations.
from plaso.lib import timelib
# Import the parser manager.
from plaso.parsers import manager
# A library that contains assistants for text parsing.
from plaso.parsers import text_parser
Start by saving the example log file so we can start working with it.
In [ ]:
import tempfile
mystery_path = u''
# The content of the mystery log file.
text_to_enter = r"""date,user,request,result,action,extra
03/04/2013 21:20:32.3411,foobar,get all the data,no data here,GET,some weird stuff happening here
03/12/2013 13:41:52.1231,foobar,get all the data,"data loading,stuff happening, here it is",GET,1235 bytes transferred
06/05/2013 02:02:12.3421,john,insert data,success,PUT,1023 bytes inserted into attribute database
07/12/2013 06:23:42.7019,john,get all the data,no data here,GET,seems to be an empty database
12/24/2013 18:00:00.1234,dude,get some data,"gathering data, please wait",GET,10 bytes transferred
01/14/2014 12:24:10.562,mike,get some data,"gathering data,please wait",GET,123155324 bytes transferred
"""
# Write the content of the buffer to a temporary file.
with tempfile.NamedTemporaryFile(delete=False) as fh:
mystery_path = fh.name
fh.write(text_to_enter)
print u'Test file created: {}'.format(mystery_path)
Here is one example of the solution to parsing this mysterious log file.
In [ ]:
class MysteryExampleParser(text_parser.TextCSVParser):
"""Parses the example mysterious text file created for this codelab."""
# Typically this would be a very descriptive name, such as "myproduct_accesslog" or "syslog"
# or something in that genre, but for the purposes of the code lab we name it something distinct
# and vague at the same time.
NAME = 'codelab_mystery_logfile'
DESCRIPTION = 'Parses the example mysterious text file created for this codelab.'
# We don't need to set this, since the file is already comma separated and that is the default
# value of the text parser.
#VALUE_SEPARATOR = ''
# There is a single header line for this format, so we need to define this value here.
NUMBER_OF_HEADER_LINES = 1
# We can also notice that some lines do have commas in them, and that they use a quote character
# to denote that. So we do need to set this variable if we want the text file to be properly
# parsed.
QUOTE_CHAR = '"'
# The column definition is quite simple in this case, since we do have that in the header itself.
COLUMNS = [u'date', u'user', u'request', u'result', u'action', u'extra']
# Define the valid action fields.
_ACTION_FIELDS = [u'GET', u'PUT']
def VerifyRow(self, parser_context, row):
"""Verify that this is truly our mystery file."""
# We need this to be both accurate and fast. That is we need this function here to accurately
# determine if this is truly the correct log file, and we need it done quickly (since a lot of
# files will be sent here for verification).
# One thing to notice is that we don't need to verify the number of columns here, that is already
# done, we can assume that if we get to this function that we have the following conditions met:
# 1. This is a text file (although that check sometimes fail, so really we can make the
# assumption that this is very likely a text file).
# 2. That the text file has the correct number of columns set, that is it is len(self.COLUMNS).
# We then need to return either True or False (and we can do some debug logs as well)
# One thing we notice, and this is something that would obviously need to be clearly
# examined and confirmed is that the field "action" seems to have fixed values, so we can
# check to see if we have a valid action field.
if row['action'] not in self._ACTION_FIELDS:
logging.debug(u'Unknown action field <{:s}>, not a {:s}'.format(
row['action'], self.parser_name))
return False
# We can also verify that the date is actually set and produces a valid timestamp.
timestamp = timelib.Timestamp.FromTimeString(row['date'])
if not timestamp:
logging.debug(u'Timestamp: <{}> not correctly formed.'.format(row['date']))
return False
# We might make some additional verification here, but for now let this be enough.
return True
def ParseRow(self, parser_context, row_offset, row, file_entry=None):
"""Parse a single row from the mysterious log file."""
# This is the call back function that gets called for each row parsed in that text file.
# We therefore need to yield an EventObject for each row.
# This can be done in several different ways, we can for instance calculate the timestamp
# here and pass that on to an EventObject, and process some of the arguments in the row
# or simply just pass them all into a convenience EventObject and have that take care of it.
# The python notion of **attribute takes a dict object and passes that on as parameters to the
# function, an example:
# a = {'foo': 'bar', 'stuff': 'more'}
# FooFunction(**a)
# Would be equivilant to:
# FooFunction(foo='bar', stuff='more')
#
# For the purpose of this codelab we can assume that the text file logs timestamps in
# a local time zone of the computer that generated the log entry, so we need to pass in
# the timezone.
event_object = MysteryLogEvent(row_offset, parser_context.timezone, **row)
parser_context.ProduceEvent(
event_object, parser_name=self.NAME, file_entry=file_entry)
manager.ParsersManager.RegisterParser(MysteryExampleParser)
Since we decided to "outsource" all the processing to the event object itself we need to define that and have it contain at least some minimum logic.
In [ ]:
class MysteryLogEvent(time_events.TimestampEvent):
"""A convenience EventObject for events extracted from the mysterious log file."""
DATA_TYPE = 'text:codelab:mystery'
def __init__(self, offset, zone, date, user, request, result, action, extra):
"""Initializes the mysterious log file event.
Args:
offset: Offset to where the record lies.
zone: The timezone of the record.
date: A string containing the timestamp of the event, in the form of: "MM/DD/YYYY HH:MM:SS.MS".
user: The username that made the query.
request: String containing the request.
result: Result string that comes back from the request.
action: A short string containing the type of request made, ex: GET, PUT.
extra: String containing extra information from the record.
"""
# Calculate the timestamp based on the timezone of the computer this record
# got extracted from.
timestamp = timelib.Timestamp.FromTimeString(date, zone)
# Call the parent class, this is necessary BTW to set additional attributes, etc.
# The parent of TimestampEvent needs the following:
# timestamp: The timestamp of the event.
# timestamp_desc: The description of the timestamp, eg: "Last Written Time", etc.
# data_type: The data type of the event object (remember 1:1 mapping to the formatter).
super(MysteryLogEvent, self).__init__(
timestamp, eventdata.EventTimestamp.WRITTEN_TIME, self.DATA_TYPE)
# Set the other attributes of the event as well.
self.request = request
self.result = result
self.action = action
self.extra = extra
self.offset = offset
# Notice that we set the username differently, that is it is not stored as "self.user".
# This may not be obvious, but we need to have some consistency despite the event objects
# being loosely defined. For that we have constructed (albeit ATM very limited) ontology
# to make sure we name attribute names that have the same meaning consistently across
# parsers. That makes filtering and output considerably easier.
# Please refer to: http://plaso.kiddaland.net/developer/ontology
self.username = user
[optional code segment] Remember if you make changes to the parser you need to de-register it before you run the code segment again (to register it):
In [ ]:
# OPTIONAL DO NOT EXECUTE UNLESS YOU'VE MADE CHANGES TO THE PARSER CODE ABOVE AND
# NEED TO REGISTER THOSE CHANGES!
manager.ParsersManager.DeregisterParser(MysteryExampleParser)
And then we need to define the formatter, so that the message strings can be properly formatted.
In [ ]:
from plaso import formatters
from plaso.formatters import interface as formatter_interface
from plaso.formatters import manager as formatter_manager
class MysteryLogFormatter(formatter_interface.ConditionalEventFormatter):
"""Class that formats events from the mysterious log file."""
# Remember, this has to be a 1:1 mapping to the data type defined in the event object.
DATA_TYPE = 'text:codelab:mystery'
# The format string needs to include ALL the attributes we deem to be important enough
# enough to be part of the message string, EXCLUDING those that are provided by other
# fields. What attributes are included in other fields? That may not be as clear or
# easily discovered, but typically that is:
# hostname
# username
# That is information that you would typically associate to a common enough field that
# would be included by default in the output formatting.
#
# Also remember, each attribute name should be contained within brackets ({}) and only
# one attribute name per entry in the list.
FORMAT_STRING_PIECES = [
u'Request: [{action}]',
u'{request}',
u'With answer: {result}',
u'Additional information: {extra}']
FORMAT_STRING_SHORT_PIECES = [u'[{action}]:', u'{request}']
SOURCE_LONG = 'Mysterious Log File'
SOURCE_SHORT = 'LOG'
# This is not part of the "regular code", this is just added to make this codelab work.
formatter = MysteryLogFormatter()
if not hasattr(formatter_manager.EventFormatterManager, 'event_formatters'):
formatter_manager.EventFormatterManager.event_formatters = {}
formatter_manager.EventFormatterManager.event_formatters[formatter.DATA_TYPE] = formatter
[optional code segment] If you make changes to the formatter, you need to remove it from the registration before you register it again.
In [ ]:
# This is OPTIONAL code, no need to run unless you make changes to the formatter and want to update
# the registration (as in get the changes checked in).
# You may need to change this to reflect the class name of the formatter.
formatter_name = 'MysteryLogFormatter'
# No need to change the code below.
formatter_data_type = ''
if formatter_name in formatter_manager.DefaultFormatter.classes:
print 'Formatter registered, removing.'
formatter_data_type = formatter_manager.DefaultFormatter.classes[formatter_name].DATA_TYPE
del formatter_manager.DefaultFormatter.classes[formatter_name]
print 'Formatter deleted.'
if formatter_data_type and formatter_data_type in formatter_manager.EventFormatterManager.event_formatters:
print 'Removing the data type [{}] from the registration.'.format(formatter_data_type)
del formatter_manager.EventFormatterManager.event_formatters[formatter_data_type]
And finally we need to create a unit test to make sure we are parsing the file properly.
In [ ]:
import unittest
from plaso.parsers import test_lib
class MysteryParserTest(test_lib.ParserTestCase):
"""Tests for the mysterious log file parser."""
def setUp(self):
"""Sets up the needed objects used throughout the test."""
self._parser = MysteryExampleParser()
def testParse(self):
"""Tests the Parse function."""
test_file = mystery_path
event_queue_consumer = self._ParseFile(self._parser, test_file)
event_generator = self._ParseFile(self._parser, test_file)
event_objects = self._GetEventObjectsFromQueue(event_queue_consumer)
# At bare minimum we need to test that the parser successfully parsed
# all the lines.
self.assertEquals(len(event_objects), 6)
# Read in at least one (or more) event objects and make sure we are parsing
# them correctly. For instance let's test time parsing.
event_object = event_objects[1]
# We may need to adjust this to the timezone of the machine. Here we make
# the assumption that the file was gathered from a machine that used
# UTC as it's timezone, we may want to change that assumption.
self.assertEquals(event_object.timestamp, 1363095712123100)
# Add here some tests to make sure we are parsing/extracting attribute names.
# And now we need to test our formatter, create a message string and test it.
expected_msg = (
u'Request: [GET] get all the data With answer: data loading,stuff happening, here it is '
u'Additional information: 1235 bytes transferred')
expected_msg_short = u'[GET]: get all the data'
self._TestGetMessageStrings(event_object, expected_msg, expected_msg_short)
# Let's make few more tests at least.
self.assertEquals(event_object.action, u'GET')
self.assertEquals(event_object.username, 'foobar')
# Did you notice that one event object (there were more than one, but we can pick one)
# that had the quote character, let's test to see if that was successfull.
event_object = event_objects[4]
# The result field was the one we were interested in.
self.assertEquals(event_object.result, u'gathering data, please wait')
self.assertEquals(event_object.username, 'dude')
And run these tests to make sure we have everything covered.
In [ ]:
my_suite = unittest.TestSuite()
my_suite.addTest(MysteryParserTest('testParse'))
results = unittest.TextTestRunner(verbosity=3).run(my_suite)
if results.errors:
print u'Errors came up while trying to run test.'
for error in results.errors:
if isinstance(error, basestring):
print error
else:
for sub_error in error:
print sub_error
elif results.failures:
print u'Failures came up while trying to run test.'
for failure in results.failures:
if isinstance(failure, basestring):
print failure
else:
for sub_failure in failure:
print sub_failure
else:
print u'All came out clean.'
print results
Another example of the solution, would be something like this... remember that there are plenty of solutions available:
In [ ]:
class AnotherMysteryLogEvent(time_events.TimestampEvent):
"""A convenience EventObject for events extracted from the mysterious log file."""
# We are going to use the same data type as in the other solution, just
# changing some of our approach.
DATA_TYPE = 'text:codelab:mystery'
def __init__(self, timestamp, user, request, result, action, extra):
"""Initializes the mysterious log file event.
Args:
timestamp: The timestamp when the entry was created.
user: The username that made the query.
request: String containing the request.
result: Result string that comes back from the request.
action: A short string containing the type of request made, ex: GET, PUT.
extra: String containing extra information from the record.
"""
# The only difference here and the other approach is that we pass in
# the timestamp directly here, and thus we can skip the zone and date
# attributes.
super(AnotherMysteryLogEvent, self).__init__(
timestamp, eventdata.EventTimestamp.WRITTEN_TIME, self.DATA_TYPE)
# Set the other attributes of the event as well.
self.request = request
self.result = result
self.action = action
self.extra = extra
# Notice that we set the username differently, that is it is not stored as "self.user".
# This may not be obvious, but we need to have some consistency despite the event objects
# being loosely defined. For that we have constructed (albeit ATM very limited) ontology
# to make sure we name attribute names that have the same meaning consistently across
# parsers. That makes filtering and output considerably easier.
# Please refer to: http://plaso.kiddaland.net/developer/ontology
self.username = user
class AnotherMysteryExampleParser(text_parser.TextCSVParser):
"""Parses the example mysterious text file created for this codelab."""
# Keep this to something very similar as the other name, except slightly different.
NAME = 'codelab_mystery_logfile_take_two'
DESCRIPTION = 'Parses the example mysterious text file created for this codelab.'
# Keep the same header line and quote character.
NUMBER_OF_HEADER_LINES = 1
QUOTE_CHAR = '"'
# The column definition is quite simple in this case, since we do have that in the header itself.
COLUMNS = [u'date', u'user', u'request', u'result', u'action', u'extra']
# Define the valid action fields.
_ACTION_FIELDS = [u'GET', u'PUT']
def VerifyRow(self, parser_context, row):
"""Verify that this is truly our mystery file."""
# There is no need to change this from the original one.
if row['action'] not in self._ACTION_FIELDS:
logging.debug(u'Unknown action field <{:s}>, not a {:s}'.format(
row['action'], self.parser_name))
return False
# We can also verify that the date is actually set and produces a valid timestamp.
timestamp = timelib.Timestamp.FromTimeString(row['date'])
if not timestamp:
logging.debug(u'Timestamp: <{}> not correctly formed.'.format(row['date']))
return False
# We might make some additional verification here, but for now let this be enough.
return True
def ParseRow(self, parser_context, row_offset, row, file_entry=None):
"""Parse a single row from the mysterious log file."""
# Let's change the ParseRow call back function here.
# Calculate the timestamp.
timestamp = timelib.Timestamp.FromTimeString(row['date'], parser_context.timezone)
# Here (difference between the first approach) we check to see if we get
# a proper timestamp value out of the logline and if not we don't produce
# an event object. If the log file would contain "corrupted" or "bad" date
# strings that would result in an error in the FromTimeString function we
# would produce an event object with the timestamp of ZERO (seen as coming
# from Epoch time). The questio is, do we want that behavior, or do we want
# to ignore such records? This is a decision we need to make in the parser.
# Here in this approach we've decided we do not want to include such records.
if not timestamp:
logging.debug(u'Unable to extract timestamp from row: <{}>'.format(u' - '.join(
row.values())))
return
# Now we have a timestamp and we can create the event object.
event_object = AnotherMysteryLogEvent(
timestamp, row['user'], row['request'], row['result'], row['action'],
row['extra'])
parser_context.ProduceEvent(
event_object, parser_name=self.NAME, file_entry=file_entry)
Let's create another test case for this parser, basing it on the previous one so we can re-use most parts of it.
In [ ]:
class AnotherMysteryParserTest(MysteryParserTest):
"""Tests for the mysterious log file parser."""
def setUp(self):
"""Sets up the needed objects used throughout the test."""
self._parser = AnotherMysteryExampleParser()
And let's run all the tests:
In [ ]:
my_suite = unittest.TestSuite()
my_suite.addTest(AnotherMysteryParserTest('testParse'))
results = unittest.TextTestRunner(verbosity=3).run(my_suite)
if results.errors:
print u'Errors came up while trying to run test.'
for error in results.errors:
if isinstance(error, basestring):
print error
else:
for sub_error in error:
print sub_error
elif results.failures:
print u'Failures came up while trying to run test.'
for failure in results.failures:
if isinstance(failure, basestring):
print failure
else:
for sub_failure in failure:
print sub_failure
else:
print u'All came out clean.'
print results
In [ ]:
import os
if mystery_path:
os.remove(mystery_path)