In [ ]:
# This cell defines the read_apache_log_to_dataframe function, which accepts an already-opened file.
# The next cell opens the log file and invokes this function to build our dataframe.
import pandas as pd
import re
def read_apache_log_to_dataframe(opened_file_to_read):
    """Parse an opened Apache access log into a pandas DataFrame.

    Apache LogFormat being parsed:
        "%{JiveClientIP}i %l %u %t \"%r\" %>s %b %T %k \"%{Referer}i\"
         \"%{User-Agent}i\" \"%{Content-Type}o\" %{JSESSIONID}C" common

    The %r request line is split into three columns: request method,
    request URI, and HTTP version.

    NOTE(review): the time-to-serve field claims to be milliseconds, but
    some Jive versions appear to log whole seconds instead -- confirm
    against the Jive version that produced the log.

    Parameters
    ----------
    opened_file_to_read : iterable of str
        An open text file (or any iterable of log lines).

    Returns
    -------
    pandas.DataFrame
        One row per line that matched the access-log regex. Lines that do
        not match are reported to stdout and skipped.
    """
    access_log_regex = r'(\S+) (\S+) (\S+) \[(\S+ \S+)\] "(\S+) (\S+) (\S+)" (\S+) (\S+) (\S+) (\S+) (".*?") (".*?") (".*?") (\S+)'
    compiled_regex = re.compile(access_log_regex)
    # Column names, in the same order as the regex capture groups above.
    field_names = (
        'ip_address',
        'remote_username',
        'remote_user',
        'request_received_timestamp',
        'http_request_method',
        'http_request_uri',
        'http_request_version',
        'http_status',
        'response_size_bytes',
        'time_to_serve_request_milliseconds',
        'keep_alive_requests',
        'referer',
        'user_agent',
        'content_type',
        'jsession_id',
    )
    file_contents = []
    # BUG FIX: the original iterated the global name `f` instead of the
    # parameter, so the function only worked when the caller happened to
    # bind the open file to a global called `f`.
    for line in opened_file_to_read:
        regex_match = compiled_regex.search(line)
        if regex_match is None:
            # Report and skip lines that do not match the expected format
            # (the original swallowed these via an AttributeError handler).
            print('Could not parse line:')
            print(line)
            continue
        file_contents.append(dict(zip(field_names, regex_match.groups())))
    return pd.DataFrame(file_contents)
In [ ]:
# Open the log file and hand it to the parser defined above; the resulting
# dataframe feeds the aggregation cells below.
# NOTE(review): hardcoded local path -- adjust to your own checkout.
with open('/Users/daniel.harada/GitHub/Apache-Access-Log-Tools/Test.log') as f:
    data_frame = read_apache_log_to_dataframe(f)
In [ ]:
# Aggregations over the parsed log start here.
# Simple example first: the ten most frequent client IP addresses.
data_frame['ip_address'].value_counts().iloc[:10]
In [ ]:
# A more involved example: the top ten client IPs among requests sent by
# one specific User-Agent string (quotes are part of the stored value).
samsung_agent = '"Mozilla/5.0 (Linux; Android 4.4.4; en-us; SAMSUNG SM-N910T Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Version/2.0 Chrome/34.0.1847.76 Mobile Safari/537.36"'
is_samsung = data_frame['user_agent'] == samsung_agent
data_frame.loc[is_samsung, 'ip_address'].value_counts().head(10)
In [ ]: