In [ ]:
# This cell defines the read_apache_log_to_dataframe function, which accepts an opened file.  The next cell opens
#     the file that is being used and invokes this function to create our dataframe
import pandas as pd
import re

def read_apache_log_to_dataframe(opened_file_to_read):
    #Returns a dataframe composed of the lines that successfully match access_log_regex.  Lines that do not match
    #    are printed and skipped.
    
    #Apache Log Format: "%{JiveClientIP}i %l %u %t \"%r\" %>s %b %T %k \"%{Referer}i\" \"%{User-Agent}i\" \"%{Content-Type}o\" 
    #                    %{JSESSIONID}C" common
    #This regex parses a log line into the above items, with %r split into the request method, request URI, and HTTP
    #    version.  The time to serve request field is often described as milliseconds, but stock Apache documents %T
    #    as whole seconds, and some Jive versions do appear to log whole seconds here.
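    #Example of a line this regex should match (an illustrative, hand-written sample, not from a real log):
    #    10.0.0.1 - - [01/Jan/2016:00:00:01 -0500] "GET /index.html HTTP/1.1" 200 512 0 1 "http://example.com/" "Mozilla/5.0" "text/html" ABC123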
    access_log_regex = r'(\S+) (\S+) (\S+) \[(\S+ \S+)\] "(\S+) (\S+) (\S+)" (\S+) (\S+) (\S+) (\S+) (".*?") (".*?") (".*?") (\S+)'
    compiled_regex = re.compile(access_log_regex)
    file_contents = []
    
    for line in opened_file_to_read:
        regex_match = compiled_regex.search(line)
        line_dict = {}

        try:
            line_dict['ip_address'] = regex_match.group(1)
            line_dict['remote_username'] = regex_match.group(2)
            line_dict['remote_user'] = regex_match.group(3)
            line_dict['request_received_timestamp'] = regex_match.group(4)
            line_dict['http_request_method'] = regex_match.group(5)
            line_dict['http_request_uri'] = regex_match.group(6)
            line_dict['http_request_version'] = regex_match.group(7)
            line_dict['http_status'] = regex_match.group(8)
            line_dict['response_size_bytes'] = regex_match.group(9)
            line_dict['time_to_serve_request_milliseconds'] = regex_match.group(10)
            line_dict['keep_alive_requests'] = regex_match.group(11)
            line_dict['referer'] = regex_match.group(12)
            line_dict['user_agent'] = regex_match.group(13)
            line_dict['content_type'] = regex_match.group(14)
            line_dict['jsession_id'] = regex_match.group(15)

            file_contents.append(line_dict)

        #AttributeError is raised when regex_match is None, i.e. the line did not match access_log_regex
        except AttributeError:
            print('Line did not match access_log_regex:')
            print(line)

    data_frame = pd.DataFrame(file_contents)
    return data_frame
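
In [ ]:
# Quick sanity check of the parser: feed one hand-written sample line through it via io.StringIO, which
#     behaves like an opened file.  The sample line below is illustrative only, not taken from a real
#     Jive log, so adjust it if your format differs.
import io

sample_line = ('10.0.0.1 - - [01/Jan/2016:00:00:01 -0500] "GET /index.html HTTP/1.1" '
               '200 512 0 1 "http://example.com/" "Mozilla/5.0" "text/html" ABC123\n')
read_apache_log_to_dataframe(io.StringIO(sample_line))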

In [ ]:
# Here we define which file to use and read that file into a dataframe

path = '/Users/daniel.harada/GitHub/Apache-Access-Log-Tools/Test.log'
with open(path) as f:
    data_frame = read_apache_log_to_dataframe(f)
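
In [ ]:
# The regex captures every field as a string.  A minimal sketch of dtype cleanup, assuming the standard
#     Apache %t timestamp format and numeric status/size/time/keep-alive fields; adjust to match your log.
#     errors='coerce' turns unparseable values (e.g. '-' for response size) into NaN.
data_frame['request_received_timestamp'] = pd.to_datetime(
    data_frame['request_received_timestamp'], format='%d/%b/%Y:%H:%M:%S %z')
for column in ['http_status', 'response_size_bytes', 'time_to_serve_request_milliseconds',
               'keep_alive_requests']:
    data_frame[column] = pd.to_numeric(data_frame[column], errors='coerce')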

In [ ]:
# Here and below we define whatever aggregations we need

# First a simple example of getting counts by IP address
data_frame['ip_address'].value_counts().head(10)

In [ ]:
# This cell gives a more involved example: counts of the top 10 IP addresses for a particular User-Agent.  Note
#     that the comparison string includes the surrounding quotes, since the regex captures quoted fields with
#     their quotes intact
data_frame[data_frame['user_agent'] == '"Mozilla/5.0 (Linux; Android 4.4.4; en-us; SAMSUNG SM-N910T Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Version/2.0 Chrome/34.0.1847.76 Mobile Safari/537.36"']['ip_address'].value_counts().head(10)
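
In [ ]:
# A time-based aggregation: requests per hour, assuming request_received_timestamp was converted to
#     datetime in the dtype-cleanup cell above
data_frame.set_index('request_received_timestamp').resample('1H').size().head(24)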

In [ ]: