127.0.0.1 - - [01/Aug/1995:00:00:01 -0400] "GET /images/launch-logo.gif HTTP/1.0" 200 1839
127.0.0.1: IP address (host name, if available) of the client which made the request to the server.
-: User identity from remote machine, not available.
-: User identity from local logon, not available.
[01/Aug/1995:00:00:01 -0400]: Date and time that the server finished processing the request.
GET: The request method (e.g., GET, POST, etc.)
/images/launch-logo.gif: The endpoint (a Uniform Resource Identifier, URI)
HTTP/1.0: The client protocol version
200: Status code that the server sends back to the client.
1839: Size of the object returned to the client.
We use sc.textFile(logFile) to convert each line of the file into an element in an RDD. The data is stored in the file 'data/apachelog.txt'.
In [3]:
    
RDDlogs = (sc
            .textFile('data/apachelog.txt')  
           )
Nlines = RDDlogs.<COMPLETAR>()
print "The log file has %d lines" % Nlines
print " "
first_10 = RDDlogs.<COMPLETAR>(10)
print "The first 10 elements are:"
print " "
for x in first_10:
    print x
    
    
The first 10 elements are:
['"in24.inetnebr.com', '-', '-', '1995-08-01 00:00:01', 'GET', '/shuttle/missions/sts-68/news/sts-68-mcc-05.txt', 'HTTP/1.0', '200', '1839"']
['"uplherc.upl.com', '-', '-', '1995-08-01 00:00:07', 'GET', '/', 'HTTP/1.0', '304', '0"']
['"uplherc.upl.com', '-', '-', '1995-08-01 00:00:08', 'GET', '/images/ksclogo-medium.gif', 'HTTP/1.0', '304', '0"']
['"uplherc.upl.com', '-', '-', '1995-08-01 00:00:08', 'GET', '/images/MOSAIC-logosmall.gif', 'HTTP/1.0', '304', '0"']
['"uplherc.upl.com', '-', '-', '1995-08-01 00:00:08', 'GET', '/images/USA-logosmall.gif', 'HTTP/1.0', '304', '0"']
['"ix-esc-ca2-07.ix.netcom.com', '-', '-', '1995-08-01 00:00:09', 'GET', '/images/launch-logo.gif', 'HTTP/1.0', '200', '1713"']
['"uplherc.upl.com', '-', '-', '1995-08-01 00:00:10', 'GET', '/images/WORLD-logosmall.gif', 'HTTP/1.0', '304', '0"']
['"slppp6.intermind.net', '-', '-', '1995-08-01 00:00:10', 'GET', '/history/skylab/skylab.html', 'HTTP/1.0', '200', '1687"']
['"piweba4y.prodigy.com', '-', '-', '1995-08-01 00:00:10', 'GET', '/images/launchmedium.gif', 'HTTP/1.0', '200', '11853"']
['"slppp6.intermind.net', '-', '-', '1995-08-01 00:00:11', 'GET', '/history/skylab/skylab-small.gif', 'HTTP/1.0', '200', '9202"']
In [4]:
    
def str2list(x):
    return <COMPLETAR>
RDDlist = (RDDlogs
               .map(lambda x: str2list(x))
                .cache()
         )
first_10 = RDDlist.take(10)
print "The first 10 elements are:"
print " "
for x in first_10:
    print x
    
    
We have extracted 977769 sizes, and the unique values are:
[9, 10, 11, 1]
In [5]:
    
Element_sizes = (RDDlist
               .map(lambda x: <COMPLETAR>)
                .collect()
         )
Nsizes = len(Element_sizes)
Unique_values = list(set(Element_sizes))
print "We have extracted %d sizes, and the unique values are:" % Nsizes
print Unique_values
    
    
Lengths and counts:
[(9, 964781), (1, 12986), (10, 1), (11, 1)]
In [6]:
    
pairs_count = (RDDlist
               .map(lambda x: <COMPLETAR>)
               .reduceByKey(<COMPLETAR>)
               .collect()
         )
print "Lengths and counts:"
print pairs_count
    
    
The size of the cleaned RDD is 964781
In [7]:
    
cleanRDD = (RDDlist
                     .filter(lambda x: <COMPLETAR>)
                     .cache()
                 )
size_cleanRDD = cleanRDD.count()
print "The size of the cleaned RDD is %d" % size_cleanRDD
    
    
In [8]:
    
unique_field4 = (cleanRDD
                     .map(lambda x: <COMPLETAR>)
                     .reduceByKey(lambda x, y: x + y)
                     .collect()
                 )
print unique_field4
    
    
[('200', 881815), ('304', 74774), ('404', 5835), ('302', 2293), ('403', 47), ('501', 15), ('500', 2)]
In [9]:
    
unique_field7 = (cleanRDD
                     .<COMPLETAR>
                     .cache()
                 )
N_elements = len(unique_field7.collect())
                 
ordered_elements = unique_field7.takeOrdered(N_elements, lambda x: -x[1])
print ordered_elements
    
    
These are the first 10 content values:
[1839, 0, 0, 0, 0, 1713, 0, 1687, 11853, 9202]
This is the minimum value:
0
This is the maximum value:
3421948
This is the average value using the 'mean' function:
17840.7790929
This is the average value WITHOUT using the 'mean' function:
17840.7790929
In [10]:
    
content_sizesRDD = (cleanRDD
                     .map(lambda x: int(x[8].replace('"','')))
                    .cache()
                 )
print "These are the first 10 content values:"
print content_sizesRDD.<COMPLETAR>(10)
print " "
print "This is the minimum value:"
print content_sizesRDD.<COMPLETAR>()
print " "
print "This is the maximum value:"
print content_sizesRDD.<COMPLETAR>()
print " "
print "This is the average value using the 'mean' function:"
print content_sizesRDD.<COMPLETAR>()
print " "
print "This is the average value WITHOUT using the 'mean' function:"
print <COMPLETAR>
print " "
    
    
This is the sorted count of the 10 most frequent hosts:
edams.ksc.nasa.gov : 3737
piweba5y.prodigy.com : 3067
piweba4y.prodigy.com : 2690
piweba3y.prodigy.com : 2658
www-d1.proxy.aol.com : 2591
news.ti.com : 2358
163.206.89.4 : 2317
www-b2.proxy.aol.com : 2289
www-b3.proxy.aol.com : 2254
www-d2.proxy.aol.com : 2229
In [12]:
    
hostcountRDD = (cleanRDD
                     .map(lambda x: <COMPLETAR>)
                     .reduceByKey(lambda x, y: x + y)
                     .cache()
                 )
pairs_of_ten_most_frequent_hosts = hostcountRDD.<COMPLETAR>
list_of_ten_most_frequent_hosts = [x[0].replace('"','') for x in pairs_of_ten_most_frequent_hosts]
filtered_host_RDD = (cleanRDD
                     .filter(lambda x: x[0].replace('"','') in list_of_ten_most_frequent_hosts)
                     )
host_freq_RDD = (filtered_host_RDD
                     .map(lambda x: <COMPLETAR>)
                     .reduceByKey(<COMPLETAR>)
                     .cache()
                 )
ten_most_frequent_hosts_count = (host_freq_RDD
                                     .takeOrdered(20, lambda x: -x[1])
                                 )
print "These are the counts for the 10 most frequent hosts:"
print " "
for x in ten_most_frequent_hosts_count:
    print x[0].replace('"','') + " : " + str(x[1])
print " "
    
    
[(1, 33668), (3, 40828), (4, 58822), (5, 31446), (6, 31957), (7, 56672), (8, 59367), (9, 59708), (10, 60458), (11, 60503), (12, 37331), (13, 35840), (14, 59091), (15, 58029), (16, 55965), (17, 58182), (18, 55508), (19, 31615), (20, 32546), (21, 47245)]
In [14]:
    
def get_day(date_string):
    day = <COMPLETAR>
    return day
pairs_day_host_RDD = (cleanRDD
                     .map(lambda x: <COMPLETAR>)
                 )
pairs_day_host_RDD.take(3)
uniquehosts = (pairs_day_host_RDD
                    .<COMPLETAR>
                    .map(lambda x: <COMPLETAR>)
                   .takeOrdered(30, lambda x: x[0])
               )
print uniquehosts
days = <COMPLETAR>
hosts = <COMPLETAR>
from matplotlib import pyplot as plt
fig = plt.figure(figsize=(8,4.5), facecolor='white', edgecolor='white')
plt.axis([min(days), max(days), 0, max(hosts)+500])
plt.grid(b=True, which='major', axis='y')
plt.xlabel('Day')
plt.ylabel('Hosts')
plt.plot(days, hosts)
pass
#print get_day(pairs_day_host_RDD.take(1)[0][0])
    
    
    
These are the URIs that produced more 404 errors:
588         /pub/winvn/readme.txt
457         /pub/winvn/release.txt
411         /shuttle/missions/STS-69/mission-STS-69.html
319         /images/nasa-logo.gif
168         /elv/DELTA/uncons.htm
147         /shuttle/missions/sts-68/ksc-upclose.gif
144         /history/apollo/sa-1/sa-1-patch-small.gif
116         /images/crawlerway-logo.gif
114         /://spacelink.msfc.nasa.gov
91      /history/apollo/a-001/a-001-patch-small.gif
In [15]:
    
URIs_404 = (cleanRDD
                    .filter(<COMPLETAR>)
                    .map(lambda x: <COMPLETAR>)
                    .reduceByKey(lambda x, y: x + y)
                    .takeOrdered(10, lambda x:<COMPLETAR>)
                 )
print "These are the URIs that produced more 404 errors:"
print " "
for u in URIs_404:
    print u[1], 2*'\t', u[0]
    
    
In [ ]: