# In [ ]:
import os
import sys
import re
''' Ideally, QA hands you a set of known-good hits when they release a site to production.
    Most things aren't ideal, so here's a simple way to separate likely-good requests from
    likely-bad ones based on their HTTP status codes.
'''

# Split an Apache access log into two files next to it:
#   <inputfile>.good - request lines of hits whose HTTP status is < 400
#   <inputfile>.bad  - "<status> <request line>" for hits whose status is >= 400
# Lines in which no status field can be found are written to neither file.

# Path of the access log, taken from the first command-line argument.
inputfile=sys.argv[1]

apachequotedfieldsre=re.compile(r'''"(.*?)"''')     #get fields delimited by ""
apachestatusre=re.compile(r''' ([0-9]{3}) ''')      #get 3 digit http status field

# The outputs are opened in text mode because str values are written to them;
# writing str to a file opened with 'wb' raises TypeError on Python 3.
# The with-statement closes all three files even if processing fails part way,
# and the log is streamed line by line instead of being loaded whole.
with open(inputfile) as logfile, \
        open('{0}.good'.format(inputfile),'w') as goodfile, \
        open('{0}.bad'.format(inputfile),'w') as badfile:
    for line in logfile:
        # The first space-delimited 3-digit number on the line is taken as the status.
        statusmatch=apachestatusre.search(line)
        if statusmatch is None:
            continue
        status=int(statusmatch.group(1))

        # The first quoted field is the request line: verb, url, protocol.
        quotedfields=apachequotedfieldsre.findall(line)

        if status<400:
            if not quotedfields:
                # Nothing to record for a hit without a quoted request field;
                # skip it rather than crash with IndexError.
                continue
            # Drop the 'GET ' verb; the rest of the request line is kept as is.
            goodfile.write('{0}\n'.format(quotedfields[0].replace('GET ','')))
        else:
            #write status, verb, url (empty request part if no quoted field exists)
            badfile.write('{0} {1}\n'.format(status,' '.join(quotedfields[0:1])))