In [47]:
import os
from BeautifulSoup import BeautifulSoup
import re

# Inputs: the saved CR service page and the directory of cached pages.
# NOTE: `cr` is left open on purpose here; later cells read from it and
# close it.
cr = open('/home/weizhou/Projects/scripts/input/service/cr', 'r')
top_dir = '/home/weizhou/Projects/scripts/input/cache/'

# URL prefix placed in front of a cached file name in printed messages.
pre = 'https://docs.engineering.redhat.com/display/KB/'

# Shape of a service code: FS/CR/LB prefix, then 3-digit, 2-digit and
# 3-digit groups, then 'v' followed by a single digit.
service_code_reg = re.compile(r'(FS|CR|LB)-[\d]{3}-[\d]{2}-[\d]{3}-v[\d]{1}')

# Code prefix -> team id.  Ids documented so far:
#   1  Engineering Operations - CORE
#   2  Engineering Operations - LAB
#   3  Engineering Operations - FLS
team_dict = dict([
    ('CR', '1'),
    ('LB', '2'),
    ('FS', '3'),
    ('CA', '4'),
    ('SA', '5'),
    ('SE', '6'),
])

In [48]:
# Build cr_service_dict: second '-'-separated part of each service code
# found in the CR page -> the service's link text.
cr_service_dict = {}
cr_content = cr.read()
cr_soup = BeautifulSoup(cr_content)
for wrap_div in cr_soup.findAll('div', 'table-wrap'):
    # The first two rows of every table are skipped.
    for row in wrap_div.table.tbody.findAll('tr')[2:]:
        lead = row.contents[0]
        # The code may sit directly in the first cell, or be wrapped in a
        # <strong> or <p>; the link in the second cell is wrapped the same way.
        if lead.string:
            if lead.string == u' ':
                continue
            code = lead.string.strip()
            link_holder = row.contents[1]
        elif lead.strong:
            code = lead.strong.string.strip()
            link_holder = row.contents[1].strong
        elif lead.p:
            code = lead.p.string.strip()
            link_holder = row.contents[1].p
        else:
            continue

        service_name = link_holder.a.contents[0].strip()
        cr_service_dict[code.split('-')[1]] = service_name

In [49]:
# Accumulator filled by extract_code(): code text -> list of output fields.
code_dict = {}

In [50]:
def extract_code(html_file, file_name):
    with open(html_file) as inf:
        content = inf.read()
        
    soup = BeautifulSoup(content)
    data = soup.find('h3', text='Breakdown of tasks performed')
    if data:
        data = data.parent.nextSibling
        while data:
            if data.table:
                break
            data = data.nextSibling
        
        data = data.table
        if not data:
            print '%s%s has no service code' % (pre, file_name)
            return

        data =data.findAll('tr')
        if len(data) <= 1:
            print '%s%s has no service code' % (pre, file_name)
            return
    
        for tr in data[1:]:
            tds = tr.findAll('td')[0:2]
            code_list = getattr(tds[0], 'text', None).split('-')
            if len(code_list) >= 4:

                if code_list[1] == 'SSS':
                    print "CODE ERROR:", getattr(tds[0], 'text', None), ' FROM ', pre+file_name
                    continue
                if cr_service_dict.has_key(code_list[1]) is False:
                    continue
                code_dict[getattr(tds[0], 'text', None)] = [getattr(tds[0], 'text', None),
                                                            cr_service_dict[code_list[1]],
                                                            team_dict[code_list[0]],
                                                            code_list[3],
                                                            getattr(tds[1], 'text', None),
                                                            ]
            else:
                print "CODE ERROR:", getattr(tds[0], 'text', None), ' FROM ', pre+file_name
    else:
        print '%s%s has no service code' % (pre, file_name)

In [51]:
for dir_path,subpaths,files in os.walk(top_dir,False):
    for file in files:
        file_path=os.path.join(dir_path,file)
        print "Start to process FILE: %s" % file
        extract_code(file_path, file)
        print "End!"


Start to process FILE: rpmdiff
CODE ERROR: GG-SSS-TT-MMM-Vx  FROM  https://docs.engineering.redhat.com/display/KB/rpmdiff
End!
Start to process FILE: MWC+Build+Tracker
CODE ERROR: GG-SSS-TT-MMM-Vx  FROM  https://docs.engineering.redhat.com/display/KB/MWC+Build+Tracker
End!
Start to process FILE: viewpage.action?pageId=11370551
End!
Start to process FILE: HSS+Confluence
End!
Start to process FILE: HTTP+Proxy
End!
Start to process FILE: HSS+Portal
End!
Start to process FILE: Front+Line+Support+Service+Catalog
https://docs.engineering.redhat.com/display/KB/Front+Line+Support+Service+Catalog has no service code
End!
Start to process FILE: Treasury
End!
Start to process FILE: SVN
End!
Start to process FILE: viewpage.action?pageId=15502269
CODE ERROR: CISSSSTT  FROM  https://docs.engineering.redhat.com/display/KB/viewpage.action?pageId=15502269
End!
Start to process FILE: qpid
End!
Start to process FILE: Cfengine
End!
Start to process FILE: HSS+Jira
End!
Start to process FILE: VM+Hosting
End!
Start to process FILE: HssPulp
CODE ERROR: GG-SSS-TT-MMM-Vx  FROM  https://docs.engineering.redhat.com/display/KB/HssPulp
End!
Start to process FILE: Laboratory+Engineering+Service+Catalog
https://docs.engineering.redhat.com/display/KB/Laboratory+Engineering+Service+Catalog has no service code
End!
Start to process FILE: Package+Wrangler
End!
Start to process FILE: MW+QE+Jenkins
End!
Start to process FILE: engops+ssl+signing+server
CODE ERROR: GG-SSS-TT-MMM-Vx  FROM  https://docs.engineering.redhat.com/display/KB/engops+ssl+signing+server
End!
Start to process FILE: Yum+repositories
End!
Start to process FILE: Conflux
https://docs.engineering.redhat.com/display/KB/Conflux has no service code
End!
Start to process FILE: Nagios
End!
Start to process FILE: viewpage.action?pageId=15502614
End!
Start to process FILE: Errata+Tool
End!
Start to process FILE: Cantas
End!
Start to process FILE: Beaker
End!
Start to process FILE: MWC+Proxy
End!
Start to process FILE: Shell+Servers
End!
Start to process FILE: GlobalSync+Service+Definition
End!
Start to process FILE: RHEV
End!
Start to process FILE: RHCS+HA+clusters
End!
Start to process FILE: Harvester+aka+Dataservice
End!
Start to process FILE: QE+wiki
End!
Start to process FILE: MWC+GIT
End!
Start to process FILE: Pride
End!
Start to process FILE: Stable+Systems
End!
Start to process FILE: viewpage.action?pageId=23462892
End!
Start to process FILE: index.html
https://docs.engineering.redhat.com/display/KB/index.html has no service code
End!
Start to process FILE: BRQ+Office+wiki
End!
Start to process FILE: MWC+Mailman
End!
Start to process FILE: Production+Databases
End!
Start to process FILE: viewpage.action?pageId=15502244
CODE ERROR: CR-009-01  FROM  https://docs.engineering.redhat.com/display/KB/viewpage.action?pageId=15502244
End!
Start to process FILE: viewpage.action?pageId=15500609
End!
Start to process FILE: viewpage.action?pageId=15502259
End!
Start to process FILE: Nexus
End!
Start to process FILE: MWC+Feeds
End!
Start to process FILE: Zanata
End!
Start to process FILE: Twoface
End!
Start to process FILE: OrgChart
End!
Start to process FILE: MWC+Magnolia
End!
Start to process FILE: DNS
End!
Start to process FILE: CI-RHEV
End!
Start to process FILE: IRC
End!
Start to process FILE: MWC+Kryten
End!
Start to process FILE: Dist+GIT
End!
Start to process FILE: download+and+rsyncd
CODE ERROR: GG-SSS-TT-MMM-Vx  FROM  https://docs.engineering.redhat.com/display/KB/download+and+rsyncd
End!
Start to process FILE: TRAC
End!
Start to process FILE: Maitai
End!
Start to process FILE: MWC+Filemgmt
End!
Start to process FILE: viewpage.action?pageId=15502610
End!
Start to process FILE: viewpage.action?pageId=15502624
End!
Start to process FILE: MWC+Puppet
End!
Start to process FILE: MWC+SMTP
https://docs.engineering.redhat.com/display/KB/MWC+SMTP has no service code
End!
Start to process FILE: Request+Tracker+-+RT
End!
Start to process FILE: VirtualDB
CODE ERROR:   FROM  https://docs.engineering.redhat.com/display/KB/VirtualDB
End!
Start to process FILE: TCMS
End!
Start to process FILE: MW+QE+Netapp
End!
Start to process FILE: RH+Bugzilla
End!
Start to process FILE: Vault
End!
Start to process FILE: viewpage.action?pageId=15501183
End!
Start to process FILE: viewpage.action?pageId=15501188
End!
Start to process FILE: viewpage.action?pageId=15502622
End!
Start to process FILE: Compose
End!
Start to process FILE: viewpage.action?pageId=15501238
End!
Start to process FILE: MWC+A-MQ
End!
Start to process FILE: viewpage.action?pageId=15502241
End!
Start to process FILE: viewpage.action?pageId=15502261
End!
Start to process FILE: MWC+SVN
End!
Start to process FILE: MWC+Fisheye
End!
Start to process FILE: Netapp
End!
Start to process FILE: MWC+MySQL+Database
End!
Start to process FILE: MWC+Jira
End!
Start to process FILE: Bugzilla+Metrics+and+bugbot
End!

In [59]:
# Write one '|'-joined line per collected code, UTF-8 encoded, with no
# trailing newline.  The lines are built before the output file is opened so
# a failure while joining does not truncate an existing result, and both
# handles are closed even if writing raises.
try:
    result_list = [u'|'.join(item) for item in code_dict.values()]
    with open('/home/weizhou/Projects/scripts/output/cr', 'w') as outf:
        outf.write(u'\n'.join(result_list).encode('utf8'))
finally:
    # `cr` was opened in the first cell and is no longer needed.
    cr.close()