In [61]:
import re
from BeautifulSoup import BeautifulSoup

# Read-only inputs, opened in the order the names are unpacked below.
_INPUT_PATHS = (
    '/home/weizhou/Projects/scripts/output/service',
    '/home/weizhou/Projects/scripts/input/task/fls',
    '/home/weizhou/Projects/scripts/input/service/fls-service-slice-1',
    '/home/weizhou/Projects/scripts/input/service/lb_for_3_task_extract',
    '/home/weizhou/Projects/scripts/input/task/lb',
    '/home/weizhou/Projects/scripts/output/cr',
)
(service,
 fls,
 fls_service_inf,
 lb_service,
 lb_task,
 cr_task) = [open(input_path, 'r') for input_path in _INPUT_PATHS]

# Destination of the generated task table; mode 'w' truncates an existing file.
outf = open('/home/weizhou/Projects/scripts/output/task', 'w')

# Every handle opened above, gathered so a later cell can close them in one loop.
FILES = (service, fls, fls_service_inf, lb_service, lb_task, cr_task, outf)

In [62]:
#
# Get all the tasks
#

# Task code pattern, e.g. "FS-123-45-678-v1": a team prefix, three numeric
# groups and a one-digit version. It is applied with .match(), so only the
# start of the string is anchored.
service_code_reg = re.compile(r'(FS|CR|LB)-[\d]{3}-[\d]{2}-[\d]{3}-v[\d]{1}')

# Team prefix of a task code -> responsible team ID.
#   1  Engineering Operations - CORE
#   2  Engineering Operations - LAB
#   3  Engineering Operations - FLS
team_dict = {
    'CR': '1', 'LB': '2', 'FS': '3',
    'CA': '4', 'SA': '5', 'SE': '6',
}

# Every line of the service file has the form
#   <service id>|<name> , <name> , ...
# and each name is mapped back to the service id of its line.
service_dict = {}
for record in service:
    fields = record.strip().split('|')
    for service_name in fields[1].split(' , '):
        service_dict[service_name] = fields[0]

# Service id that applies to the row currently being processed (set later).
current_service_id = None
# Task code -> [service id, team id, cost, description, detail link].
task_dict = {}

# FLS service process 1
fls_row = fls_service_inf.read().strip()
# The first entry and the last two entries of the ' | ' separated row are
# noise and are dropped.
fls_services = fls_row.split(' | ')[1:-2]

# FLS task process
fls_content = fls.read()
fls_soup = BeautifulSoup(fls_content)
fls_tables = fls_soup.findAll('div', 'table-wrap')


def _parse_task_cells(cells, first):
    """Read (code, desc, link) from three consecutive cells of a table row.

    cells -- the row's ``contents`` list.
    first -- index of the task-code cell; the description and link cells are
             expected at ``first + 1`` and ``first + 2``.

    Returns None when the row must be skipped: the code cell holds only the
    blank placeholder, or its text does not start with a valid task code.
    The description and link cells are only touched once the code is valid,
    so short non-task rows are skipped instead of raising IndexError.
    """
    code_cell = cells[first]
    if code_cell.string:
        if code_cell.string == u' ':
            return None
        code = code_cell.string.strip()
    else:
        # No direct string on the cell: the code is read from a nested <p>.
        code = code_cell.p.string.strip()
    if service_code_reg.match(code) is None:
        return None

    desc_cell = cells[first + 1]
    if desc_cell.string and desc_cell.string != u' ':
        desc = desc_cell.string.strip()
    else:
        desc = 'NA'

    link_cell = cells[first + 2]
    if link_cell.a:
        link = link_cell.a['href']
    else:
        link = 'NA'
    return code, desc, link


def _task_record(service_id, code, desc, link):
    """Build the task_dict value for a task code that already matched the regex.

    The team ID comes from the code's prefix and the cost from its fourth
    '-' separated segment.
    """
    segments = code.split('-')
    return [service_id, team_dict[segments[0]], segments[3], desc, link]


#Process tables[1:-2]
# One table per FLS service, in the same order as fls_services; the first
# table and the last two are not per-service tables.
if len(fls_services) != len(fls_tables)-3:
    raise Exception('ERROR: service can not be aligned to fls tables')
for fls_service, fls_table in zip(fls_services, fls_tables[1:-2]):
    current_service_id = service_dict[fls_service]

    trs = fls_table.table.tbody.findAll('tr')
    #Skip first line
    for tr in trs[1:]:
        parsed = _parse_task_cells(tr.contents, 0)
        if parsed is None:
            continue
        code, desc, link = parsed
        task_dict[code] = _task_record(current_service_id, code, desc, link)

#Process tables[-2]
# This table carries the service name in its first column; a row whose first
# column is blank keeps the service of the previous row.
trs = fls_tables[-2].table.tbody.findAll('tr')
for tr in trs[1:]:
    if len(tr.contents) != 4:
        # Report the offending row itself (the original referenced an
        # undefined name here and failed with NameError instead).
        raise Exception('ERROR FORMAT', tr)
    fls_service = tr.td.string.strip()
    if fls_service and fls_service != u' ':
        current_service_id = service_dict[fls_service]
    parsed = _parse_task_cells(tr.contents, 1)
    if parsed is None:
        continue
    code, desc, link = parsed
    task_dict[code] = _task_record(current_service_id, code, desc, link)


#lb task process
# LB table format, one entry per line:
#   Service ID \t Service Name
for line in lb_service:
    row_list = line.split('\t')
    # Compare by value: `is not` tests object identity and only happens to
    # work for small integers on CPython.
    if len(row_list) != 2:
        raise Exception('ERROR FORMAT', line)
    lb_service_id, lb_service_name = row_list
    # Register the second '-' separated segment of the LB service ID as an
    # alias of the service already known under the service name, so LB task
    # codes can be resolved through that segment.
    service_dict[lb_service_id.split('-')[1]] = service_dict[lb_service_name.strip()]

lb_content = lb_task.read()
lb_soup = BeautifulSoup(lb_content)

# The first two 'table-wrap' blocks are not task tables and are skipped.
lb_tables = lb_soup.findAll('div', 'table-wrap')[2:]

for lb_table in lb_tables:
    for tr in lb_table.tbody.findAll('tr'):
        cells = tr.contents

        # Rows whose first cell has no direct text, or only the blank
        # placeholder, carry no task code.
        code_text = cells[0].string
        if not code_text or code_text == u' ':
            continue
        code = code_text.strip()
        if service_code_reg.match(code) is None:
            continue

        desc_text = cells[1].string
        desc = desc_text.strip() if desc_text and desc_text != u' ' else 'NA'

        # The detail link is only read from six-cell rows, from the last cell.
        has_link = len(cells) == 6 and cells[5].a
        link = cells[5].a['href'] if has_link else 'NA'

        # Code segments: [0] team prefix, [1] service alias, [3] cost.
        segments = code.split('-')
        task_dict[code] = [
            service_dict[segments[1]],
            team_dict[segments[0]],
            segments[3],
            desc,
            link,
        ]


#CR task process
# Each non-blank line of the CR file is '|' separated. Judging by where the
# fields land in the output table they are:
#   task code|service name|team id|cost|description
# The decoded text gets its own name so `cr_task` stays the file handle.
cr_content = cr_task.read().decode('utf8')
for line in cr_content.split(u'\n'):
    line = line.strip()
    if not line:
        # A trailing newline or empty line would otherwise produce a
        # one-element row and fail with IndexError below.
        continue
    tds = line.split(u'|')
    task_dict[tds[0]] = [service_dict[tds[1]],
                         tds[2],
                         tds[3],
                         # An empty description is reported as 'NA'.
                         tds[4] if tds[4].strip() else u'NA',
                         # CR tasks have no detail link.
                         u'NA',
                         ]

# clean up
# Tasks whose description (index 3) is this placeholder are left out.
NO_MATCHING = u'NO MATCHING TASK AVAILABLE'
kept_tasks = []
for task in task_dict.values():
    if task[3].strip() != NO_MATCHING:
        kept_tasks.append(task)

# One wiki-markup table row per task, numbered from 1 in iteration order.
id_task = []
for index, task in enumerate(kept_tasks):
    id_task.append("|%s|%s|" % (index + 1, '|'.join(task)))

title = u"||ID||Service ID||Responsible Team ID||Cost||Description||Detail Link||\n"
result = title + u'\n'.join(id_task)
outf.write(result.encode('utf8'))

In [63]:
# Release every input and output handle collected in FILES.
for handle in FILES:
    handle.close()