In [61]:
import re
from BeautifulSoup import BeautifulSoup
# Open every input and the output file up front.  The handles are collected in
# FILES so that the last cell can close them all in one loop.
# Inputs (text mode, read-only):
service = open('/home/weizhou/Projects/scripts/output/service')
fls = open('/home/weizhou/Projects/scripts/input/task/fls')
fls_service_inf = open('/home/weizhou/Projects/scripts/input/service/fls-service-slice-1')
lb_service = open('/home/weizhou/Projects/scripts/input/service/lb_for_3_task_extract')
lb_task = open('/home/weizhou/Projects/scripts/input/task/lb')
cr_task = open('/home/weizhou/Projects/scripts/output/cr')
# Output: opening with 'w' truncates any existing task file immediately.
outf = open('/home/weizhou/Projects/scripts/output/task', 'w')

FILES = (service, fls, fls_service_inf, lb_service, lb_task, cr_task, outf)
In [62]:
#
# Get all the tasks
#
# A task code starts with a team prefix (FS, CR or LB) followed by
# -ddd-dd-ddd-vd.  The pattern is applied with .match(), so it is anchored at
# the start of the text only.
service_code_reg = re.compile(r'(FS|CR|LB)-[\d]{3}-[\d]{2}-[\d]{3}-v[\d]{1}')

# Team prefix of a task code -> responsible team ID.
team_dict = {
    'CR': '1',  # Engineering Operations - CORE
    'LB': '2',  # Engineering Operations - LAB
    'FS': '3',  # Engineering Operations - FLS
    'CA': '4',
    'SA': '5',
    'SE': '6',
}

# Every line of the service file is "<service id>|<name> , <name> , ...".
# Register each name as a key that resolves to the service id.
service_dict = {}
for record in service:
    fields = record.strip().split('|')
    service_id = fields[0]
    for service_name in fields[1].split(' , '):
        service_dict[service_name] = service_id

# Service id the FLS rows currently being parsed belong to.
current_service_id = None
# Task code -> [service id, team id, cost, description, detail link].
task_dict = {}
# FLS service process 1
# The service-info file holds a single ' | '-separated row of service names;
# the first field and the last two fields are noise and are dropped.
fls_row = fls_service_inf.read().strip()
fls_services = fls_row.split(' | ')[1:-2]

# FLS task process
fls_content = fls.read()
fls_soup = BeautifulSoup(fls_content)
fls_tables = fls_soup.findAll('div', 'table-wrap')

# Process tables[1:-2]
# These tables are paired positionally with fls_services, so the counts must
# agree (first table and last two tables are not part of the pairing).
if len(fls_tables) - 3 != len(fls_services):
    raise Exception('ERROR: service can not be aligned to fls tables')

for fls_service, fls_table in zip(fls_services, fls_tables[1:-2]):
    current_service_id = service_dict[fls_service]
    # Skip first line
    for row in fls_table.table.tbody.findAll('tr')[1:]:
        # Column 0: task code, either directly in the cell or wrapped in a <p>.
        code_text = row.contents[0].string
        if not code_text:
            code = row.contents[0].p.string.strip()
        elif code_text == u' ':
            continue
        else:
            code = code_text.strip()
        # Rows whose first cell is not a task code are ignored.
        if service_code_reg.match(code) is None:
            continue

        # Column 1: description, 'NA' when missing or blank.
        desc_text = row.contents[1].string
        desc = desc_text.strip() if desc_text and desc_text != u' ' else 'NA'

        # Column 2: optional hyperlink to the task details.
        anchor = row.contents[2].a
        link = anchor['href'] if anchor else 'NA'

        # The code matched the pattern above, so it is non-empty and has the
        # team prefix at index 0 and the three-digit cost field at index 3.
        code_parts = code.split('-')
        task_dict[code] = [current_service_id,
                           team_dict[code_parts[0]],
                           code_parts[3],
                           desc,
                           link]
#Process tables[-2]
# This table carries the service name in its own first column:
#   <service> | <task code> | <description> | <link>
# A row whose service cell is blank keeps the service of the previous row.
trs = fls_tables[-2].table.tbody.findAll('tr')
# Skip the first row.
trs = trs[1:]
for tr in trs:
    # Compare by value: `is not 4` tests object identity and only works by
    # accident of CPython's small-integer caching.
    if len(tr.contents) != 4:
        # Report the offending row itself.  The name `line` used here before
        # is not bound until the LB loop further down, so this raise used to
        # fail with NameError instead of the intended format error.
        raise Exception('ERROR FORMAT', tr)

    fls_service = tr.td.string.strip()
    if fls_service and fls_service != u' ':
        current_service_id = service_dict[fls_service]

    # Column 1: task code, either directly in the cell or wrapped in a <p>.
    if tr.contents[1].string:
        if tr.contents[1].string == u' ':
            continue
        code = tr.contents[1].string.strip()
    else:
        code = tr.contents[1].p.string.strip()
    # Rows whose code cell is not a task code are ignored.
    if service_code_reg.match(code) is None:
        continue

    # Column 2: description, 'NA' when missing or blank.
    if tr.contents[2].string and tr.contents[2].string != u' ':
        desc = tr.contents[2].string.strip()
    else:
        desc = 'NA'

    # Column 3: optional hyperlink to the task details.
    if tr.contents[3].a:
        link = tr.contents[3].a['href']
    else:
        link = 'NA'

    # `code` matched the pattern, so it is never empty here; the former
    # `if code else 'NA'` fallbacks could not trigger and were dropped.
    code_parts = code.split('-')
    task_dict[code] = [current_service_id,
                       team_dict[code_parts[0]],
                       code_parts[3],
                       desc,
                       link]
#lb task process
# LB table format
# Service ID \t Service Name
for line in lb_service:
    row_list = line.split('\t')
    # Compare by value: `is not 2` tests object identity and only works by
    # accident of CPython's small-integer caching.
    if len(row_list) != 2:
        raise Exception('ERROR FORMAT', line)
    # Register the second '-'-separated segment of the LB service ID as an
    # extra key for the service already known under this name; LB task codes
    # are resolved through that segment below.
    service_dict[row_list[0].split('-')[1]] = service_dict[row_list[1].strip()]

lb_content = lb_task.read()
lb_soup = BeautifulSoup(lb_content)
lb_tables = lb_soup.findAll('div', 'table-wrap')
# skip first 2 tables
lb_tables = lb_tables[2:]
for lb_table in lb_tables:
    trs = lb_table.tbody.findAll('tr')
    for tr in trs:
        # Column 0: task code; rows with no direct text or a blank cell are
        # skipped.
        code_text = tr.contents[0].string
        if not code_text or code_text == u' ':
            continue
        code = code_text.strip()
        # Rows whose first cell is not a task code are ignored.
        if service_code_reg.match(code) is None:
            continue

        # Column 1: description, 'NA' when missing or blank.
        if tr.contents[1].string and tr.contents[1].string != u' ':
            desc = tr.contents[1].string.strip()
        else:
            desc = 'NA'

        # The link is only looked for in the sixth cell of six-cell rows.
        if len(tr.contents) == 6 and tr.contents[5].a:
            link = tr.contents[5].a['href']
        else:
            link = 'NA'

        code_parts = code.split('-')
        task_dict[code] = [service_dict[code_parts[1]],
                           team_dict[code_parts[0]],
                           code_parts[3],
                           desc,
                           link]
#CR task process
# Each non-blank line of the CR file is '|'-separated:
#   <task code>|<service key>|<team id>|<cost>|<description>
# Keep the decoded text in its own name instead of rebinding `cr_task`, so
# the name keeps referring to the file handle and re-running this cell does
# not fail on `.read()` of a string.
cr_text = cr_task.read().decode('utf8')
for line in cr_text.split(u'\n'):
    line = line.strip()
    # A trailing newline (or any blank line) yields an empty record, which
    # used to raise IndexError on tds[1]; skip it instead.
    if not line:
        continue
    tds = line.split(u'|')
    task_dict[tds[0]] = [service_dict[tds[1]],
                         tds[2],
                         tds[3],
                         tds[4] if tds[4].strip() else u'NA',
                         u'NA',  # the CR file carries no detail link
                         ]
# clean up
# Drop the placeholder rows, number the remaining tasks starting at 1 and
# write them out as a '|'-delimited table under a '||' header row.
NO_MATCHING = u'NO MATCHING TASK AVAILABLE'
title = u"||ID||Service ID||Responsible Team ID||Cost||Description||Detail Link||\n"

kept_tasks = []
for fields in task_dict.values():
    # fields[3] is the description column.
    if fields[3].strip() != NO_MATCHING:
        kept_tasks.append(fields)

id_task = []
for task_id, fields in enumerate(kept_tasks, 1):
    id_task.append("|%s|%s|" % (task_id, '|'.join(fields)))

result = title + u'\n'.join(id_task)
outf.write(result.encode('utf8'))
In [63]:
# Release every file handle collected in FILES (all inputs and the output).
for handle in FILES:
    handle.close()