In [47]:
import os
from BeautifulSoup import BeautifulSoup
import re
# Handle on the saved CR service page; it is read when the service lookup
# table is built and closed in the final cell.
cr = open('/home/weizhou/Projects/scripts/input/service/cr', 'r')

# Directory tree of cached pages that is walked for service codes.
top_dir = '/home/weizhou/Projects/scripts/input/cache/'

# Prefix put in front of a cached file's name when a page is reported.
pre = 'https://docs.engineering.redhat.com/display/KB/'

# Matches codes shaped like "CR-123-45-678-v1".
# NOTE(review): not referenced by the cells visible here -- confirm it is
# still needed.
service_code_reg = re.compile(r'(FS|CR|LB)-[\d]{3}-[\d]{2}-[\d]{3}-v[\d]{1}')

# Code prefix -> team id.  Only ids 1-3 are named by the original notes.
team_dict = {
    'CR': '1',  # Engineering Operations - CORE
    'LB': '2',  # Engineering Operations - LAB
    'FS': '3',  # Engineering Operations - FLS
    'CA': '4',
    'SA': '5',
    'SE': '6',
}
In [48]:
# Lookup built from the CR service page: second dash-separated part of each
# service code -> service name (text of the link in the row's second cell).
cr_service_dict = {}
cr_content = cr.read()
cr_soup = BeautifulSoup(cr_content)
for table_wrap in cr_soup.findAll('div', 'table-wrap'):
    # The first two rows of every table are skipped.
    for row in table_wrap.table.tbody.findAll('tr')[2:]:
        code_cell = row.contents[0]
        if code_cell.string:
            # Code stored directly as the cell's text.
            # NOTE(review): the literal below is the single whitespace
            # character as it appears in the file -- confirm whether a
            # non-breaking space was intended.
            if code_cell.string == u' ':
                continue
            code = code_cell.string.strip()
            service_name = row.contents[1].a.contents[0].strip()
        elif code_cell.strong:
            # Code and link both wrapped in <strong>.
            code = code_cell.strong.string.strip()
            service_name = row.contents[1].strong.a.contents[0].strip()
        elif code_cell.p:
            # Code and link both wrapped in <p>.
            code = code_cell.p.string.strip()
            service_name = row.contents[1].p.a.contents[0].strip()
        else:
            # None of the known cell layouts: nothing to record.
            continue
        service_key = code.split('-')[1]
        cr_service_dict[service_key] = service_name
In [49]:
# Accumulator filled by extract_code(): service code -> list of output fields.
code_dict = {}
In [50]:
def extract_code(html_file, file_name):
with open(html_file) as inf:
content = inf.read()
soup = BeautifulSoup(content)
data = soup.find('h3', text='Breakdown of tasks performed')
if data:
data = data.parent.nextSibling
while data:
if data.table:
break
data = data.nextSibling
data = data.table
if not data:
print '%s%s has no service code' % (pre, file_name)
return
data =data.findAll('tr')
if len(data) <= 1:
print '%s%s has no service code' % (pre, file_name)
return
for tr in data[1:]:
tds = tr.findAll('td')[0:2]
code_list = getattr(tds[0], 'text', None).split('-')
if len(code_list) >= 4:
if code_list[1] == 'SSS':
print "CODE ERROR:", getattr(tds[0], 'text', None), ' FROM ', pre+file_name
continue
if cr_service_dict.has_key(code_list[1]) is False:
continue
code_dict[getattr(tds[0], 'text', None)] = [getattr(tds[0], 'text', None),
cr_service_dict[code_list[1]],
team_dict[code_list[0]],
code_list[3],
getattr(tds[1], 'text', None),
]
else:
print "CODE ERROR:", getattr(tds[0], 'text', None), ' FROM ', pre+file_name
else:
print '%s%s has no service code' % (pre, file_name)
In [51]:
for dir_path,subpaths,files in os.walk(top_dir,False):
for file in files:
file_path=os.path.join(dir_path,file)
print "Start to process FILE: %s" % file
extract_code(file_path, file)
print "End!"
In [59]:
# The CR service page handle opened in the first cell has already been read
# in full, so it can be released before anything else happens here.
cr.close()

# One line per recorded code, fields joined with '|'.  Lines are emitted in
# sorted code order so that repeated runs produce the same file, and the
# whole payload is built before the output file is opened so that a failure
# while formatting cannot leave a truncated file behind.
result_list = [u'|'.join(code_dict[code]) for code in sorted(code_dict)]
output_bytes = u'\n'.join(result_list).encode('utf8')

# ``with`` closes the file even if the write raises.
with open('/home/weizhou/Projects/scripts/output/cr', 'w') as outf:
    outf.write(output_bytes)