In [1]:
import re
import Levenshtein
from BeautifulSoup import BeautifulSoup
# Raw dumps the service names are extracted from. All handles are opened up
# front and stay open until the final cell closes every entry of `files`.
fls_1 = open('/home/weizhou/Projects/scripts/input/service/fls-service-slice-1', 'r')
fls_2 = open('/home/weizhou/Projects/scripts/input/service/fls-service-slice-2', 'r')
cr = open('/home/weizhou/Projects/scripts/input/service/cr', 'r')
lb = open('/home/weizhou/Projects/scripts/input/service/lb', 'r')

# Destination of the numbered service list. Mode 'w' truncates any existing
# file as soon as this cell runs.
outf = open('/home/weizhou/Projects/scripts/output/service', 'w')

# Every handle opened above, kept together so they can be closed in one loop.
files = [fls_1, fls_2, cr, lb, outf]
In [2]:
#
# Collect every service name found in the input files into one set, so exact
# duplicates across sources collapse automatically.
#
service = set()

# FLS slice 1: plain text, one row per line, fields separated by ' | '.
for raw_row in fls_1:
    # Keep only the middle fields: the first field and the last two fields
    # are discarded as noise.
    fields = raw_row.split(' | ')[1:-2]
    for field in fields:
        service.add(str(field.strip()))
# FLS slice 2: parsed as HTML; the first child of the first <td> in every
# <tr> is taken as a service name.
fls_2_content = fls_2.read()
fls_2_soup = BeautifulSoup(fls_2_content)
for line in fls_2_soup.findAll('tr'):
    td_content = line.td.contents[0].strip()
    # The text has already been stripped, so a blank cell is the empty string
    # here; comparing against ' ' could never match and let empty names
    # through. Skip empty cells instead.
    # NOTE(review): '&nbsp;' is also skipped on the assumption that the parser
    # leaves that entity unconverted for placeholder cells - confirm against
    # the real input.
    if td_content and td_content != '&nbsp;':
        service.add(str(td_content))
# CR page: parsed as HTML; service names are read from the tables nested in
# each <div> matched by the 'table-wrap' filter.
cr_content = cr.read()
cr_soup = BeautifulSoup(cr_content)
for wrapper in cr_soup.findAll('div', 'table-wrap'):
    body_rows = wrapper.table.tbody.findAll('tr')
    # The first two rows of every table are skipped.
    for row in body_rows[2:]:
        # The name is the first child of the first <a> inside the row's
        # second child node.
        link_text = row.contents[1].a.contents[0]
        service.add(str(link_text.strip()))
# LB page: parsed as HTML; every <tr> after the first contributes one name.
lb_content = lb.read()
lb_soup = BeautifulSoup(lb_content)
for row in lb_soup.findAll("tr")[1:]:
    # Concatenate all text nodes found under the row's second child node, so
    # nested markup inside it does not split the name.
    text_nodes = row.contents[1].findAll(text=True)
    service.add(str(''.join(text_nodes).strip()))
# Fuzzy de-duplication: two names whose Levenshtein ratio exceeds 0.7 are
# treated as spellings of the same service and folded into one "a , b" entry.
#
# Each entry is a mutable [merged_flag, name] pair. merged_flag becomes 1 once
# the name has been folded into an earlier entry; such entries are left out of
# `result`.
service = [[0, item] for item in service]
length = len(service)
# Pairs that score as similar but must stay separate entries.
exclude_deduplicate = {'Software-Troubleshooting': 'Hardware-Troubleshooting',
                       }
for i in range(length - 1):
    # Flags are compared by value; `is 1` only worked through an interpreter
    # detail (identity of small integers).
    if service[i][0] == 1:
        continue
    for j in range(i + 1, length):
        # An entry already folded into an earlier group must not be folded
        # into a second one, otherwise the same name shows up twice.
        if service[j][0] == 1:
            continue
        left, right = service[i][1], service[j][1]
        if Levenshtein.ratio(left, right) <= 0.7:
            continue
        # `service` was built from a set, so either member of an excluded pair
        # may come first: check both directions. An excluded pair only rules
        # out this one candidate, so keep scanning for other matches.
        if exclude_deduplicate.get(left) == right or exclude_deduplicate.get(right) == left:
            continue
        service[i][1] = "%s , %s" % (left, right)
        service[j][0] = 1
        # At most one merge per entry: stop after the first match.
        break
result = [item[1] for item in service if item[0] == 0]
# Hand-maintained merges applied after the fuzzy pass: each key and its value
# are removed from `result` and re-added as a single "key , value" entry.
deleted_map = {
    'Confluence , HSS Confluence': 'Confluence/Documentation',
    'PP (Product Pages)': 'PP',
    'Jira': 'HSS Jira',
    'GIT/Gerrit': 'Gerrit/Git',
}
for first_name, second_name in deleted_map.items():
    # list.remove raises ValueError when the entry is absent, so both names
    # have to be present in `result` at this point.
    result.remove(first_name)
    result.remove(second_name)
    result.append('%s , %s' % (first_name, second_name))

# Number the entries from 1 and emit one "<id>|<names>" line per entry.
# Alternative wiki-table output, kept for reference:
#id_service = ["|%s|%s|" % (number, ' , '.join(map(lambda x: '[KB:'+x+']', name.split(' , ')))) for (number, name) in enumerate(result, 1)]
id_service = []
for number, name in enumerate(result, 1):
    id_service.append("%s|%s" % (number, name))

# Lines are joined with '\n'; no trailing newline is written.
output_text = '\n'.join(id_service)
outf.write(str(output_text))
In [3]:
# Close every handle collected in `files` (the inputs and the output file);
# closing the output file also flushes the written list to disk.
for handle in files:
    handle.close()