In [1]:

    
import re
import Levenshtein
from BeautifulSoup import BeautifulSoup

# Configuration: where the raw service listings live and where the merged
# list is written. Change these two constants to run on another machine.
INPUT_DIR = '/home/weizhou/Projects/scripts/input/service'
OUTPUT_DIR = '/home/weizhou/Projects/scripts/output'

# Input handles, one per source listing (all opened read-only).
fls_1 = open(INPUT_DIR + '/fls-service-slice-1', 'r')
fls_2 = open(INPUT_DIR + '/fls-service-slice-2', 'r')
cr = open(INPUT_DIR + '/cr', 'r')
lb = open(INPUT_DIR + '/lb', 'r')

# Output handle. Opening with 'w' truncates any existing file right away.
outf = open(OUTPUT_DIR + '/service', 'w')

# Every handle opened above, gathered in one list.
# NOTE(review): not referenced in the visible cells; presumably kept so the
# handles can be closed together later -- confirm.
files = [fls_1,
         fls_2,
         cr,
         lb,
         outf
         ]



In [2]:

    
#
# Get all the services
#
# Every service name found in the four input files ends up in this set, so
# exact duplicates across sources collapse automatically.
service = set()

# Rewind the handles first: they are opened once in an earlier cell, so
# without this a second run of this cell would read nothing and produce an
# empty set.
for source_file in (fls_1, fls_2, cr, lb):
    source_file.seek(0)

# FLS service process 1: each line is a ' | '-separated row.
for line in fls_1:
    row_list = line.split(' | ')
    # drop the first field and the last two fields (noise, not service names)
    row_list = row_list[1:-2]
    service.update([str(fls_item.strip()) for fls_item in row_list])

# FLS service process 2: HTML; take the first child of the first <td> of
# every <tr>, ignoring cells that only hold a non-breaking space.
fls_2_content = fls_2.read()
fls_2_soup = BeautifulSoup(fls_2_content)
for line in fls_2_soup.findAll('tr'):
    td_content = line.td.contents[0].strip()
    if td_content != '&nbsp;':
        service.add(str(td_content))

# cr service process: HTML; one table per <div> matched by 'table-wrap'.
cr_content = cr.read()
cr_soup = BeautifulSoup(cr_content)
for cr_table in cr_soup.findAll('div', 'table-wrap'):
    trs = cr_table.table.tbody.findAll('tr')
    # skip the first 2 rows
    trs = trs[2:]
    for tr in trs:
        # the name is the first child of the link inside the row's second child
        service.add(str(tr.contents[1].a.contents[0].strip()))

# lb service process: HTML; one service per <tr>.
lb_content = lb.read()
lb_soup = BeautifulSoup(lb_content)
trs = lb_soup.findAll("tr")
# skip the first row
trs = trs[1:]
for tr in trs:
    # join every text node found under the row's second child
    lb_s = ''.join(tr.contents[1].findAll(text=True)).strip()
    service.add(str(lb_s))

# Fuzzy de-duplication: names whose Levenshtein ratio exceeds the threshold
# are folded into a single "a , b" entry.
#
# Each entry is [merged_flag, name]; merged_flag is 0 while the entry is
# still its own result row and 1 once it has been folded into another one.
service = [[0, item] for item in service]
length = len(service)

# Look-alike names that are nevertheless distinct services and must not be
# merged with each other.
exclude_deduplicate = {'Software-Troubleshooting': 'Hardware-Troubleshooting',
           }

# Two names are treated as the same service above this Levenshtein.ratio.
SIMILARITY_THRESHOLD = 0.7

for i in xrange(0, length-1):
    # compare by value: `is 1` only worked through CPython's small-int cache
    if service[i][0] == 1:
        continue
    for j in xrange(i+1, length):
        # NOTE(review): entries already folded into an earlier name are still
        # candidates here, so one name can appear in two groups -- confirm
        # whether that is intended.
        if Levenshtein.ratio(service[i][1], service[j][1]) > SIMILARITY_THRESHOLD:
            # The entries come from a set, so either member of an excluded
            # pair may show up first: check the exclusion in both directions.
            if (exclude_deduplicate.get(service[i][1]) == service[j][1] or
                    exclude_deduplicate.get(service[j][1]) == service[i][1]):
                # skip only this pair; keep looking for a real duplicate
                continue
            service[i][1] = "%s , %s" % (service[i][1], service[j][1])
            service[j][0] = 1
            # at most one merge per entry
            break

# Keep only the entries that were not folded into another one.
result = [item[1] for item in service if item[0] == 0]

# Manual merges: for each key/value pair, both entries are taken out of
# `result` and replaced by one combined "key , value" entry.
# NOTE(review): the first key has the " , " form produced by the similarity
# pass, so it appears to depend on that pass merging those two names in that
# order -- confirm when the input data changes.
deleted_map = {'Confluence , HSS Confluence': 'Confluence/Documentation',
               'PP (Product Pages)': 'PP',
               'Jira': 'HSS Jira',
               'GIT/Gerrit': 'Gerrit/Git',
               }

for key, value in deleted_map.iteritems():
    # Check both entries up front: list.remove raises ValueError on a missing
    # item, and failing after the first remove would leave `result` with the
    # key dropped but nothing appended in its place.
    if key not in result or value not in result:
        print('skipping manual merge, entry missing from result: %r / %r' % (key, value))
        continue
    result.remove(key)
    result.remove(value)
    result.append('%s , %s' % (key, value))

    
# Number the merged entries from 1 and format each as "<id>|<names>".
# Alternative wiki-table format, kept for reference:
#id_service = ["|%s|%s|" % (key+1, ' , '.join(map(lambda x: '[KB:'+x+']', value.split(' , ')))) for (key, value) in zip(count, result)]
id_service = ["%s|%s" % (index, value) for index, value in enumerate(result, 1)]

# The handle is opened once in an earlier cell and stays open, so start from
# an empty file: otherwise a second run of this cell would append after the
# previous output.
outf.seek(0)
outf.truncate()
outf.write(str('\n'.join(id_service)))
# Push the buffered text to disk now; the kernel keeps the handle open, so
# without this the file can stay empty or partial until the kernel exits.
outf.flush()