In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os,sys
#import csv
import pandas as pan
import cPickle as pickle
import pprint
#import glob
#import tables #PyTables used to generate HDF5 file instead of pickle
%matplotlib inline
The Companies House data set is used as a list of source companies that could be of interest for B2B lead generation. Obtain the data set here: http://download.companieshouse.gov.uk/en_output.html
First we read the data set into a pandas DataFrame and serialise it to a pickle file.
In [2]:
rootdir = "/home/ilan/Desktop/GI_interview_project"
datadir = "/home/ilan/Desktop/GI_interview_project/company_data"
os.chdir(datadir)
pklfile = "data.pkl"
#hffile = "data.h5"
folderpath = os.path.join(datadir, pklfile)
#folderpath = os.path.join(rootdir, hffile)
if os.path.exists(folderpath):
    print("Pickle file containing data found. Loading it...")
    # Pickles written with HIGHEST_PROTOCOL are binary, so open in 'rb' mode
    data = pickle.load(open(folderpath, 'rb'))
    #data = tables.open_file(folderpath, driver="H5FD_CORE")
else:
    print("Reading in csv files and creating pickle...")
    filenames = ['BasicCompanyData-2015-05-01-part1_5.csv', 'BasicCompanyData-2015-05-01-part2_5.csv',
                 'BasicCompanyData-2015-05-01-part3_5.csv', 'BasicCompanyData-2015-05-01-part4_5.csv',
                 'BasicCompanyData-2015-05-01-part5_5.csv']
    # Read each csv into a DataFrame and concatenate them into a single one
    list_ = []
    for i in filenames:
        data = pan.read_csv(i, delimiter=',', index_col=False)
        list_.append(data)
        #print data.head(1)
    data = pan.concat(list_)
    # Remove dots and whitespace from column titles
    colnames = [str(i).replace('.', '_').strip() for i in list(data.columns.values)]
    data.columns = colnames
    with open(pklfile, 'wb') as output:
        pickle.dump(data, output, pickle.HIGHEST_PROTOCOL)
os.chdir(rootdir)
data
data
Out[2]:
To get a feel for the data set, we do some basic exploration.
In [3]:
print data.columns
print data.size
data.describe()
Out[3]:
In [4]:
# All the company categories in the data, and their counts
categorycounts=data['CompanyCategory'].value_counts()
print categorycounts
categorycounts.plot(kind='bar')
Out[4]:
In [5]:
# All the company statuses in the data, and their counts
statuscounts=data['CompanyStatus'].value_counts()
print statuscounts
statuscounts.plot(kind='bar')
Out[5]:
In [6]:
class Mask(object):
    def __init__(self, df, field, match):
        self.df = df
        self.field = field
        self.match = match
        self.function = lambda x, y, z: x.loc[x[y] == z]
    def __call__(self):
        return self.function(self.df, self.field, self.match)
        #return self.df.loc[self.df[self.field] == self.match]

#data[data.CompanyName == "! LTD"]
#data.loc[data["CompanyName"] == "! LTD"]
result = Mask(data, "CompanyName", "! LTD")
print result()
In [7]:
class booleanMask(object):
    def __init__(self, function):
        self.function = function
    #def __and__(self, other):
    #    self.function = self.function & other.function
    def __call__(self, df):
        # Applying the stored lambda to the DataFrame yields a boolean Series
        self.df = df
        return map(self.function, [self.df])[0]

company_mask = booleanMask(lambda x: x.CompanyName == "! LTD")
##print company_mask(data)
print data[company_mask(data)]
# MASKS CAN NOW BE COMBINED
#uk_mask = booleanMask(lambda x: x.RegAddress_Country == "UNITED KINGDOM")
#active_mask = booleanMask(lambda x: x.CompanyStatus == "Active")
#print data[uk_mask(data) & active_mask(data)]
# FOR VALIDATION TO MAKE SURE BOOLEANMASK IS GIVING WHAT WE EXPECT
#data.loc[(data["RegAddress_Country"] == "UNITED KINGDOM") & (data["CompanyStatus"] == "Active")]
#print len(data.loc[(data["RegAddress_Country"] == "UNITED KINGDOM") & (data["CompanyStatus"] == "Active")])
#print len(data[uk_mask(data) & active_mask(data)])
#print map(lambda x: x.CompanyName == "! LTD", [data])
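# A minimal sketch (the commented example above, made live): each booleanMask
# call returns a plain boolean Series, so masks combine with '&' and '|'
uk_mask = booleanMask(lambda x: x.RegAddress_Country == "UNITED KINGDOM")
active_mask = booleanMask(lambda x: x.CompanyStatus == "Active")
print len(data[uk_mask(data) & active_mask(data)])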
In [8]:
# DEFINE A REDUCED DATASET FOR PROTOTYPING
from random import sample
# number/fraction of entries to use
#ents = int(len(X)*0.1)
ents = 100
# Take a random sample of row indices (range(len(data)) covers every row;
# range(0, len(data)-1) would always miss the last one)
smalldataind = sample(range(len(data)), ents)
#print smalldataind
# Alternatively, use a fixed hand-picked sample:
#smalldataind = [784400, 333248, 3037529, 333413, 1851904, 1569996, 2958604, 769824, 2848095, 896580]
smalldata = data.iloc[smalldataind]
smalldata
Out[8]:
In [9]:
# svn checkout http://pygoogle.googlecode.com/svn/trunk pygoogle-read-only
# python setup.py build
# sudo python setup.py install
from pygoogle import pygoogle
from time import sleep
from pprint import pprint
#g = pygoogle('! LTD company')
#g.pages = 1
#print '*Found %s results*'%(g.get_result_count())
#g.get_urls()
#print list(smalldata['CompanyName'].values)
compnames = list(smalldata['CompanyName'].values)
#compadds = list(smalldata['RegAddress_AddressLine1'].values)
compadds = list(smalldata['RegAddress_PostCode'].values)
#compadds = [i.split(' ')[0] for i in list(smalldata['RegAddress_PostCode'].values)]
#urls = []
#counter = 0
#for i,j in zip(compnames,compadds):
# g = pygoogle(i+' contact '+j)
# g.pages = 1
# urls.append(g.get_urls())
# counter += 1
# sleep(np.random.uniform(5,10))
#print urls
os.chdir(datadir)
urlpklfile="URLs.pkl"
urlfolderpath=os.path.join(datadir,urlpklfile)
if os.path.exists(urlfolderpath):
    print("Pickle file containing URL data found. Loading it...")
    urls = pickle.load(open(urlfolderpath, 'rb'))
else:
    print("Fetching company URLs from Google...")
    urls = []
    counter = 0
    for i, j in zip(compnames, compadds):
        # Query Google for '<company name> contact <postcode>'
        g = pygoogle(i + ' contact ' + j)
        g.pages = 1
        urls.append(g.get_urls())
        # Checkpoint the results every 10 queries
        if (counter % 10 == 0):
            with open(urlpklfile, 'wb') as output:
                pickle.dump(urls, output, pickle.HIGHEST_PROTOCOL)
        counter += 1
        # Randomised pause between requests to avoid being blocked
        sleep(np.random.uniform(5, 10))
    with open(urlpklfile, 'wb') as output:
        pickle.dump(urls, output, pickle.HIGHEST_PROTOCOL)
os.chdir(rootdir)
#urls = [[u'https://www.facebook.com/andrea.shaw.564', u'https://www.facebook.com/dianne.schultz1', u'http://www.192.com/atoz/business/brentwood/financial--advisers--(independent)/', u'https://classictvhistory.wordpress.com/tag/have-gun-will-travel/', u'http://i.dujour.com/december-print/', u'http://www.greenvillecountybar.org/Gbar_News_PDF/2014/122014.pdf', u'http://dartmouthalumnimagazine.com/class-notes/1970/all', u'http://www.dls.org/pdf/magazine/october_2007_magazine.pdf'], [u'http://www.city-data.com/clackamas-county/D/Delenka-Lane-2.html', u'http://law.justia.com/cases/alaska/supreme-court/2011/', u'https://www.facebook.com/htmody', u'https://www.facebook.com/terry.meyers.5', u'http://www.ciwf.com/media/1141326/outofsight-full-report.pdf', u'http://www.losfoundation.org/wp-content/uploads/2013/06/Donors-2011_2012.pdf', u'http://svcf.org/help/recognition/', u'https://www.ipo.gov.uk/t-tmj/tm-journals/2015-007/owner.html'], [u'https://www.sc.com/uk/contact-us/', u'https://www.sc.com/en/contact-us/', u'https://www.sc.com/je/contact-us/index.html', u'https://www.sc.com/hk/investor-relations/_documents/en/news/20130905d.pdf', u'http://www.aim25.ac.uk/cgi-bin/vcdf/detail?coll_id=18442&inst_id=118&nv1=search&nv2=', u'http://www.bloomberg.com/research/stocks/people/person.asp?personId=8307423&ticker=STAN:LN', u'http://www.hkexnews.hk/listedco/listconews/sehk/2015/0519/LTN20150519338.pdf', u'http://www.sebi.gov.in/dp/stdchtdrhp.pdf'], [u'https://www.facebook.com/theoldglovefactorymarketplace', u'https://www.grinnell.edu/about/visit/spaces/old-glove-factory', u'http://en.wikipedia.org/wiki/GlaxoSmithKline', u'http://www.dailykos.com/story/2013/01/06/1163848/-KosAbility-Trying-to-Clean-Out-an-Old-House-with-Arthritis-and-Asthma', u'http://www.cdc.gov/NCEH/publications/books/housing/cha05.htm', u'http://www.slideshare.net/MedlineIndustriesInc/surgical-gloves-a-comprehensive-guide', u'http://www.cpsc.gov/pagefiles/112284/5015.pdf', u'http://ftp.asahq.org/publicationsAndServices/latexallergy.pdf'], [u'http://www.thegsa.co.za/index.php?nav=destination_country&view=28', u'https://www.facebook.com/anna.brass1'], [], [u'http://books.openedition.org/obp/326', u'http://www.hrblock.com/tax-offices/local-offices/#!/en/office-profile/12546', u'http://www.caicv.org/dev/data/fckeditor/cms/file/Quorum_July2010WEB.pdf', u'https://play.google.com/store/apps/details?id=com.mhriley.spendingtracker&hl=en', u'https://www.facebook.com/walter.kajer.1', u'http://duchyofcornwall.org/assets/images/documents/Poundbury_Factsheet_2013.pdf', u'http://www.lihp.org/Content/2011 annual report.pdf', u'http://www.kildare.ie/business/directory/list-companies.asp?Category=Business Services'], [u'http://cera.govt.nz/sites/default/files/common/tc3-residential-rebuild-booklet-A4-20121204.pdf', u'http://www.thomsonlocal.com/Funeral-Directors/in/Surrey/', u'http://www.britishculinaryfederation.co.uk/bcf/wp-content/uploads/2011/06/091124_Culinary_News_December_v6.pdf', u'http://www.hackney.gov.uk/Assets/Documents/ht276.pdf', u'http://www.insightpublications.com.au/pdf_preview/isp-julius-caesar-10-pages.pdf', u'http://www.tripadvisor.co.uk/Hotel_Review-g191252-d491974-Reviews-Trimstone_Manor_Country_House_Hotel-Ilfracombe_Devon_England.html', u'http://www.lincoln.ac.nz/Documents/LEaP/WMK ICRF Final May 2013.pdf', u'http://delvinvillage.com/directory/'], [u'http://www.deloitte.com/', u'http://www.schencksc.com/2015rpctour/', u'http://www.schencksc.com/2013recforum/', u'https://www.linkedin.com/in/jeffreyshlefstein', 
u'http://www.aicpa.org/BecomeACPA/Pages/InternshipsandCooperativePrograms.aspx', u'http://www.freshbooks.com/accountants/map', u'http://www.mncpa.org/find-a-cpa/cpa-yellow-pages/list.aspx?l=c', u'http://cdn.colorado.gov/cs/Satellite?blobcol=urldata&blobheadername1=Content-Disposition&blobheadername2=Content-Type&blobheadervalue1=inline;+filename="March+28,+2007+Board+Meeting+Minutes.pdf"&blobheadervalue2=application/pdf&blobkey=id&blobtable=MungoBlobs&blobwhere=1251832310203&ssbinary=true'], []]
#urls =[[u'http://www.192.com/atoz/business/brentwood/financial--advisers--(independent)/', u'http://www.ucl.ac.uk/consultants/homepage'], [u'http://www.contactps.ca/', u'https://411.ca/business/profile/7759616'], [u'https://www.sc.com/en/contact-us/', u'https://www.sc.com/', u'https://www.sc.com/je/contact-us/index.html', u'https://www.sc.com/hk/investor-relations/_documents/en/news/20090902a.pdf', u'http://www.sebi.gov.in/dp/stdchtdrhp.pdf', u'http://www.bloomberg.com/research/stocks/people/person.asp?personId=8307423&ticker=STAN:LN', u'http://vpr.hkma.gov.hk/pdf/100269/fd_int/fd_int_0613_pt01.pdf', u'http://www.fogl.com/fogl/uploads/companypresentations/annual_report_2012.pdf'], [], [u'https://openaccess.adb.org/bitstream/handle/11540/1651/Volume 28_No 2_2011_06.pdf?sequence=1', u'http://yourtireshopsupply.com/manufacturer/27/grey-pneumatic-corp', u'https://www.facebook.com/people/\xe0\xb8\xa8\xe0\xb8\xb4\xe0\xb8\xa3\xe0\xb8\xb4\xe0\xb8\xa3\xe0\xb8\xb1\xe0\xb8\x95\xe0\xb8\x99\xe0\xb9\x8c-\xe0\xb8\x97\xe0\xb8\xa7\xe0\xb8\xb4\xe0\xb8\xa7\xe0\xb8\xb1\xe0\xb8\x92\xe0\xb8\x99\xe0\xb9\x8c/100004117395751', u'https://th-th.facebook.com/donnapa.apple', u'https://www.facebook.com/sasesopit.muttamara', u'https://th-th.facebook.com/KLShopbymarie', u'https://th-th.facebook.com/soraya.lomsungnoen.1', u'https://th-th.facebook.com/namthip.bunthong.7'], [u'http://agra-alliance.org/download/53396d7f2a934/', u'https://www.africare.org/wp-content/uploads/2014/08/AFSRNo4_BrysonEley_SuccessStoryGuide_Final_Jan7_2008_updated_June08.pdf'], [u'https://www.clearbooks.co.uk/directory/business', u'https://www.tapa.co.uk/the-tapa-opt-out-ledger.php', u'http://www.dailymail.co.uk/health/article-1330839/Blundering-doctors-leave-mother-terrified-falsely-diagnosing-brain-haemorrhage.html'], [u'http://www.priorygroup.com/location-results/item/the-priory-hospital-glasgow', u'http://www.yell.com/biz/1st-choice-plumbing-and-heating-glasgow-901468909/', u'https://www2.deloitte.com/content/dam/Deloitte/global/Documents/Consumer-Business/gx-cb-global-powers-of-retailing.pdf', u'http://www.rightmove.co.uk/property-for-sale/property-30497721.html', u'http://www.hazelwood.glasgow.sch.uk/', u'https://plus.google.com/+Paranetuklimited', u'http://www.kinningparkcomplex.org/projects-overview/bike-project/', u'https://www.glasgow.gov.uk/CHttpHandler.ashx?id=14911&p=0'], [u'http://www.scleeaccountant.com/', u'http://www.192.com/places/sk/sk8-1/sk8-1nq/', u'https://www.icpas.org/hc-career-center.aspx?id=21550', u'https://www.linkedin.com/pub/leona-crouch/26/b42/b17', u'http://www.burkertvaluation.com/wp-content/uploads/2014/04/Rpb-Vitae_General.pdf', u'http://www.alec.co.uk/cvtips/examgrcv.htm', u'http://www.chaos.umd.edu/misc/origplates.html', u'http://www.atiner.gr/bio/Syrrakos.doc'], [u'https://uk.linkedin.com/pub/david-wasilewski/27/143/368']]
# TO USE A HAND-PICKED SET OF URLS TO AVOID REPEAT REQUESTS TO GOOGLE, WHICH GET YOU BLOCKED
urls = [[u'http://www.192.com/atoz/business/brentwood/financial--advisers--(independent)/'], [u'http://www.plantmethods.com/content/10/October/2014', u'http://www.plantmethods.com/content?page=2&itemsPerPage=25'], [u'https://www.sc.com/uk/contact-us/', u'https://www.sc.com/en/contact-us/', u'https://www.sc.com/je/contact-us/index.html', u'https://www.sc.com/hk/investor-relations/_documents/en/news/20130905d.pdf', u'https://www.sc.com/hk/investor-relations/_documents/en/news/20140520b.pdf', u'http://www.bloomberg.com/research/stocks/people/person.asp?personId=8307423&ticker=STAN:LN', u'http://www.sebi.gov.in/dp/stdchtdrhp.pdf', u'http://www.hkexnews.hk/listedco/listconews/sehk/2015/0519/LTN20150519338.pdf'], [u'http://www.nhs.uk/Services/Trusts/Pharmacies/DefaultView.aspx?id=89768', u'http://www.boots.com/'], [], [], [u'https://www.xero.com/', u'http://www.sage.com/'], [u'http://www.mastercard.us/', u'http://www.baxterstorey.co.uk/'], [u'http://www.192.com/places/sk/sk8-1/sk8-1nq/', u'http://www.ey.com/', u'http://www.grantthornton.com/'], []]
#print len(urls)
#pprint(urls)
#filteredurls = urls[:]
#for count,i in enumerate(filteredurls[:]):
# for j in i:
# print j
# if ('contact' not in j):
# filteredurls[count].remove(j)
# print "NOT FOUND"
#print j
#print filteredurls[count]
#print filteredurls
# This one exceeds maximum recursion
#def empty(seq):
# try:
# return all(map(empty, seq))
# except TypeError:
# return False
def empty(seq):
    """Check if a nested list (list of lists) is completely empty, if so return True."""
    containslist = []
    # range(len(seq)) covers every sublist; range(0, len(seq)-1) would skip the last one
    for i in range(len(seq)):
        if seq[i]:
            containslist.append(False)
        else:
            containslist.append(True)
    if (False in containslist):
        return False
    else:
        return True

def filtering(initem):
    """If the string 'contact' is in the URL, split on it and keep the first part, else return an empty list."""
    if ('contact' in initem):
        return initem.split('contact')[0]
    else:
        return []
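# For illustration: filtering() keeps everything before the first 'contact' in
# a URL, and empty() spots completely empty nested lists
print filtering('https://www.sc.com/uk/contact-us/')  # -> 'https://www.sc.com/uk/'
print empty([[], []])     # -> True
print empty([['x'], []])  # -> False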
filteredurls = [np.nan]*len(urls)
# Again use range(len(urls)) so the last entry gets filtered too
for i in range(len(urls)):
    filteredurls[i] = [filtering(j) for j in urls[i]]
    if empty(filteredurls[i]):
    #if not filteredurls[i]:
        filteredurls[i] = np.nan
#pprint(filteredurls)
#filteredurls = urls[:]
#for i,j in enumerate(urls):
# toremove = [k for k in urls[i] if 'contact' not in urls[i]]
# for l in j:
# if(j in toremove):
# filteredurls[i].remove(j)
#print filteredurls
d = {'CompanyName' : pan.Series(compnames), 'CompanyAddress1' : pan.Series(compadds), 'URLs' : pan.Series(filteredurls)}
dfurls = pan.DataFrame(d)
dfurls
#urls = [pygoogle(i).get_urls()[0] for i in list(smalldata['CompanyName'].values)]
#print urls
#smalldata['WebURL'] = Series([pygoogle(i).get_urls()[0] for i in data['CompanyName']], index=smalldata.index)
#compnames = smalldata.iterrows()[1]
#print compnames
#for i in range(0,len(smalldata)-1):
Out[9]:
In [10]:
import re
from mechanize import Browser
# http://stackoverflow.com/questions/1011975/how-to-get-links-on-a-webpage-using-mechanize-and-open-those-links
def findAboutUs(inputlink):
    """Given an initial (hopefully, homepage) URL, look for an 'About Us' link; if none is found just return the initial URL."""
    # NaN compares unequal to everything, including itself, so 'inputlink == np.nan'
    # is always False; use pan.isnull() to catch missing links
    if pan.isnull(inputlink):
        return np.nan
    #print inputlink
    br = Browser()
    br.open(inputlink)
    aboutuslinks = []
    #br.links(url_regex="about")
    #br.links(text_regex="About( us)?")
    for link in br.links(text_regex="About"):
        #print inputlink, link.url
        aboutuslinks.append(link)
        #br.follow_link(link)  # takes EITHER Link instance OR keyword args
        #br.back()
    #print aboutuslinks
    # http://stackoverflow.com/questions/10994251/mechanize-urllib-beautifulsoup-relative-paths
    # Mechanize often returns relative links, split into .base_url and .url; we join them -if necessary- here
    for i, j in enumerate(aboutuslinks):
        domain = re.search('(http:\/\/.*\.\D+?|https:\/\/.*\.\D+?)\/', j.base_url.strip())
        if domain:
            domain = domain.group(1)
        if re.search('mailto', j.url.strip()) != None:
            # mailto links are not pages; blank them so the 'about' filter below drops them
            # (the original 'pass' left u unbound or stale here)
            aboutuslinks[i] = ''
            continue
        elif re.search('(http:\/\/.*\.\D+?|https:\/\/.*\.\D+?)\/', j.url.strip()) != None:
            # Already an absolute URL
            u = j.url.strip()  #.encode('utf8')
        elif re.search('^/', j.url.strip()) != None:
            # Root-relative URL: prepend the domain
            u = domain + j.url.strip()  #.encode('utf8')
        else:
            # Relative URL: prepend the domain and a separator
            u = domain + '/' + j.url.strip()  #.encode('utf8')
        aboutuslinks[i] = u
    # Some non-About Us links somehow still make it here; filter them out by requiring an 'about' in the URL
    #print aboutuslinks
    aboutuslinks = [i for i in aboutuslinks if 'about' in i]
    #print aboutuslinks
    # If multiple 'About Us' links are found (sometimes duplicates), take the first one only
    if (aboutuslinks and isinstance(aboutuslinks, list)):
        aboutuslink = aboutuslinks[0]
    else:
        aboutuslink = aboutuslinks
    # If no 'About Us' link is found, return the initial (input) link
    if aboutuslink:
        return aboutuslink
    else:
        return inputlink

#print findAboutUs("https://www.sc.com/uk/")
print findAboutUs("http://www.growthintel.com")
In [11]:
#from lxml import html
#import requests
#page = requests.get('https://www.sc.com/uk/')
#tree = html.fromstring(page.text)
#print tree
#from BeautifulSoup import BeautifulSoup
#import bs4
from bs4 import BeautifulSoup
import urllib
def retrieveText(inputlink):
    """Fetch the visible text from a link to an HTML page."""
    # As above, '== np.nan' is always False; use pan.isnull() instead
    if pan.isnull(inputlink):
        return np.nan
    html = urllib.urlopen(inputlink).read()
    soup = BeautifulSoup(html)
    texts = soup.findAll(text=True)
    # http://stackoverflow.com/questions/1936466/beautifulsoup-grab-visible-webpage-text
    #def visible(element):
    #    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
    #        return False
    #    elif isinstance(element, Comment):
    #    #elif re.match('<!--.*-->', str(element)):
    #        return False
    #    return True
    #visible_texts = filter(visible, texts)
    # Strip the non-visible elements out of the soup, then extract the remaining text
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText()
    return visible_text

#print retrieveText('https://www.sc.com/uk/about-us/index.html')
print retrieveText('http://www.growthintel.com/about-us/')
In [12]:
def createDescription(inputlink):
    """Chain findAboutUs() and retrieveText() to obtain a company description from an input link."""
    # If a list of candidate links is passed, use the first one
    if (isinstance(inputlink, list)):
        inputlink = inputlink[0]
    if pan.isnull(inputlink):
        return np.nan
    else:
        link = findAboutUs(inputlink)
        text = retrieveText(link)
        return text

#print createDescription(np.nan)
#print createDescription("https://www.sc.com/uk/")
testlink = "http://www.growthintel.com"
print createDescription(testlink)
In [13]:
##dfurls = dfurls.drop('CompanyDescription', 1)
#print dfurls[ pan.notnull(dfurls['URLs']) ]
#dfurls['AboutUsURL'] = dfurls['URLs'].apply(lambda x: findAboutUs(x))
In [14]:
#dfurls = dfurls.drop('AboutUsURL', 1)
#dfurls['CompanyDescription'] = dfurls['URLs'].apply(lambda x: createDescription(x))
#dfurls
#print dfurls.ix[dfurls['CompanyName'] == 'STANDARD CHARTERED NOMINEES LIMITED', 'CompanyDescription'].values
#os.chdir(datadir)
#descpklfile="descriptions.pkl"
#descfolderpath=os.path.join(datadir,descpklfile)
#if (os.path.exists(descfolderpath)==True):
# print("Pickle file containing company descriptions data found. Loading it...")
# dfurls=pickle.load(open(descfolderpath,'r'))
#else:
# print("Fetching company descriptions...")
# dfurls['CompanyDescription'] = dfurls['URLs'].apply(lambda x: createDescription(x))
# with open(descpklfile,'wb') as output:
# pickle.dump(dfurls, output, pickle.HIGHEST_PROTOCOL)
#os.chdir(rootdir)
#dfurls
In [13]:
AboutUsURLs = [["McKinsey & Company", "http://www.mckinsey.com/about_us"], ["The White Company", "http://www.thewhitecompany.com/help/our-story/"], ["Marks & Spencer", "http://corporate.marksandspencer.com/aboutus"], ["Kids Company", "http://www.kidsco.org.uk/about-us"], ["Thunderhead", "http://www.thunderhead.com/what-we-do/about-us/"], ["Aston Martin", "https://www.astonmartin.com/en/company/about-us"], ["Bicester Village", "http://www.bicestervillage.com/en/company/about-us"], ["Solarcentury", "http://www.solarcentury.com/uk/about-solarcentury/"], ["Student Loans Company", "http://www.slc.co.uk/about-us.aspx"], ["The Stationers' Company", "https://stationers.org/about.html"], ["Royal Shakespeare Company", "http://www.rsc.org.uk/about-us/"], ["Snell", "http://www.snellgroup.com/company/about-us/"], ["The Wax Chandlers Company", "http://www.waxchandlers.org.uk/about-us/index.php"], ["Expeditors", "http://www.expeditors.com/our-company/about-us.asp"], ["The Carbon Neutral Company", "http://www.carbonneutral.com/about-us"], ["The Pewterers' Company", "http://www.pewterers.org.uk/the_company/aboutus.html"], ["Vauxhall", "http://www.vauxhall.co.uk/about-vauxhall/about-us/company.html"], ["EE", "http://ee.co.uk/our-company/about-ee"], ["Candoco Dance Company", "http://www.candoco.co.uk/about-us/"], ["Victrex", "http://www.victrex.com/en/company/about-us"], ["Ensus", "http://www.ensus.co.uk/Company/About_us/"], ["Anglian Water", "http://www.anglianwater.co.uk/about-us/"], ["The Cheque and Credit Clearing Company", "http://www.chequeandcredit.co.uk/about_us/"], ["Vodafone", "http://www.vodafone.co.uk/about-us/company-history/"], ["People 1st","http://www.people1sttraining.co.uk/about-us"], ["Starbucks","http://www.starbucks.co.uk/about-us"], ["Merlin Entertainments","http://www.merlinentertainments.biz/about-us"], ["Bloomsbury Publishing","http://www.bloomsbury.com/uk/company/about-us/"], ["Alcatel One Touch","http://www.alcatelonetouch.com/global-en/company/aboutus.html"], ["Masons Kings","http://masonkings.jd-dealer.co.uk/About-us/Our-Company"], ["Oxford Bus Company","http://www.oxfordbus.co.uk/about-us/"], ["Patient.co.uk","http://www.patient.co.uk/about-us"], ["Bootstrap Company","http://www.bootstrapcompany.co.uk/about-us/"], ["Fusion Furniture","http://www.fusionfurniturecompany.co.uk/about.php"], ["Siemens","http://www.siemens.co.uk/en/about_us/"], ["Bosch UK","http://www.bosch.co.uk/en/uk/about_bosch_home_2/about-bosch-in-great-britain.php#"], ["Qualcomm","https://www.qualcomm.com/company/about"], ["Apple","https://www.apple.com/about/"], ["Mercedes-Benz UK","http://www2.mercedes-benz.co.uk/content/unitedkingdom/mpc/mpc_unitedkingdom_website/en/home_mpc/passengercars/home/passenger_cars_world/about_us.html"], ["IBM UK","http://www.ibm.com/ibm/uk/en/"], ["Google","https://www.google.co.uk/about/"], ["Intel","http://www.intel.com/content/www/us/en/company-overview/company-overview.html"], ["ebay","http://pages.ebay.co.uk/aboutebay.html"], ["WebMD","http://www.webmd.com/about-webmd-policies/about-who-we-are"], ["Growth Intelligence","http://www.growthintel.com/about-us/"] ]
#pprint(AboutUsURLs)
print len(AboutUsURLs)
cnames = [i for i,j in AboutUsURLs]
caboutusurls = [j for i,j in AboutUsURLs]
#print cnames
descdict = {'CompanyName' : pan.Series(cnames), 'AboutUsURL' : pan.Series(caboutusurls)}
descdf = pan.DataFrame(descdict)
descdf
Out[13]:
In [14]:
os.chdir(datadir)
descpklfile = "descriptions.pkl"
descfolderpath = os.path.join(datadir, descpklfile)
if os.path.exists(descfolderpath):
    print("Pickle file containing company descriptions data found. Loading it...")
    descdf = pickle.load(open(descfolderpath, 'rb'))
else:
    print("Fetching company descriptions...")
    descdf['CompanyDescription'] = descdf['AboutUsURL'].apply(lambda x: retrieveText(x))
    with open(descpklfile, 'wb') as output:
        pickle.dump(descdf, output, pickle.HIGHEST_PROTOCOL)
os.chdir(rootdir)
descdf
Out[14]:
In [15]:
descdf
#print descdf.ix[descdf['CompanyName'] == 'Starbucks', 'CompanyDescription'].values
print descdf.ix[descdf['CompanyName'] == 'Starbucks', 'CompanyDescription'].values[0].encode('utf-8')
In [16]:
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import PunktWordTokenizer
#from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import EnglishStemmer
from nltk.stem.snowball import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
english_stops = set(stopwords.words('english'))
def tokenizeString(string, lower=True, tokenizer="wordpunct"):
    """Tokenise a string with the chosen tokenizer, optionally lowercasing the tokens."""
    if tokenizer == "wordpunct":
        tokenized = WordPunctTokenizer().tokenize(string)
        if lower == True:
            tokenized = [w.lower() for w in tokenized]
    if tokenizer == "punktword":
        tokenized = PunktWordTokenizer().tokenize(string)
        if lower == True:
            tokenized = [w.lower() for w in tokenized]
    return tokenized

def cleanVector(tokens, clean=True, stopremove=True, minlen=2):
    """Drop tokens containing punctuation, stopwords, and tokens shorter than minlen."""
    output = []
    disallowedchar = set(["!", "?", '"', "'", ",", ".", ":", ";"])
    english_stops = set(stopwords.words('english'))
    for i in tokens:
        found = False
        if len(set(i).intersection(disallowedchar)) > 0:
            found = True
        if found == False and stopremove == False:
            output.append(i)
        if found == False and stopremove == True and minlen == 0:
            if i not in english_stops:
                output.append(i)
        if found == False and stopremove == True and minlen > 0:
            if i not in english_stops and len(i) >= minlen:
                output.append(i)
    return output

def stemVector(vector, method="lemmatize"):
    """Lemmatise or stem each token with the chosen method."""
    output = []
    if method == 'lemmatize':
        wnl = WordNetLemmatizer()
        for i in vector:
            output.append(wnl.lemmatize(i))
    if method == 'snowball':
        st = EnglishStemmer()
        for i in vector:
            output.append(st.stem(i))
    if method == 'porter':
        st = PorterStemmer()
        for i in vector:
            output.append(st.stem(i))
    if method == 'lancaster':
        st = LancasterStemmer()
        for i in vector:
            output.append(st.stem(i))
    return output

def tokeniseCleanStem(inputstring):
    """Full preprocessing pipeline: tokenise, clean, then lemmatise."""
    return stemVector(cleanVector(tokenizeString(inputstring)))
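# A quick sanity check on a made-up sentence (illustrative only): the pipeline
# should return lowercased, punctuation- and stopword-free lemmas
print tokeniseCleanStem("We provide B2B lead generation services.")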
os.chdir(datadir)
descpklfile = "processeddescriptions.pkl"
descfolderpath = os.path.join(datadir, descpklfile)
if os.path.exists(descfolderpath):
    print("Pickle file containing preprocessed company data found. Loading it...")
    descdf = pickle.load(open(descfolderpath, 'rb'))
else:
    print("Cleaning, tokenising and lemmatising company data text...")
    descdf['Tokens'] = descdf['CompanyDescription'].apply(lambda x: tokeniseCleanStem(x))
    with open(descpklfile, 'wb') as output:
        pickle.dump(descdf, output, pickle.HIGHEST_PROTOCOL)
os.chdir(rootdir)
descdf
#print descdf['Tokens']
Out[16]:
In [17]:
from gensim import corpora,models
dictionary = corpora.Dictionary(descdf['Tokens'])
print dictionary
#print(dictionary.token2id)
corpus = [dictionary.doc2bow(text) for text in descdf['Tokens']]
#print(corpus)
tfidfmodel = models.TfidfModel(corpus)
# Apply it to the input corpus
tfidfcorpus = tfidfmodel[corpus]
#print(tfidfcorpus)
dictpath = os.path.join(datadir,'companies.dict')
dictionary.save(dictpath)
corpuspath = os.path.join(datadir,'corpus.mm')
corpora.MmCorpus.serialize(corpuspath, corpus)
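# A minimal sketch of reloading the persisted artifacts in a later session
# (this mirrors the commented-out loading lines in the next cell):
#dictionary = corpora.Dictionary.load(dictpath)
#mm = corpora.MmCorpus(corpuspath)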
In [21]:
import logging
logging.basicConfig(filename='companies.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
#id2word = corpora.Dictionary.load_from_text(dictpath)
id2word = dictionary
#mm = corpora.MmCorpus(corpuspath)
mm = tfidfcorpus
lda = models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=2, update_every=1, chunksize=10000, passes=10)
ldapath = os.path.join(datadir,'companies_lda.model')
lda.save(ldapath)
lda.print_topics(10)
Out[21]:
In [22]:
from gensim import similarities
query = "Electronics appliances"
query = dictionary.doc2bow(tokeniseCleanStem(query))
# Apply the LDA model trained on the corpus to the query
query_lda = lda[query]
print "\nThe topic distribution of the query over the computed topics is:\n"
print(query_lda)
index = similarities.MatrixSimilarity(lda[tfidfcorpus])
print "\n\nThe similarity of the query to the documents in the corpus is:\n"
sims = index[query_lda]  # perform a similarity query against the corpus
resultlist = list(enumerate(sims))
print(resultlist)
print "\n\nThe company which best fits the query by LDA-deduced topics is:\n"
resultlist.sort(key=lambda x: x[1], reverse=True)
result = resultlist[0][0]
print descdf.iloc[result]
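# A minimal extension: list the top three matches rather than just the best one
for idx, score in resultlist[:3]:
    print descdf.iloc[idx]['CompanyName'], score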