In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os,sys
#import csv
import pandas as pan
import cPickle as pickle
import pprint
#import glob
#import tables #PyTables used to generate HDF5 file instead of pickle
%matplotlib inline
The Companies House data set is used as a list of source companies that could be of interest for B2B lead generation. Obtain the data set here: http://download.companieshouse.gov.uk/en_output.html
First we read the data set into a pandas DataFrame and serialise it to a pickle file.
In [2]:
rootdir = "/home/ilan/Desktop/GI_interview_project"
datadir = "/home/ilan/Desktop/GI_interview_project/company_data"
os.chdir(datadir)
pklfile = "data.pkl"
#hffile = "data.h5"
folderpath = os.path.join(datadir, pklfile)
#folderpath = os.path.join(rootdir, hffile)
if os.path.exists(folderpath):
    print("Pickle file containing data found. Loading it...")
    # Pickles written with HIGHEST_PROTOCOL are binary, so open in 'rb' mode
    data = pickle.load(open(folderpath, 'rb'))
    #data = tables.open_file(folderpath, driver="H5FD_CORE")
else:
    print("Reading in csv files and creating pickle...")
    filenames = ['BasicCompanyData-2015-05-01-part1_5.csv', 'BasicCompanyData-2015-05-01-part2_5.csv',
                 'BasicCompanyData-2015-05-01-part3_5.csv', 'BasicCompanyData-2015-05-01-part4_5.csv',
                 'BasicCompanyData-2015-05-01-part5_5.csv']
    # Read each csv into a DataFrame and concatenate them into a single one
    list_ = []
    for i in filenames:
        data = pan.read_csv(i, delimiter=',', index_col=False)
        list_.append(data)
        #print data.head(1)
    data = pan.concat(list_)
    # Remove dots and whitespace from column titles
    colnames = [str(i).replace('.', '_').strip() for i in list(data.columns.values)]
    data.columns = colnames
    with open(pklfile, 'wb') as output:
        pickle.dump(data, output, pickle.HIGHEST_PROTOCOL)
os.chdir(rootdir)
data
data
Out[2]:
To get a feel for the data set, we do some basic exploration.
In [3]:
print data.columns
print data.size
data.describe()
Out[3]:
In [4]:
# All the company categories in the data, and their counts
categorycounts=data['CompanyCategory'].value_counts()
print categorycounts
categorycounts.plot(kind='bar')
Out[4]:
In [5]:
# All the company statuses in the data, and their counts
statuscounts=data['CompanyStatus'].value_counts()
print statuscounts
statuscounts.plot(kind='bar')
Out[5]:
In [6]:
class Mask(object):
    def __init__(self, df, field, match):
        self.df = df
        self.field = field
        self.match = match
        self.function = lambda x, y, z: x.loc[x[y] == z]
    def __call__(self):
        return self.function(self.df, self.field, self.match)
        #return self.df.loc[self.df[self.field] == self.match]

#data[data.CompanyName == "! LTD"]
#data.loc[data["CompanyName"] == "! LTD"]
result = Mask(data, "CompanyName", "! LTD")
print result()
In [7]:
class booleanMask(object):
    def __init__(self, function):
        self.function = function
    #def __and__(self, other):
    #    self.function = self.function & other.function
    def __call__(self, df):
        # Applying the stored lambda to the DataFrame yields a boolean Series
        self.df = df
        return map(self.function, [self.df])[0]

company_mask = booleanMask(lambda x: x.CompanyName == "! LTD")
##print company_mask(data)
print data[company_mask(data)]
# MASKS CAN NOW BE COMBINED
#uk_mask = booleanMask(lambda x: x.RegAddress_Country == "UNITED KINGDOM")
#active_mask = booleanMask(lambda x: x.CompanyStatus == "Active")
#print data[uk_mask(data) & active_mask(data)]
# FOR VALIDATION TO MAKE SURE BOOLEANMASK IS GIVING WHAT WE EXPECT
#data.loc[(data["RegAddress_Country"] == "UNITED KINGDOM") & (data["CompanyStatus"] == "Active")]
#print len(data.loc[(data["RegAddress_Country"] == "UNITED KINGDOM") & (data["CompanyStatus"] == "Active")])
#print len(data[uk_mask(data) & active_mask(data)])
#print map(lambda x: x.CompanyName == "! LTD", [data])
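# A minimal sketch (the commented example above, made live): each booleanMask
# call returns a plain boolean Series, so masks combine with '&' and '|'
uk_mask = booleanMask(lambda x: x.RegAddress_Country == "UNITED KINGDOM")
active_mask = booleanMask(lambda x: x.CompanyStatus == "Active")
print len(data[uk_mask(data) & active_mask(data)])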
In [8]:
# DEFINE A REDUCED DATASET FOR PROTOTYPING
from random import sample
# number/fraction of entries to use
#ents = int(len(X)*0.1)
ents = 100
# Take a random sample of row indices (range(len(data)) covers every row;
# range(0, len(data)-1) would always miss the last one)
smalldataind = sample(range(len(data)), ents)
#print smalldataind
# Alternatively, use a fixed hand-picked sample:
#smalldataind = [784400, 333248, 3037529, 333413, 1851904, 1569996, 2958604, 769824, 2848095, 896580]
smalldata = data.iloc[smalldataind]
smalldata
Out[8]:
In [9]:
# svn checkout http://pygoogle.googlecode.com/svn/trunk pygoogle-read-only
# python setup.py build
# sudo python setup.py install
from pygoogle import pygoogle
from time import sleep
from pprint import pprint
#g = pygoogle('! LTD company')
#g.pages = 1
#print '*Found %s results*'%(g.get_result_count())
#g.get_urls()
#print list(smalldata['CompanyName'].values)
compnames = list(smalldata['CompanyName'].values)
#compadds = list(smalldata['RegAddress_AddressLine1'].values)
compadds = list(smalldata['RegAddress_PostCode'].values)
#compadds = [i.split(' ')[0] for i in list(smalldata['RegAddress_PostCode'].values)]
#urls = []
#counter = 0
#for i,j in zip(compnames,compadds):
# g = pygoogle(i+' contact '+j)
# g.pages = 1
# urls.append(g.get_urls())
# counter += 1
# sleep(np.random.uniform(5,10))
#print urls
os.chdir(datadir)
urlpklfile="URLs.pkl"
urlfolderpath=os.path.join(datadir,urlpklfile)
if os.path.exists(urlfolderpath):
    print("Pickle file containing URL data found. Loading it...")
    urls = pickle.load(open(urlfolderpath, 'rb'))
else:
    print("Fetching company URLs from Google...")
    urls = []
    counter = 0
    for i, j in zip(compnames, compadds):
        # Query Google for '<company name> contact <postcode>'
        g = pygoogle(i + ' contact ' + j)
        g.pages = 1
        urls.append(g.get_urls())
        # Checkpoint the results every 10 queries
        if (counter % 10 == 0):
            with open(urlpklfile, 'wb') as output:
                pickle.dump(urls, output, pickle.HIGHEST_PROTOCOL)
        counter += 1
        # Randomised pause between requests to avoid being blocked
        sleep(np.random.uniform(5, 10))
    with open(urlpklfile, 'wb') as output:
        pickle.dump(urls, output, pickle.HIGHEST_PROTOCOL)
os.chdir(rootdir)
#urls = [[u'https://www.facebook.com/andrea.shaw.564', u'https://www.facebook.com/dianne.schultz1', u'http://www.192.com/atoz/business/brentwood/financial--advisers--(independent)/', u'https://classictvhistory.wordpress.com/tag/have-gun-will-travel/', u'http://i.dujour.com/december-print/', u'http://www.greenvillecountybar.org/Gbar_News_PDF/2014/122014.pdf', u'http://dartmouthalumnimagazine.com/class-notes/1970/all', u'http://www.dls.org/pdf/magazine/october_2007_magazine.pdf'], [u'http://www.city-data.com/clackamas-county/D/Delenka-Lane-2.html', u'http://law.justia.com/cases/alaska/supreme-court/2011/', u'https://www.facebook.com/htmody', u'https://www.facebook.com/terry.meyers.5', u'http://www.ciwf.com/media/1141326/outofsight-full-report.pdf', u'http://www.losfoundation.org/wp-content/uploads/2013/06/Donors-2011_2012.pdf', u'http://svcf.org/help/recognition/', u'https://www.ipo.gov.uk/t-tmj/tm-journals/2015-007/owner.html'], [u'https://www.sc.com/uk/contact-us/', u'https://www.sc.com/en/contact-us/', u'https://www.sc.com/je/contact-us/index.html', u'https://www.sc.com/hk/investor-relations/_documents/en/news/20130905d.pdf', u'http://www.aim25.ac.uk/cgi-bin/vcdf/detail?coll_id=18442&inst_id=118&nv1=search&nv2=', u'http://www.bloomberg.com/research/stocks/people/person.asp?personId=8307423&ticker=STAN:LN', u'http://www.hkexnews.hk/listedco/listconews/sehk/2015/0519/LTN20150519338.pdf', u'http://www.sebi.gov.in/dp/stdchtdrhp.pdf'], [u'https://www.facebook.com/theoldglovefactorymarketplace', u'https://www.grinnell.edu/about/visit/spaces/old-glove-factory', u'http://en.wikipedia.org/wiki/GlaxoSmithKline', u'http://www.dailykos.com/story/2013/01/06/1163848/-KosAbility-Trying-to-Clean-Out-an-Old-House-with-Arthritis-and-Asthma', u'http://www.cdc.gov/NCEH/publications/books/housing/cha05.htm', u'http://www.slideshare.net/MedlineIndustriesInc/surgical-gloves-a-comprehensive-guide', u'http://www.cpsc.gov/pagefiles/112284/5015.pdf', u'http://ftp.asahq.org/publicationsAndServices/latexallergy.pdf'], [u'http://www.thegsa.co.za/index.php?nav=destination_country&view=28', u'https://www.facebook.com/anna.brass1'], [], [u'http://books.openedition.org/obp/326', u'http://www.hrblock.com/tax-offices/local-offices/#!/en/office-profile/12546', u'http://www.caicv.org/dev/data/fckeditor/cms/file/Quorum_July2010WEB.pdf', u'https://play.google.com/store/apps/details?id=com.mhriley.spendingtracker&hl=en', u'https://www.facebook.com/walter.kajer.1', u'http://duchyofcornwall.org/assets/images/documents/Poundbury_Factsheet_2013.pdf', u'http://www.lihp.org/Content/2011 annual report.pdf', u'http://www.kildare.ie/business/directory/list-companies.asp?Category=Business Services'], [u'http://cera.govt.nz/sites/default/files/common/tc3-residential-rebuild-booklet-A4-20121204.pdf', u'http://www.thomsonlocal.com/Funeral-Directors/in/Surrey/', u'http://www.britishculinaryfederation.co.uk/bcf/wp-content/uploads/2011/06/091124_Culinary_News_December_v6.pdf', u'http://www.hackney.gov.uk/Assets/Documents/ht276.pdf', u'http://www.insightpublications.com.au/pdf_preview/isp-julius-caesar-10-pages.pdf', u'http://www.tripadvisor.co.uk/Hotel_Review-g191252-d491974-Reviews-Trimstone_Manor_Country_House_Hotel-Ilfracombe_Devon_England.html', u'http://www.lincoln.ac.nz/Documents/LEaP/WMK ICRF Final May 2013.pdf', u'http://delvinvillage.com/directory/'], [u'http://www.deloitte.com/', u'http://www.schencksc.com/2015rpctour/', u'http://www.schencksc.com/2013recforum/', u'https://www.linkedin.com/in/jeffreyshlefstein', 
u'http://www.aicpa.org/BecomeACPA/Pages/InternshipsandCooperativePrograms.aspx', u'http://www.freshbooks.com/accountants/map', u'http://www.mncpa.org/find-a-cpa/cpa-yellow-pages/list.aspx?l=c', u'http://cdn.colorado.gov/cs/Satellite?blobcol=urldata&blobheadername1=Content-Disposition&blobheadername2=Content-Type&blobheadervalue1=inline;+filename="March+28,+2007+Board+Meeting+Minutes.pdf"&blobheadervalue2=application/pdf&blobkey=id&blobtable=MungoBlobs&blobwhere=1251832310203&ssbinary=true'], []]
#urls =[[u'http://www.192.com/atoz/business/brentwood/financial--advisers--(independent)/', u'http://www.ucl.ac.uk/consultants/homepage'], [u'http://www.contactps.ca/', u'https://411.ca/business/profile/7759616'], [u'https://www.sc.com/en/contact-us/', u'https://www.sc.com/', u'https://www.sc.com/je/contact-us/index.html', u'https://www.sc.com/hk/investor-relations/_documents/en/news/20090902a.pdf', u'http://www.sebi.gov.in/dp/stdchtdrhp.pdf', u'http://www.bloomberg.com/research/stocks/people/person.asp?personId=8307423&ticker=STAN:LN', u'http://vpr.hkma.gov.hk/pdf/100269/fd_int/fd_int_0613_pt01.pdf', u'http://www.fogl.com/fogl/uploads/companypresentations/annual_report_2012.pdf'], [], [u'https://openaccess.adb.org/bitstream/handle/11540/1651/Volume 28_No 2_2011_06.pdf?sequence=1', u'http://yourtireshopsupply.com/manufacturer/27/grey-pneumatic-corp', u'https://www.facebook.com/people/\xe0\xb8\xa8\xe0\xb8\xb4\xe0\xb8\xa3\xe0\xb8\xb4\xe0\xb8\xa3\xe0\xb8\xb1\xe0\xb8\x95\xe0\xb8\x99\xe0\xb9\x8c-\xe0\xb8\x97\xe0\xb8\xa7\xe0\xb8\xb4\xe0\xb8\xa7\xe0\xb8\xb1\xe0\xb8\x92\xe0\xb8\x99\xe0\xb9\x8c/100004117395751', u'https://th-th.facebook.com/donnapa.apple', u'https://www.facebook.com/sasesopit.muttamara', u'https://th-th.facebook.com/KLShopbymarie', u'https://th-th.facebook.com/soraya.lomsungnoen.1', u'https://th-th.facebook.com/namthip.bunthong.7'], [u'http://agra-alliance.org/download/53396d7f2a934/', u'https://www.africare.org/wp-content/uploads/2014/08/AFSRNo4_BrysonEley_SuccessStoryGuide_Final_Jan7_2008_updated_June08.pdf'], [u'https://www.clearbooks.co.uk/directory/business', u'https://www.tapa.co.uk/the-tapa-opt-out-ledger.php', u'http://www.dailymail.co.uk/health/article-1330839/Blundering-doctors-leave-mother-terrified-falsely-diagnosing-brain-haemorrhage.html'], [u'http://www.priorygroup.com/location-results/item/the-priory-hospital-glasgow', u'http://www.yell.com/biz/1st-choice-plumbing-and-heating-glasgow-901468909/', u'https://www2.deloitte.com/content/dam/Deloitte/global/Documents/Consumer-Business/gx-cb-global-powers-of-retailing.pdf', u'http://www.rightmove.co.uk/property-for-sale/property-30497721.html', u'http://www.hazelwood.glasgow.sch.uk/', u'https://plus.google.com/+Paranetuklimited', u'http://www.kinningparkcomplex.org/projects-overview/bike-project/', u'https://www.glasgow.gov.uk/CHttpHandler.ashx?id=14911&p=0'], [u'http://www.scleeaccountant.com/', u'http://www.192.com/places/sk/sk8-1/sk8-1nq/', u'https://www.icpas.org/hc-career-center.aspx?id=21550', u'https://www.linkedin.com/pub/leona-crouch/26/b42/b17', u'http://www.burkertvaluation.com/wp-content/uploads/2014/04/Rpb-Vitae_General.pdf', u'http://www.alec.co.uk/cvtips/examgrcv.htm', u'http://www.chaos.umd.edu/misc/origplates.html', u'http://www.atiner.gr/bio/Syrrakos.doc'], [u'https://uk.linkedin.com/pub/david-wasilewski/27/143/368']]
# TO USE A HAND-PICKED SET OF URLS TO AVOID REPEAT REQUESTS TO GOOGLE, WHICH GET YOU BLOCKED
urls = [[u'http://www.192.com/atoz/business/brentwood/financial--advisers--(independent)/'], [u'http://www.plantmethods.com/content/10/October/2014', u'http://www.plantmethods.com/content?page=2&itemsPerPage=25'], [u'https://www.sc.com/uk/contact-us/', u'https://www.sc.com/en/contact-us/', u'https://www.sc.com/je/contact-us/index.html', u'https://www.sc.com/hk/investor-relations/_documents/en/news/20130905d.pdf', u'https://www.sc.com/hk/investor-relations/_documents/en/news/20140520b.pdf', u'http://www.bloomberg.com/research/stocks/people/person.asp?personId=8307423&ticker=STAN:LN', u'http://www.sebi.gov.in/dp/stdchtdrhp.pdf', u'http://www.hkexnews.hk/listedco/listconews/sehk/2015/0519/LTN20150519338.pdf'], [u'http://www.nhs.uk/Services/Trusts/Pharmacies/DefaultView.aspx?id=89768', u'http://www.boots.com/'], [], [], [u'https://www.xero.com/', u'http://www.sage.com/'], [u'http://www.mastercard.us/', u'http://www.baxterstorey.co.uk/'], [u'http://www.192.com/places/sk/sk8-1/sk8-1nq/', u'http://www.ey.com/', u'http://www.grantthornton.com/'], []]
#print len(urls)
#pprint(urls)
#filteredurls = urls[:]
#for count,i in enumerate(filteredurls[:]):
# for j in i:
# print j
# if ('contact' not in j):
# filteredurls[count].remove(j)
# print "NOT FOUND"
#print j
#print filteredurls[count]
#print filteredurls
# This one exceeds maximum recursion
#def empty(seq):
# try:
# return all(map(empty, seq))
# except TypeError:
# return False
def empty(seq):
    """Check if a nested list (list of lists) is completely empty, if so return True."""
    containslist = []
    # range(len(seq)) covers every sublist; range(0, len(seq)-1) would skip the last one
    for i in range(len(seq)):
        if seq[i]:
            containslist.append(False)
        else:
            containslist.append(True)
    if (False in containslist):
        return False
    else:
        return True

def filtering(initem):
    """If the string 'contact' is in the URL, split on it and keep the first part, else return an empty list."""
    if ('contact' in initem):
        return initem.split('contact')[0]
    else:
        return []
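# For illustration: filtering() keeps everything before the first 'contact' in
# a URL, and empty() spots completely empty nested lists
print filtering('https://www.sc.com/uk/contact-us/')  # -> 'https://www.sc.com/uk/'
print empty([[], []])     # -> True
print empty([['x'], []])  # -> False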
filteredurls = [np.nan]*len(urls)
# Again use range(len(urls)) so the last entry gets filtered too
for i in range(len(urls)):
    filteredurls[i] = [filtering(j) for j in urls[i]]
    if empty(filteredurls[i]):
    #if not filteredurls[i]:
        filteredurls[i] = np.nan
#pprint(filteredurls)
#filteredurls = urls[:]
#for i,j in enumerate(urls):
# toremove = [k for k in urls[i] if 'contact' not in urls[i]]
# for l in j:
# if(j in toremove):
# filteredurls[i].remove(j)
#print filteredurls
d = {'CompanyName' : pan.Series(compnames), 'CompanyAddress1' : pan.Series(compadds), 'URLs' : pan.Series(filteredurls)}
dfurls = pan.DataFrame(d)
dfurls
#urls = [pygoogle(i).get_urls()[0] for i in list(smalldata['CompanyName'].values)]
#print urls
#smalldata['WebURL'] = Series([pygoogle(i).get_urls()[0] for i in data['CompanyName']], index=smalldata.index)
#compnames = smalldata.iterrows()[1]
#print compnames
#for i in range(0,len(smalldata)-1):
Out[9]:
In [10]:
import re
from mechanize import Browser
# http://stackoverflow.com/questions/1011975/how-to-get-links-on-a-webpage-using-mechanize-and-open-those-links
def findAboutUs(inputlink):
    """Given an initial (hopefully, homepage) URL, look for an 'About Us' link; if none is found just return the initial URL."""
    # NaN compares unequal to everything, including itself, so 'inputlink == np.nan'
    # is always False; use pan.isnull() to catch missing links
    if pan.isnull(inputlink):
        return np.nan
    #print inputlink
    br = Browser()
    br.open(inputlink)
    aboutuslinks = []
    #br.links(url_regex="about")
    #br.links(text_regex="About( us)?")
    for link in br.links(text_regex="About"):
        #print inputlink, link.url
        aboutuslinks.append(link)
        #br.follow_link(link)  # takes EITHER Link instance OR keyword args
        #br.back()
    #print aboutuslinks
    # http://stackoverflow.com/questions/10994251/mechanize-urllib-beautifulsoup-relative-paths
    # Mechanize often returns relative links, split into .base_url and .url; we join them -if necessary- here
    for i, j in enumerate(aboutuslinks):
        domain = re.search('(http:\/\/.*\.\D+?|https:\/\/.*\.\D+?)\/', j.base_url.strip())
        if domain:
            domain = domain.group(1)
        if re.search('mailto', j.url.strip()) != None:
            # mailto links are not pages; blank them so the 'about' filter below drops them
            # (the original 'pass' left u unbound or stale here)
            aboutuslinks[i] = ''
            continue
        elif re.search('(http:\/\/.*\.\D+?|https:\/\/.*\.\D+?)\/', j.url.strip()) != None:
            # Already an absolute URL
            u = j.url.strip()  #.encode('utf8')
        elif re.search('^/', j.url.strip()) != None:
            # Root-relative URL: prepend the domain
            u = domain + j.url.strip()  #.encode('utf8')
        else:
            # Relative URL: prepend the domain and a separator
            u = domain + '/' + j.url.strip()  #.encode('utf8')
        aboutuslinks[i] = u
    # Some non-About Us links somehow still make it here; filter them out by requiring an 'about' in the URL
    #print aboutuslinks
    aboutuslinks = [i for i in aboutuslinks if 'about' in i]
    #print aboutuslinks
    # If multiple 'About Us' links are found (sometimes duplicates), take the first one only
    if (aboutuslinks and isinstance(aboutuslinks, list)):
        aboutuslink = aboutuslinks[0]
    else:
        aboutuslink = aboutuslinks
    # If no 'About Us' link is found, return the initial (input) link
    if aboutuslink:
        return aboutuslink
    else:
        return inputlink

#print findAboutUs("https://www.sc.com/uk/")
print findAboutUs("http://www.growthintel.com")
In [11]:
#from lxml import html
#import requests
#page = requests.get('https://www.sc.com/uk/')
#tree = html.fromstring(page.text)
#print tree
#from BeautifulSoup import BeautifulSoup
#import bs4
from bs4 import BeautifulSoup
import urllib
def retrieveText(inputlink):
    """Fetch the visible text from a link to an HTML page."""
    # As above, '== np.nan' is always False; use pan.isnull() instead
    if pan.isnull(inputlink):
        return np.nan
    html = urllib.urlopen(inputlink).read()
    soup = BeautifulSoup(html)
    texts = soup.findAll(text=True)
    # http://stackoverflow.com/questions/1936466/beautifulsoup-grab-visible-webpage-text
    #def visible(element):
    #    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
    #        return False
    #    elif isinstance(element, Comment):
    #    #elif re.match('<!--.*-->', str(element)):
    #        return False
    #    return True
    #visible_texts = filter(visible, texts)
    # Strip the non-visible elements out of the soup, then extract the remaining text
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText()
    return visible_text

#print retrieveText('https://www.sc.com/uk/about-us/index.html')
print retrieveText('http://www.growthintel.com/about-us/')
In [12]:
def createDescription(inputlink):
    """Chain findAboutUs() and retrieveText() to obtain a company description from an input link."""
    # If a list of candidate links is passed, use the first one
    if (isinstance(inputlink, list)):
        inputlink = inputlink[0]
    if pan.isnull(inputlink):
        return np.nan
    else:
        link = findAboutUs(inputlink)
        text = retrieveText(link)
        return text

#print createDescription(np.nan)
#print createDescription("https://www.sc.com/uk/")
testlink = "http://www.growthintel.com"
print createDescription(testlink)
In [13]:
##dfurls = dfurls.drop('CompanyDescription', 1)
#print dfurls[ pan.notnull(dfurls['URLs']) ]
#dfurls['AboutUsURL'] = dfurls['URLs'].apply(lambda x: findAboutUs(x))
In [14]:
#dfurls = dfurls.drop('AboutUsURL', 1)
#dfurls['CompanyDescription'] = dfurls['URLs'].apply(lambda x: createDescription(x))
#dfurls
#print dfurls.ix[dfurls['CompanyName'] == 'STANDARD CHARTERED NOMINEES LIMITED', 'CompanyDescription'].values
#os.chdir(datadir)
#descpklfile="descriptions.pkl"
#descfolderpath=os.path.join(datadir,descpklfile)
#if (os.path.exists(descfolderpath)==True):
# print("Pickle file containing company descriptions data found. Loading it...")
# dfurls=pickle.load(open(descfolderpath,'r'))
#else:
# print("Fetching company descriptions...")
# dfurls['CompanyDescription'] = dfurls['URLs'].apply(lambda x: createDescription(x))
# with open(descpklfile,'wb') as output:
# pickle.dump(dfurls, output, pickle.HIGHEST_PROTOCOL)
#os.chdir(rootdir)
#dfurls
In [13]:
AboutUsURLs = [["McKinsey & Company", "http://www.mckinsey.com/about_us"], ["The White Company", "http://www.thewhitecompany.com/help/our-story/"], ["Marks & Spencer", "http://corporate.marksandspencer.com/aboutus"], ["Kids Company", "http://www.kidsco.org.uk/about-us"], ["Thunderhead", "http://www.thunderhead.com/what-we-do/about-us/"], ["Aston Martin", "https://www.astonmartin.com/en/company/about-us"], ["Bicester Village", "http://www.bicestervillage.com/en/company/about-us"], ["Solarcentury", "http://www.solarcentury.com/uk/about-solarcentury/"], ["Student Loans Company", "http://www.slc.co.uk/about-us.aspx"], ["The Stationers' Company", "https://stationers.org/about.html"], ["Royal Shakespeare Company", "http://www.rsc.org.uk/about-us/"], ["Snell", "http://www.snellgroup.com/company/about-us/"], ["The Wax Chandlers Company", "http://www.waxchandlers.org.uk/about-us/index.php"], ["Expeditors", "http://www.expeditors.com/our-company/about-us.asp"], ["The Carbon Neutral Company", "http://www.carbonneutral.com/about-us"], ["The Pewterers' Company", "http://www.pewterers.org.uk/the_company/aboutus.html"], ["Vauxhall", "http://www.vauxhall.co.uk/about-vauxhall/about-us/company.html"], ["EE", "http://ee.co.uk/our-company/about-ee"], ["Candoco Dance Company", "http://www.candoco.co.uk/about-us/"], ["Victrex", "http://www.victrex.com/en/company/about-us"], ["Ensus", "http://www.ensus.co.uk/Company/About_us/"], ["Anglian Water", "http://www.anglianwater.co.uk/about-us/"], ["The Cheque and Credit Clearing Company", "http://www.chequeandcredit.co.uk/about_us/"], ["Vodafone", "http://www.vodafone.co.uk/about-us/company-history/"], ["People 1st","http://www.people1sttraining.co.uk/about-us"], ["Starbucks","http://www.starbucks.co.uk/about-us"], ["Merlin Entertainments","http://www.merlinentertainments.biz/about-us"], ["Bloomsbury Publishing","http://www.bloomsbury.com/uk/company/about-us/"], ["Alcatel One Touch","http://www.alcatelonetouch.com/global-en/company/aboutus.html"], ["Masons Kings","http://masonkings.jd-dealer.co.uk/About-us/Our-Company"], ["Oxford Bus Company","http://www.oxfordbus.co.uk/about-us/"], ["Patient.co.uk","http://www.patient.co.uk/about-us"], ["Bootstrap Company","http://www.bootstrapcompany.co.uk/about-us/"], ["Fusion Furniture","http://www.fusionfurniturecompany.co.uk/about.php"], ["Siemens","http://www.siemens.co.uk/en/about_us/"], ["Bosch UK","http://www.bosch.co.uk/en/uk/about_bosch_home_2/about-bosch-in-great-britain.php#"], ["Qualcomm","https://www.qualcomm.com/company/about"], ["Apple","https://www.apple.com/about/"], ["Mercedes-Benz UK","http://www2.mercedes-benz.co.uk/content/unitedkingdom/mpc/mpc_unitedkingdom_website/en/home_mpc/passengercars/home/passenger_cars_world/about_us.html"], ["IBM UK","http://www.ibm.com/ibm/uk/en/"], ["Google","https://www.google.co.uk/about/"], ["Intel","http://www.intel.com/content/www/us/en/company-overview/company-overview.html"], ["ebay","http://pages.ebay.co.uk/aboutebay.html"], ["WebMD","http://www.webmd.com/about-webmd-policies/about-who-we-are"], ["Growth Intelligence","http://www.growthintel.com/about-us/"] ]
#pprint(AboutUsURLs)
print len(AboutUsURLs)
cnames = [i for i,j in AboutUsURLs]
caboutusurls = [j for i,j in AboutUsURLs]
#print cnames
descdict = {'CompanyName' : pan.Series(cnames), 'AboutUsURL' : pan.Series(caboutusurls)}
descdf = pan.DataFrame(descdict)
descdf
Out[13]:
In [14]:
os.chdir(datadir)
descpklfile = "descriptions.pkl"
descfolderpath = os.path.join(datadir, descpklfile)
if os.path.exists(descfolderpath):
    print("Pickle file containing company descriptions data found. Loading it...")
    descdf = pickle.load(open(descfolderpath, 'rb'))
else:
    print("Fetching company descriptions...")
    descdf['CompanyDescription'] = descdf['AboutUsURL'].apply(lambda x: retrieveText(x))
    with open(descpklfile, 'wb') as output:
        pickle.dump(descdf, output, pickle.HIGHEST_PROTOCOL)
os.chdir(rootdir)
descdf
Out[14]:
In [15]:
descdf
#print descdf.ix[descdf['CompanyName'] == 'Starbucks', 'CompanyDescription'].values
print descdf.ix[descdf['CompanyName'] == 'Starbucks', 'CompanyDescription'].values[0].encode('utf-8')
In [16]:
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import PunktWordTokenizer
#from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import EnglishStemmer
from nltk.stem.snowball import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
english_stops = set(stopwords.words('english'))
def tokenizeString(string, lower=True, tokenizer="wordpunct"):
    """Tokenise a string with the chosen tokenizer, optionally lowercasing the tokens."""
    if tokenizer == "wordpunct":
        tokenized = WordPunctTokenizer().tokenize(string)
        if lower == True:
            tokenized = [w.lower() for w in tokenized]
    if tokenizer == "punktword":
        tokenized = PunktWordTokenizer().tokenize(string)
        if lower == True:
            tokenized = [w.lower() for w in tokenized]
    return tokenized

def cleanVector(tokens, clean=True, stopremove=True, minlen=2):
    """Drop tokens containing punctuation, stopwords, and tokens shorter than minlen."""
    output = []
    disallowedchar = set(["!", "?", '"', "'", ",", ".", ":", ";"])
    english_stops = set(stopwords.words('english'))
    for i in tokens:
        found = False
        if len(set(i).intersection(disallowedchar)) > 0:
            found = True
        if found == False and stopremove == False:
            output.append(i)
        if found == False and stopremove == True and minlen == 0:
            if i not in english_stops:
                output.append(i)
        if found == False and stopremove == True and minlen > 0:
            if i not in english_stops and len(i) >= minlen:
                output.append(i)
    return output

def stemVector(vector, method="lemmatize"):
    """Lemmatise or stem each token with the chosen method."""
    output = []
    if method == 'lemmatize':
        wnl = WordNetLemmatizer()
        for i in vector:
            output.append(wnl.lemmatize(i))
    if method == 'snowball':
        st = EnglishStemmer()
        for i in vector:
            output.append(st.stem(i))
    if method == 'porter':
        st = PorterStemmer()
        for i in vector:
            output.append(st.stem(i))
    if method == 'lancaster':
        st = LancasterStemmer()
        for i in vector:
            output.append(st.stem(i))
    return output

def tokeniseCleanStem(inputstring):
    """Full preprocessing pipeline: tokenise, clean, then lemmatise."""
    return stemVector(cleanVector(tokenizeString(inputstring)))
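# A quick sanity check on a made-up sentence (illustrative only): the pipeline
# should return lowercased, punctuation- and stopword-free lemmas
print tokeniseCleanStem("We provide B2B lead generation services.")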
os.chdir(datadir)
descpklfile = "processeddescriptions.pkl"
descfolderpath = os.path.join(datadir, descpklfile)
if os.path.exists(descfolderpath):
    print("Pickle file containing preprocessed company data found. Loading it...")
    descdf = pickle.load(open(descfolderpath, 'rb'))
else:
    print("Cleaning, tokenising and lemmatising company data text...")
    descdf['Tokens'] = descdf['CompanyDescription'].apply(lambda x: tokeniseCleanStem(x))
    with open(descpklfile, 'wb') as output:
        pickle.dump(descdf, output, pickle.HIGHEST_PROTOCOL)
os.chdir(rootdir)
descdf
#print descdf['Tokens']
Out[16]:
In [17]:
from gensim import corpora,models
dictionary = corpora.Dictionary(descdf['Tokens'])
print dictionary
#print(dictionary.token2id)
corpus = [dictionary.doc2bow(text) for text in descdf['Tokens']]
#print(corpus)
tfidfmodel = models.TfidfModel(corpus)
# Apply it to the input corpus
tfidfcorpus = tfidfmodel[corpus]
#print(tfidfcorpus)
dictpath = os.path.join(datadir,'companies.dict')
dictionary.save(dictpath)
corpuspath = os.path.join(datadir,'corpus.mm')
corpora.MmCorpus.serialize(corpuspath, corpus)
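# A minimal sketch of reloading the persisted artifacts in a later session
# (this mirrors the commented-out loading lines in the next cell):
#dictionary = corpora.Dictionary.load(dictpath)
#mm = corpora.MmCorpus(corpuspath)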
In [21]:
import logging
logging.basicConfig(filename='companies.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
#id2word = corpora.Dictionary.load_from_text(dictpath)
id2word = dictionary
#mm = corpora.MmCorpus(corpuspath)
mm = tfidfcorpus
lda = models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=2, update_every=1, chunksize=10000, passes=10)
ldapath = os.path.join(datadir,'companies_lda.model')
lda.save(ldapath)
lda.print_topics(10)
Out[21]:
In [22]:
from gensim import similarities
query = "Electronics appliances"
query = dictionary.doc2bow(tokeniseCleanStem(query))
# Apply the LDA model trained on the corpus to the query
query_lda = lda[query]
print "\nThe topic distribution of the query over the computed topics is:\n"
print(query_lda)
index = similarities.MatrixSimilarity(lda[tfidfcorpus])
print "\n\nThe similarity of the query to the documents in the corpus is:\n"
sims = index[query_lda]  # perform a similarity query against the corpus
resultlist = list(enumerate(sims))
print(resultlist)
print "\n\nThe company which best fits the query by LDA-deduced topics is:\n"
resultlist.sort(key=lambda x: x[1], reverse=True)
result = resultlist[0][0]
print descdf.iloc[result]
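# A minimal extension: list the top three matches rather than just the best one
for idx, score in resultlist[:3]:
    print descdf.iloc[idx]['CompanyName'], score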