Scripts for automating data collection from Web of Science


In [ ]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

Description

The scripts are divided into 5 functions:

  1. wos_login: This function is called only once to log in to WOS using a registered user email address and password
  2. keyword_search: This function is used to add all given keywords to the search bar
  3. select_year: The script is written so that results from only one year can be downloaded at a time. This function restricts the search to the given year by selecting it as both the start year and the end year of the search period
  4. download_page: This function downloads the metadata of the papers on a specified page of the results. The data is downloaded in the specified format
  5. webofscience: This is the wrapper function for the script; it calls all the above functions sequentially and downloads the results. You need to specify your email address and password to log in to WOS (both strings), the keywords you want to search (list of strings), the year you are interested in (integer), the pages you want to download (list of integers) and the file format, file_format (string)

How to use?

  1. Install Selenium if not done so already
  2. Run the cells containing the above functions
  3. In the last cell of this notebook, modify the username, password, keywords, year, pages and file_format variables for your search. Run the cell to download results

Scripts


In [ ]:
def wos_login(driver,username,password):
    """
    Function to login into WOS

    Fills in the sign-in form on the page the driver currently shows and
    submits it.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session; the current page must contain elements named
        'username', 'password' and 'image'.
    username : str
        Registered user email address, typed into the 'username' field.
    password : str
        Password, typed into the 'password' field.

    Returns
    -------
    None
    """
    # find_element(By.NAME, ...) is used instead of find_element_by_name:
    # the find_element_by_* shortcuts were removed in Selenium 4.3, while
    # find_element(by, value) also exists in older releases.
    #Input Username
    driver.find_element(By.NAME,'username').send_keys(username)
    #Input Password
    driver.find_element(By.NAME,'password').send_keys(password)
    #Click on Sign In
    # click() returns None, so its result is not kept.
    driver.find_element(By.NAME,'image').click()
    return

In [ ]:
def keyword_search(driver,keywords):
    """
    Function to input given keywords to search bar

    For every keyword, in order: type it into search row i, add a new search
    row below it, open the boolean-operator dropdown that joins row i with
    row i+1 and choose OR. A new row is added after the last keyword too, so
    the form ends with one empty row.

    The OR entry is matched by the fixed parts of its id only. Ids of the
    form 'select2-value(bool_1_2)-result-<token>-OR' carry a generated token
    (this notebook recorded both '7put' and 'mnjh' for it), so a literal
    token cannot be relied on.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session showing the search form.
    keywords : list of str
        Keywords to enter; an empty list leaves the form untouched.

    Returns
    -------
    None

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If an expected element does not appear within 5 seconds.
    """
    wait = WebDriverWait(driver,5)
    # XPath 1.0 has no ends-with(); the last three characters of the id are
    # compared with "-OR" through substring()/string-length() instead.
    or_option_template = (
        '//*[starts-with(@id,"select2-value(bool_%d_%d)-result-")'
        ' and substring(@id,string-length(@id)-2)="-OR"]'
    )
    for row, keyword in enumerate(keywords, start=1):
        # until() returns the located element, so no second lookup is needed.
        #Input keyword
        wait.until(EC.presence_of_element_located(
            (By.XPATH,'//*[@id="value(input%d)"]'%row))).send_keys(keyword)
        #Add new Search Row
        wait.until(EC.presence_of_element_located(
            (By.XPATH,'//*[@id="addSearchRow%d"]/span[1]'%row))).click()
        #Open the operator dropdown between this row and the new one
        wait.until(EC.presence_of_element_located(
            (By.XPATH,'//*[@id="select2-value(bool_%d_%d)-container"]'%(row,row+1)))).click()
        #Choose OR
        wait.until(EC.presence_of_element_located(
            (By.XPATH,or_option_template%(row,row+1)))).click()
    return

In [ ]:
def select_year(driver,year):
    """
    Function to select year

    Activates the 'periodRange' option and picks one entry from the
    start-year dropdown and one from the end-year dropdown, both derived
    from the same year.

    The entries are addressed by position: entry 101 + (year - 2000) of the
    start list and entry 18 - (year - 2000) of the end list.
    NOTE(review): these offsets presumably assume a start list that ascends
    one year per entry and an end list that descends from 2017 -- confirm
    against the live page before using other years.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session showing the search form.
    year : int
        Year to select as both start and end of the range.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the year maps to a list position below 1 (no such entry can
        exist); raised before the page is touched.
    selenium.common.exceptions.TimeoutException
        If an expected element does not appear within 5 seconds.
    """
    # Position picked in each dropdown when year == 2000.
    start_base=101
    end_base=18
    offset=year-2000
    start_item=start_base+offset
    end_item=end_base-offset
    if start_item<1 or end_item<1:
        raise ValueError(
            'year %r maps to dropdown entries %d (start) and %d (end); '
            'positions must be >= 1'%(year,start_item,end_item))

    # Wait for each element like the other functions of this notebook do;
    # find_element_by_xpath itself was removed in Selenium 4.3.
    wait = WebDriverWait(driver,5)

    def click_xpath(xpath):
        # until() hands back the located element.
        wait.until(EC.presence_of_element_located((By.XPATH,xpath))).click()

    click_xpath('//*[@id="periodRange"]')
    #Start year
    #Opening dropdown of starting year
    click_xpath('//*[@id="s2id_autogen4"]/a')
    #Select year from dropdown
    click_xpath('//*[@id="select2-results-5"]/li[%d]'%start_item)
    #Opening dropdown of ending year
    click_xpath('//*[@id="s2id_autogen6"]/a')
    #Select year from dropdown
    click_xpath('//*[@id="select2-results-7"]/li[%d]'%end_item)
    return

In [ ]:
def download_page(driver,page_no,file_format):
    """
    Function to download page results

    Jumps to the requested results page, opens the save menu, chooses the
    "other file formats" entry, asks for the full record, selects the file
    format, sends the request and closes the dialog.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session showing the search results.
    page_no : int
        Results page to open; typed into the page-number box.
    file_format : str
        Visible text of the option to choose in the 'saveOptions' list,
        e.g. 'Plain Text'.

    Returns
    -------
    None

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If an expected element does not appear within 5 seconds.
    """
    wait = WebDriverWait(driver,5)

    def locate(by,value):
        # until() returns the located element, so one wait replaces the
        # former wait + find_element_by_* pair (removed in Selenium 4.3).
        return wait.until(EC.presence_of_element_located((by,value)))

    #Go to the requested page
    page=locate(By.XPATH,'//*[@id="summary_navigation"]/table/tbody/tr/td[2]/input')
    page.clear()
    page.send_keys(page_no)
    page.send_keys(Keys.RETURN)
    #Open dropdown menu to save
    locate(By.XPATH,'//*[@id="s2id_saveToMenu"]/a/span[2]/b').click()
    #Click to other file formats
    locate(By.XPATH,'//*[@id="select2-results-1"]/li[5]').click()
    #Full Record
    locate(By.ID,'bib_fields').click()
    locate(By.ID,'bib_fields:fullrec_fields_option').click()
    #Choose the requested file format
    dropdown = locate(By.ID,'saveOptions')
    Select(dropdown).select_by_visible_text(file_format)
    #Click on send button
    locate(By.XPATH,'//*[@id="ui-id-7"]/form/div[4]/span/input').click()
    #Click on close button after download
    locate(By.XPATH,'//*[@id="ui-id-7"]/form/div[2]/a').click()
    return

In [ ]:
def webofscience(username,password,keywords,year,pages,file_format):
    """
    Function to download results from Web of Science

    Opens a Chrome session on the WOS login page, logs in, enters the
    keywords and the year, runs the search, switches the result list to its
    third page-size option (50 results per page) and downloads every
    requested page. The browser session is not closed when the function
    returns.

    Parameters
    ----------
    username : str
        Registered user email address.
    password : str
        Password for that account.
    keywords : list of str
        Keywords to search for, one per search row.
    year : int
        Year to restrict the search to.
    pages : iterable of int
        Result pages to download, in the given order.
    file_format : str
        Visible text of the file format to save in, e.g. 'Plain Text'.

    Returns
    -------
    None

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If an expected element does not appear within 5 seconds.
    """
    #Initializing driver
    driver = webdriver.Chrome()
    driver.get("http://login.webofknowledge.com")
    #WOS Login
    wos_login(driver,username,password)
    #Add Keywords
    keyword_search(driver,keywords)
    #Add Year
    select_year(driver,year)

    wait = WebDriverWait(driver,5)

    def click_xpath(xpath):
        # until() returns the located element; this replaces the wait +
        # find_element_by_xpath pair (removed in Selenium 4.3).
        wait.until(EC.presence_of_element_located((By.XPATH,xpath))).click()

    #Click on search
    # keyword_search leaves one extra row after the last keyword, and the
    # search cell of that last row is the one clicked. This equals the
    # former 9-(8-len(keywords)).
    search_row=len(keywords)+1
    click_xpath('//*[@id="searchCell%d"]'%search_row)
    #Increase page size to 50 results per page
    click_xpath('//*[@id="s2id_selectPageSize_.bottom"]/a')
    click_xpath('//*[@id="select2-results-5"]/li[3]')
    for page_no in pages:
        download_page(driver,page_no,file_format)
    return

In [ ]:
#Configuration for one download run: edit the values below, then run this cell
#Registered User Email Address
#Replace the placeholder with your own address; avoid saving or sharing the notebook with real credentials in it
username = '<Email Address>'
#Registered User Password
password = '<Password>'
#List of keywords
#Each keyword is entered in its own search row and the rows are combined with OR
keywords = ['keyword 1','keyword 2']
#Year
#Single year (integer) used as both the start and the end of the search period
year = 2016
#Pages to download (you can use range function here)
#For example range(1,4) downloads result pages 1, 2 and 3
pages = [1]
#File format
#Must match the text of an option in the WOS save dialog exactly
#It can be 'BibTex', 'HTML', 'Plain Text', 'Tab-delimited (Win)', 'Tab-delimited (Mac)',
#'Tab-delimited (Win, UTF-8)' and 'Tab-delimited (Mac, UTF-8)'
file_format = 'Plain Text'
#Download Results
#Opens a Chrome window and drives it through login, search and download
webofscience(username,password,keywords,year,pages,file_format)