In [1]:
# Install 'progress' only when it is not available yet, so that re-running
# this cell does not reinstall the package every time.
if (!requireNamespace("progress", quietly = TRUE)) {
  install.packages("progress")
}


Installing package into ‘/srv/rlibs’
(as ‘lib’ is unspecified)
also installing the dependency ‘prettyunits’


In [2]:
#' Bulk download scans from szukajwarchiwach.pl
#'
#' Reads the scan listing of one unit page by page, collects the links to the
#' scan images and, optionally, downloads every image.
#'
#' @param id Signature of the unit as it appears in the site URL,
#'   e.g. `"65/356/0/1.1/44"`.
#' @param download If `TRUE`, every scan is saved as `<n>.jpg` (numbered in
#'   listing order) in a directory named after `id` with `"/"` replaced by
#'   `"-"`, created in the working directory. The default `FALSE` downloads
#'   no files. Downloading uses `download.file(method = "wget")`, so `wget`
#'   must be available.
#' @return Character vector of scan image URLs (empty when no scan links are
#'   found).
swa_get_scans <- function(id, download = FALSE) {

  base_url <- "https://szukajwarchiwach.pl/"

  # The site has SSL certificate trouble, so peer verification is switched
  # off -- but only for the requests issued here. Using httr::set_config()
  # would silently disable verification for the rest of the R session.
  # NOTE(review): this still accepts any certificate for these requests;
  # remove it once the site's certificate validates.
  no_verify <- httr::config(ssl_verifypeer = 0L)

  # Fetch one page and parse it; fail loudly on an HTTP error instead of
  # parsing an error page.
  fetch_html <- function(page_url) {
    resp <- httr::GET(page_url, no_verify)
    httr::stop_for_status(resp)
    xml2::read_html(httr::content(resp, "text"), encoding = "UTF-8")
  }

  # Text of the nodes selected by `link_xpath`, evaluated on the nodes
  # matched by `box_xpath`.
  # NOTE(review): both link XPaths used below start with `//`, which XPath
  # evaluates from the document root rather than relative to the matched
  # <div>; confirm whether `.//` was intended before changing them.
  node_text <- function(doc, box_xpath, link_xpath) {
    boxes <- xml2::xml_find_all(doc, box_xpath)
    xml2::xml_text(xml2::xml_find_all(boxes, link_xpath))
  }

  # Number of listing pages = label of the last pager link.
  first_page <- fetch_html(paste0(base_url, id, "/#tabSkany"))
  pager_labels <- node_text(
    first_page,
    "//div[@class='pagerBox']",
    '//a[position()>1]/text()'
  )

  if (length(pager_labels) == 0L) {
    # No pager links at all: treat the listing as a single page instead of
    # failing on `1:NULL`.
    n_pages <- 1L
  } else {
    last_label <- trimws(pager_labels[length(pager_labels)])
    n_pages <- suppressWarnings(as.integer(last_label))
    if (is.na(n_pages) || n_pages < 1L) {
      stop(
        "Could not read the number of pages for '", id,
        "' (last pager link: '", last_label, "')",
        call. = FALSE
      )
    }
  }

  # Collect the scan links from every listing page (15 scans per page).
  page_links <- lapply(seq_len(n_pages), function(page_no) {
    page_url <- paste0(base_url, id, "/str/1/", page_no, "/15#tabSkany")
    node_text(
      fetch_html(page_url),
      "//div[@class='searchListBg']",
      '//a[@class="fancy_scan_link"]/@href'
    )
  })
  images <- as.character(unlist(page_links, use.names = FALSE))

  # Turn the site-relative links into absolute URLs and switch from the
  # "medium" rendition to the "img" one.
  images <- gsub("^/", base_url, images)
  images <- gsub("medium", "img", images, fixed = TRUE)

  if (isTRUE(as.logical(download))) {
    target_dir <- gsub("/", "-", id, fixed = TRUE)
    # Re-running for the same id reuses the directory without a warning.
    dir.create(target_dir, showWarnings = FALSE)

    if (length(images) > 0L) {
      pb <- progress::progress_bar$new(total = length(images))

      for (k in seq_along(images)) {
        # download.file() via wget with --no-check-certificate is faster than
        # httr::GET(..., write_disk()) and also gets past the SSL errors.
        utils::download.file(
          url = images[[k]],
          destfile = file.path(target_dir, paste0(k, ".jpg")),
          method = "wget",
          quiet = TRUE,
          extra = "--no-check-certificate"
        )
        pb$tick()
        # Random 3-6 second pause between files.
        Sys.sleep(sample(3:6, 1))
      }
    }
  }

  # Always return the image URLs, whether or not they were downloaded.
  images
}

In [7]:
# Example run for signature 65/356/0/1.1/44
# (https://szukajwarchiwach.pl/65/356/0/1.1/44#tabSkany):
# collect the scan image URLs and, because download=TRUE, also save the files.
scans <- swa_get_scans("65/356/0/1.1/44", download=TRUE)

In [8]:
# show the scan image URLs returned by swa_get_scans()
scans


  1. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/4tU7herNt3Ia-i1Xoweoug'
  2. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/d_QCv5_PCmv8U-leCfppQw'
  3. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/5uswryJu0ApzYVcOF6zUbw'
  4. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/7s85QBJf_PlVRoC0YyPhSQ'
  5. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/hSeZeC8-TYbTYBA3GioEiA'
  6. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/BmI6c_3tRbCt4vKfa38kYw'
  7. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/Pbz5-cHU5JC5H0rQrJHRig'
  8. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/yD_N2CH4hl_7FF3PNvsoAg'
  9. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/qXhROK2lSnsXNqQlYqjaFQ'
  10. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/qM7eXetx3fCZ0hnAzNodVw'
  11. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/b3gEScGGb_KFG1a7oXA2SQ'
  12. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/fP1MCdwILeZhBKTtX1xMkA'
  13. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/Cc-9TDVroQ4M9I5ziBet-g'
  14. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/HGTD47bj7Bm4Y-O9V3Gxig'
  15. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/9-yA-Q5H5oP0bHTCAUWx9A'
  16. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/SX3RytQUOfAT-acZcKUaNQ'
  17. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/gpOL5IJbOb2HffQlwm1cYQ'
  18. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/ntP5qVh00anymNp2qgDJRw'
  19. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/qnl8EIiYVjHZYan6MeL4RA'
  20. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/8b9kMg9ZB5x5hG2DvH_-0w'
  21. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/TwcDQhCSiq4f-ecs4zal7A'
  22. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/mBfJtQih5RNsx4j6UahWQw'
  23. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/yIQKr5iQOANFJpZ9TFhPAw'
  24. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/dj_F3eF6JFDmw4q8LfIBNA'
  25. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/olqSSNOVjah8uRDNCycMug'
  26. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/a2i-Ok6GjJqdeljGrlRrOQ'
  27. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/t0n9Ng-H1mMfsxej3Bu_sw'
  28. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/TDq-ZPkV34DdrvisxIldFA'
  29. 'https://szukajwarchiwach.pl/65/356/0/1.1/44/skan/img/vpGMU9Xu2UfauDr3H3leDg'

In [14]:
# to preview the files, open the File menu and choose Open
# (screenshot: https://i.stack.imgur.com/1H0DO.png)
# the files are in the directory 65-356-0-1.1-44
# (the signature 65/356/0/1.1/44 with "/" replaced by "-")
list.files('65-356-0-1.1-44')


  1. '1.jpg'
  2. '10.jpg'
  3. '11.jpg'
  4. '12.jpg'
  5. '13.jpg'
  6. '14.jpg'
  7. '15.jpg'
  8. '16.jpg'
  9. '17.jpg'
  10. '18.jpg'
  11. '19.jpg'
  12. '2.jpg'
  13. '20.jpg'
  14. '21.jpg'
  15. '22.jpg'
  16. '23.jpg'
  17. '24.jpg'
  18. '25.jpg'
  19. '26.jpg'
  20. '27.jpg'
  21. '28.jpg'
  22. '29.jpg'
  23. '3.jpg'
  24. '4.jpg'
  25. '5.jpg'
  26. '6.jpg'
  27. '7.jpg'
  28. '8.jpg'
  29. '9.jpg'

In [ ]: