In [1]:
import org.archive.archivespark._
import org.archive.archivespark.functions._
import org.archive.archivespark.specific.warc._

In [2]:
// Build the WarcSpec for the archive path first, then hand it to ArchiveSpark.load.
val records = {
  val warcSpec = WarcSpec.fromFilesWithCdx("/data/helgeholzmann-de.warc.gz")
  ArchiveSpark.load(warcSpec)
}

In [3]:
// Keep only the records whose MIME type is exactly "text/html" and whose status is 200.
val html = records.filter { record =>
  val isHtml = record.mime == "text/html"
  // Short-circuits like the single-expression form: status is only read for HTML records.
  isHtml && record.status == 200
}

In [4]:
// Enrich function built by applying HtmlText on top of Html.first("title").
// When used with enrich in the next cell, its value appears under
// payload.string.html.title.text in Out[5].
// NOTE(review): presumably "first" selects the first <title> element of the page —
// confirm against the ArchiveSpark documentation.
val Title = HtmlText.of(Html.first("title"))

In [5]:
// Apply the Title enrich function to the filtered HTML records and preview the result as JSON.
// Out[5] shows the record metadata together with the extracted title text.
// NOTE(review): peekJson presumably renders only a single (first) record — confirm.
html.enrich(Title).peekJson


Out[5]:
{
    "record" : {
        "redirectUrl" : "-",
        "timestamp" : "20190528152652",
        "digest" : "sha1:HCHVDRUSN7WDGNZFJES2Y4KZADQ6KINN",
        "originalUrl" : "https://www.helgeholzmann.de/",
        "surtUrl" : "de,helgeholzmann)/",
        "mime" : "text/html",
        "compressedSize" : 2087,
        "meta" : "-",
        "status" : 200
    },
    "payload" : {
        "string" : {
            "html" : {
                "title" : {
                    "text" : "Helge Holzmann - @helgeho"
                }
            }
        }
    }
}

In [ ]: