In [1]:
import org.archive.archivespark._
import org.archive.archivespark.functions._
import org.archive.archivespark.specific.warc._
In [2]:
// Describe the input first (WarcSpec.fromFilesWithCdx on the archive path),
// then hand that specification to ArchiveSpark.load to obtain the dataset.
val records = {
  val warcSpec = WarcSpec.fromFilesWithCdx("/data/helgeholzmann-de.warc.gz")
  ArchiveSpark.load(warcSpec)
}
In [3]:
// Keep only records whose `mime` is "text/html" and whose `status` is 200.
val html = records.filter { record =>
  record.mime == "text/html" && record.status == 200
}
In [4]:
// Enrichment definition: HtmlText applied on top of Html.first("title").
// NOTE(review): presumably yields the text of the first <title> element — confirm
// against the ArchiveSpark enrichment-function docs.
val Title = {
  val firstTitleTag = Html.first("title")
  HtmlText.of(firstTitleTag)
}
In [5]:
// Apply the Title enrichment to the filtered records and call peekJson on the result.
// NOTE(review): peekJson presumably returns a JSON preview of a single record — confirm.
html.enrich(Title).peekJson
Out[5]:
In [ ]: