In [ ]:
#let's make it first without imports just to see if it works
workspace()
type Parser
baseUrl::AbstractString
links::Array{AbstractString}
end
function handle_starttag(parser::Parser, tag, attrs)
if tag == "a"
for (key, value) in attrs
if key == "href"
#urljoin is a placeholder carried over from the python original; julia needs a
#package (or a hand-rolled join) to resolve relative urls against the base url
newUrl = urljoin(parser.baseUrl, value)
push!(parser.links, newUrl)
end
end
end
end
function getLinks(parser::Parser, url)
#urlopen, getheader and feed are python names left over from the port;
#treat them as placeholders until we pick a julia http/parsing package
response = urlopen(parser.baseUrl) # almost certainly not the best place to put it
if getheader(response, "Content-Type") == "text/html"
htmlBytes = read(response)
htmlString = String(htmlBytes)
feed(parser, htmlString)
return htmlString, parser.links
else
return "", AbstractString[]
end
end
#what does this even do!? basically we need to figure out the logic by ourselves, to be honest
#it doesn't even make sense. we'll try to copy the python in julia though and hope for the best
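# for contrast, here's a rough sketch of what that getLinks step could look like with the
# packages the later cells settle on (HTTP.jl + Gumbo). treat it as an assumption-laden sketch,
# not the final design: HTTP.header and the startswith check on Content-Type are my guesses
using HTTP, Gumbo, AbstractTrees
function getLinksSketch(url::AbstractString)
    response = HTTP.get(url)
    # only bother parsing html responses
    if startswith(HTTP.header(response, "Content-Type"), "text/html")
        htmlString = String(response.body)
        doc = parsehtml(htmlString)
        links = AbstractString[]
        # walk every node and keep the href of each <a> tag
        for elem in PostOrderDFS(doc.root)
            if isa(elem, Gumbo.HTMLElement{:a})
                href = get(elem.attributes, "href", "#")
                href != "#" && push!(links, href)
            end
        end
        return htmlString, links
    else
        return "", AbstractString[]
    end
end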
function spider(url, word, maxPages)
#just a stub for now
pagesToVisit = [url]
num_visited = 0
foundWord = false
end
# basically we just need really efficient data structures and to let them handle the parsing
#but we straight up don't have that yet. this seems like the sort of task julia should be really good at though:
# loop handling and adding stuff to mutable lists, stuff like that
# julia should be great at it, as it's basically just a big queue of urls with very defined types
#shouldn't be difficult at all really (a sketch of the loop follows below)
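# a sketch of the spider loop the stub above seems to be aiming at: keep a FIFO queue of pages,
# fetch each one, look for the word, and stop after maxPages. getLinksSketch is the hypothetical
# helper from the previous sketch, not something that exists in this notebook yet
function spiderSketch(url::AbstractString, word::AbstractString, maxPages::Integer)
    pagesToVisit = AbstractString[url]
    numberVisited = 0
    foundWord = false
    while numberVisited < maxPages && !isempty(pagesToVisit) && !foundWord
        numberVisited += 1
        page = shift!(pagesToVisit)      # FIFO -> breadth-first (popfirst! on Julia 1.x)
        html, links = getLinksSketch(page)
        if contains(html, word)          # Julia 0.6 spelling; occursin(word, html) on 1.x
            foundWord = true
            println("the word $word was found at $page")
        end
        append!(pagesToVisit, links)     # queue up everything we found on this page
    end
    foundWord || println("word never found after $numberVisited pages")
    return foundWord
end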
In [ ]:
function crawl()
#dagnabbit, I hate julia and I'm not confident at all in it, and it has really cryptic errors
#compared to python. but it's a lot faster and still very easy to develop in,
#and vastly cooler than python, which is pretty boring
#doing a simple web crawler in julia should be a fun little project
#and it doesn't seem to exist already, so it could be a fun spare-time project too
#so what do we need this crawler to do? just somehow parse the text, check for things, and hope for the best
end
In [ ]:
In [ ]:
workspace()
print("done")
In [ ]:
# okay, let's actually try and do this my own way, so we know it makes sense and we can figure it out
# let's figure out just what a web scraper actually has to do here. it shouldn't be THAT hard
# we just need a start url, a list of all websites already visited, a list of all websites still to go, and a dictionary of urls to content
# that shouldn't be so difficult really. oh well, not too hard
#imports
#using Requests # not actually used below; HTTP.jl handles the requests
#whatever else?
using HTTP
using Gumbo
using AbstractTrees
using ArgParse
type Crawler
startUrl::AbstractString
urlsVisited::Array{AbstractString}
urlsToCrawl::Array{AbstractString}
content::Dict{AbstractString, AbstractString}
#the content is a dictionary of {url => html content}
breadthFirst::Bool
#constructors
function Crawler(starturl::AbstractString)
return new(starturl, AbstractString[], AbstractString[],Dict{AbstractString, AbstractString}(), true)
end
function Crawler(starturl::AbstractString, breadthfirst::Bool)
return new(starturl, AbstractString[],AbstractString[],Dict{AbstractString, AbstractString}(), breadthfirst)
end
function Crawler(starturl::AbstractString, urlstocrawl::Array{AbstractString},breadthfirst::Bool)
return new(starturl, AbstractString[], urlstocrawl, Dict{AbstractString, AbstractString}(), breadthfirst)
end
function Crawler(starturl::AbstractString, urlstocrawl::Array{AbstractString})
return new(starturl, AbstractString[], urlstocrawl, Dict{AbstractString, AbstractString}(), true)
end
#remove this, just a test
function Crawler(urlstocrawl::Array{AbstractString}, breadthfirst::Bool)
return new("", AbstractString[], urlstocrawl, Dict{AbstractString, AbstractString}(), breadthfirst)
end
function Crawler(urlstocrawl::Array{AbstractString})
return new("", AbstractString[], urlstocrawl, Dict{AbstractString, AbstractString}(), true)
end
end
print("sorted!")
# okay, that's us kind of sorted. now we should get some stuff. let's just run this, see if it works lol
#that gives us a huge list of errors I don't understand at all... oh well. that's pretty unfortunate really
# so it goes, I suppose (one likely culprit is sketched below)
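# hard to say without the actual error text, but one classic source of confusing MethodErrors
# with constructors like these is julia's parametric invariance: ["a", "b"] is an Array{String},
# and Array{String} is NOT a subtype of Array{AbstractString}, so the urlstocrawl methods won't match
crawler1 = Crawler("http://google.com")                               # fine: single-url constructor
starts   = AbstractString["http://google.com", "http://julialang.org"]
crawler2 = Crawler(starts, false)                                     # fine: element type spelled out
#crawler3 = Crawler(["http://google.com"], false)                     # MethodError: passes Array{String}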
In [ ]:
# okay, now for our actual functions. let's first write this as one big monolithic function
# so we can figure out how it works, and make sure we've written sufficient constructors
#let's try creating some crawlers
crawler = Crawler("http://google.com")
const SuccessCode=200
#crawler = Crawler()
# okay, creation works. that's good. now let's get our function with the mega loop
#with our default num_iterations number. that's not going to be too tricky, hopefully
function crawl(crawler::Crawler, num_iterations::Integer=10, verbose=true, save=true)
#first we check if this is the very first call, so we should just check that
#shall we just define variables from the crawler? nah, let's not. we should just access them consistently
#as it's meant to be updated in place, I assume
#we keep the success code local to the function so we don't hit a slow non-const global
successCode = 200 # note: `const` isn't valid in local scope, a plain local is fine
#our immediate return if correct
if isempty(crawler.urlsToCrawl) && crawler.startUrl==""
return crawler.content, crawler.urlsVisited
end
if isempty(crawler.urlsToCrawl) && crawler.startUrl!=""
#so we are at the beginning so we visit our first piece
#we set the starturl to urls to crawl
push!(crawler.urlsToCrawl,crawler.startUrl)
crawler.startUrl=""
end
#okay, now we begin the loop. a while loop rather than a for loop, so that bumping
#num_iterations below for repeat urls actually grants extra iterations
iteration = 0
while iteration < num_iterations
iteration += 1
#we check if empty; we probably shouldn't do this on each iteration, but oh well!
if isempty(crawler.urlsToCrawl) && crawler.startUrl==""
return crawler.content, crawler.urlsVisited
end
url = pop!(crawler.urlsToCrawl)
#we get the content
#we make the request with http
#we first check this works... idk
#println(crawler.urlsVisited)
#println(crawler.urlsToCrawl)
if !(url in crawler.urlsVisited)
if verbose==true
println("requesting $url")
end
try
response = HTTP.get(url)
#println(response)
#check the success code and proceed if correct
if response.status==successCode
# okay, here's what we do: the string parsing happens here
res = String(response.body)
doc = parsehtml(res)
if verbose == true
println("response received and is successful")
end
#if we succeed we update our links
crawler.content[url] = res
#print(typeof(url))
# println("")
# println("type of crawler.urlsvisited ", typeof(crawler.urlsVisited))
# println("url: ", url)
# println(crawler.urlsVisited)
push!(crawler.urlsVisited, url)
#we go through all elements get links
for elem in PostOrderDFS(doc.root)
if typeof(elem) == Gumbo.HTMLElement{:a}
link=get(elem.attributes, "href","#")
if link != "#"
#then it's succeeded and we have a link. note relative links (e.g. "/foo")
#will fail when requested later and just get caught by the catch below
# println(typeof(link))
push!(crawler.urlsToCrawl, link)
end
end
end
end
if url in crawler.urlsToCrawl
#the url we just crawled is still queued as a duplicate, so grant an extra
#iteration to make up for the pop that will be wasted on it later
println("repeat url")
num_iterations += 1
end
catch e
#a failed request (bad or relative url, network error) just gets logged and skipped
println("request failed for $url: $e")
end
end
end
#now once we're finished our loop
#we return stuff and save
if save==true
#we save the files somewhere; a saveContent sketch further down shows one option
end
return crawler.content, crawler.urlsVisited
end
# okay, yes! we actually have the http requests and html parsing working. that's just so awesome! we can do this
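# one thing crawl() above never uses is the breadthFirst flag: pop! always takes the newest
# link, which is depth-first order. a minimal sketch of how the flag could pick the queue
# discipline (this helper isn't wired into crawl() yet, it's just the idea):
function nextUrl(crawler::Crawler)
    if crawler.breadthFirst
        return shift!(crawler.urlsToCrawl)   # FIFO: oldest queued link first -> breadth-first
    else
        return pop!(crawler.urlsToCrawl)     # LIFO: newest queued link first -> depth-first
    end
end
# crawl() would then call url = nextUrl(crawler) instead of pop!(crawler.urlsToCrawl)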
In [ ]:
# this is our arg parse settings so we can see if it works and write it up properly
function parse_commandline()
s = ArgParseSettings(prog="Julia Web Crawler",
description="A web crawler written in Julia",
commands_are_required=false,
version="0.0.1",
add_version=true)
@add_arg_table s begin
"--urls"
help="either a url to start at or a set of urls to start visiting"
required=true
"--breadth-first", "-b"
help="a flag for whether the crawler should search breadth first rather than depth first"
action=:store_true
"--num_iterations", "-i"
help="the number of iterations to run the crawler for"
arg_type=Int
default=10
end
return parse_args(s)
end
function setupCrawler(urls, b=true)
return Crawler(urls, b)
end
# at some point we should add functionality like a place to put the saved pages
# not sure where we should put the save files either, to be honest,
# as that seems quite important really (a rough sketch of saving follows below)
# and eventually a scheduler for having multiple parallel crawlers, as they don't need to interact
# julia should be good at that too, hopefully!
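# the save branch in crawl() is still empty, so here's a rough sketch of one way to dump
# crawler.content to disk. the directory name and the filename scheme are arbitrary choices,
# nothing in the notebook pins them down
function saveContent(crawler::Crawler, directory::AbstractString="crawled_pages")
    isdir(directory) || mkdir(directory)
    for (url, html) in crawler.content
        # sanitise the url into something the filesystem will accept
        fname = replace(url, r"[^A-Za-z0-9]", "_") * ".html"   # 0.6 syntax; pair syntax on 1.x
        open(joinpath(directory, fname), "w") do f
            write(f, html)
        end
    end
end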
function main()
#parsed_args= parse_commandline()
#now we get our args
# urls = parsed_args["urls"]
#breadth_first = parsed_args["breadth-first"]
#we generate the crawler, and then roll, baby
#num_iterations = parsed_args["num_iterations"]
#obviously in the notebook we have no arguments to parse, so we just do this
urls="http://stackoverflow.com"
breadth_first=true
crawler= Crawler(urls, breadth_first)
num_iterations = 20
crawl(crawler, num_iterations)
end
#main()
In [ ]:
a = 2
if a ==2
print("yes")
end
for i in 1:10
print(i)
print(typeof(i))
end
In [ ]:
Pkg.add("HTTP")
using HTTP
url="https://stackoverflow.com/questions/37360340/how-to-pass-dict-as-the-argument-to-method-julia"
response = HTTP.get(url)
println(String(response.body)) # response.body is raw bytes; String() decodes it for reading
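# a couple of other useful pokes at the response object
# (HTTP.header is assumed to be available in the installed HTTP.jl version):
println(response.status)                        # should be 200 on success, like successCode above
println(HTTP.header(response, "Content-Type"))  # e.g. "text/html; charset=utf-8"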
In [ ]: