In [ ]:
#let's make it first without imports just to see if it works
workspace()
type Parser
baseUrl::AbstractString
links::Array{AbstractString}
end
function handle_starttag(parser::Parser, tag, attrs)
if tag == "a"
for (key, value) in attrs
if key == "href"
#urljoin is a placeholder carried over from the python original; julia needs a
#package (or a hand-rolled join) to resolve relative urls against the base url
newUrl = urljoin(parser.baseUrl, value)
push!(parser.links, newUrl)
end
end
end
end
function getLinks(parser::Parser, url)
#urlopen, getheader and feed are python names left over from the port;
#treat them as placeholders until we pick a julia http/parsing package
response = urlopen(parser.baseUrl) # almost certainly not the best place to put it
if getheader(response, "Content-Type") == "text/html"
htmlBytes = read(response)
htmlString = String(htmlBytes)
feed(parser, htmlString)
return htmlString, parser.links
else
return "", AbstractString[]
end
end
#what does this even do!? basically we need to figure out the logic by ourselves, to be honest
#it doesn't even make sense. we'll try to copy the python in julia though and hope for the best
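# for contrast, here's a rough sketch of what that getLinks step could look like with the
# packages the later cells settle on (HTTP.jl + Gumbo). treat it as an assumption-laden sketch,
# not the final design: HTTP.header and the startswith check on Content-Type are my guesses
using HTTP, Gumbo, AbstractTrees
function getLinksSketch(url::AbstractString)
    response = HTTP.get(url)
    # only bother parsing html responses
    if startswith(HTTP.header(response, "Content-Type"), "text/html")
        htmlString = String(response.body)
        doc = parsehtml(htmlString)
        links = AbstractString[]
        # walk every node and keep the href of each <a> tag
        for elem in PostOrderDFS(doc.root)
            if isa(elem, Gumbo.HTMLElement{:a})
                href = get(elem.attributes, "href", "#")
                href != "#" && push!(links, href)
            end
        end
        return htmlString, links
    else
        return "", AbstractString[]
    end
end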
function spider(url, word, maxPages)
#just a stub for now
pagesToVisit = [url]
num_visited = 0
foundWord = false
end
# basically we just need really efficient data structures and to let them handle the parsing
#but we straight up don't have that yet. this seems like the sort of task julia should be really good at though:
# loop handling and adding stuff to mutable lists, stuff like that
# julia should be great at it, as it's basically just a big queue of urls with very defined types
#shouldn't be difficult at all really (a sketch of the loop follows below)
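# a sketch of the spider loop the stub above seems to be aiming at: keep a FIFO queue of pages,
# fetch each one, look for the word, and stop after maxPages. getLinksSketch is the hypothetical
# helper from the previous sketch, not something that exists in this notebook yet
function spiderSketch(url::AbstractString, word::AbstractString, maxPages::Integer)
    pagesToVisit = AbstractString[url]
    numberVisited = 0
    foundWord = false
    while numberVisited < maxPages && !isempty(pagesToVisit) && !foundWord
        numberVisited += 1
        page = shift!(pagesToVisit)      # FIFO -> breadth-first (popfirst! on Julia 1.x)
        html, links = getLinksSketch(page)
        if contains(html, word)          # Julia 0.6 spelling; occursin(word, html) on 1.x
            foundWord = true
            println("the word $word was found at $page")
        end
        append!(pagesToVisit, links)     # queue up everything we found on this page
    end
    foundWord || println("word never found after $numberVisited pages")
    return foundWord
end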
In [ ]:
function crawl()
#dagnabbit, I hate julia and I'm not confident at all in it, and it has really cryptic errors
#compared to python. but it's a lot faster and still very easy to develop in,
#and vastly cooler than python, which is pretty boring
#doing a simple web crawler in julia should be a fun little project
#and it doesn't seem to exist already, so it could be a fun spare-time project too
#so what do we need this crawler to do? just somehow parse the text, check for things, and hope for the best
end
In [ ]:
In [ ]:
workspace()
print("done")
In [ ]:
# okay, let's actually try and do this my own way, so we know it makes sense and we can figure it out
# let's figure out just what a web scraper actually has to do here. it shouldn't be THAT hard
# we just need a start url, a list of all websites already visited, a list of all websites still to go, and a dictionary of urls to content
# that shouldn't be so difficult really. oh well, not too hard
#imports
#using Requests # not actually used below; HTTP.jl handles the requests
#whatever else?
using HTTP
using Gumbo
using AbstractTrees
using ArgParse
type Crawler
startUrl::AbstractString
urlsVisited::Array{AbstractString}
urlsToCrawl::Array{AbstractString}
content::Dict{AbstractString, AbstractString}
#the content is a dictionary of {url => html content}
breadthFirst::Bool
#constructors
function Crawler(starturl::AbstractString)
return new(starturl, AbstractString[], AbstractString[],Dict{AbstractString, AbstractString}(), true)
end
function Crawler(starturl::AbstractString, breadthfirst::Bool)
return new(starturl, AbstractString[],AbstractString[],Dict{AbstractString, AbstractString}(), breadthfirst)
end
function Crawler(starturl::AbstractString, urlstocrawl::Array{AbstractString},breadthfirst::Bool)
return new(starturl, AbstractString[], urlstocrawl, Dict{AbstractString, AbstractString}(), breadthfirst)
end
function Crawler(starturl::AbstractString, urlstocrawl::Array{AbstractString})
return new(starturl, AbstractString[], urlstocrawl, Dict{AbstractString, AbstractString}(), true)
end
#remove this, just a test
function Crawler(urlstocrawl::Array{AbstractString}, breadthfirst::Bool)
return new("", AbstractString[], urlstocrawl, Dict{AbstractString, AbstractString}(), breadthfirst)
end
function Crawler(urlstocrawl::Array{AbstractString})
return new("", AbstractString[], urlstocrawl, Dict{AbstractString, AbstractString}(), true)
end
end
print("sorted!")
# okay, that's us kind of sorted. now we should get some stuff. let's just run this, see if it works lol
#that gives us a huge list of errors I don't understand at all... oh well. that's pretty unfortunate really
# so it goes, I suppose (one likely culprit is sketched below)
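# hard to say without the actual error text, but one classic source of confusing MethodErrors
# with constructors like these is julia's parametric invariance: ["a", "b"] is an Array{String},
# and Array{String} is NOT a subtype of Array{AbstractString}, so the urlstocrawl methods won't match
crawler1 = Crawler("http://google.com")                               # fine: single-url constructor
starts   = AbstractString["http://google.com", "http://julialang.org"]
crawler2 = Crawler(starts, false)                                     # fine: element type spelled out
#crawler3 = Crawler(["http://google.com"], false)                     # MethodError: passes Array{String}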
In [ ]:
# okay, now for our actual functions. let's first write this as one big monolithic function
# so we can figure out how it works, and make sure we've written sufficient constructors
#let's try creating some crawlers
crawler = Crawler("http://google.com")
const SuccessCode=200
#crawler = Crawler()
# okay, creation works. that's good. now let's get our function with the mega loop
#with our default num_iterations number. that's not going to be too tricky, hopefully
function crawl(crawler::Crawler, num_iterations::Integer=10, verbose=true, save=true)
#first we check if this is the very first call, so we should just check that
#shall we just define variables from the crawler? nah, let's not. we should just access them consistently
#as it's meant to be updated in place, I assume
#we keep the success code local to the function so we don't hit a slow non-const global
successCode = 200 # note: `const` isn't valid in local scope, a plain local is fine
#our immediate return if correct
if isempty(crawler.urlsToCrawl) && crawler.startUrl==""
return crawler.content, crawler.urlsVisited
end
if isempty(crawler.urlsToCrawl) && crawler.startUrl!=""
#so we are at the beginning so we visit our first piece
#we set the starturl to urls to crawl
push!(crawler.urlsToCrawl,crawler.startUrl)
crawler.startUrl=""
end
#okay, now we begin the loop. a while loop rather than a for loop, so that bumping
#num_iterations below for repeat urls actually grants extra iterations
iteration = 0
while iteration < num_iterations
iteration += 1
#we check if empty; we probably shouldn't do this on each iteration, but oh well!
if isempty(crawler.urlsToCrawl) && crawler.startUrl==""
return crawler.content, crawler.urlsVisited
end
url = pop!(crawler.urlsToCrawl)
#we get the content
#we make the request with http
#we first check this works... idk
#println(crawler.urlsVisited)
#println(crawler.urlsToCrawl)
if !(url in crawler.urlsVisited)
if verbose==true
println("requesting $url")
end
try
response = HTTP.get(url)
#println(response)
#check the success code and proceed if correct
if response.status==successCode
# okay, here's what we do: the string parsing happens here
res = String(response.body)
doc = parsehtml(res)
if verbose == true
println("response received and is successful")
end
#if we succeed we update our links
crawler.content[url] = res
#print(typeof(url))
# println("")
# println("type of crawler.urlsvisited ", typeof(crawler.urlsVisited))
# println("url: ", url)
# println(crawler.urlsVisited)
push!(crawler.urlsVisited, url)
#we go through all elements get links
for elem in PostOrderDFS(doc.root)
if typeof(elem) == Gumbo.HTMLElement{:a}
link=get(elem.attributes, "href","#")
if link != "#"
#then it's succeeded and we have a link. note relative links (e.g. "/foo")
#will fail when requested later and just get caught by the catch below
# println(typeof(link))
push!(crawler.urlsToCrawl, link)
end
end
end
end
if url in crawler.urlsToCrawl
#the url we just crawled is still queued as a duplicate, so grant an extra
#iteration to make up for the pop that will be wasted on it later
println("repeat url")
num_iterations += 1
end
catch e
#a failed request (bad or relative url, network error) just gets logged and skipped
println("request failed for $url: $e")
end
end
end
#now once we're finished our loop
#we return stuff and save
if save==true
#we save the files somewhere; a saveContent sketch further down shows one option
end
return crawler.content, crawler.urlsVisited
end
# okay, yes! we actually have the http requests and html parsing working. that's just so awesome! we can do this
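# one thing crawl() above never uses is the breadthFirst flag: pop! always takes the newest
# link, which is depth-first order. a minimal sketch of how the flag could pick the queue
# discipline (this helper isn't wired into crawl() yet, it's just the idea):
function nextUrl(crawler::Crawler)
    if crawler.breadthFirst
        return shift!(crawler.urlsToCrawl)   # FIFO: oldest queued link first -> breadth-first
    else
        return pop!(crawler.urlsToCrawl)     # LIFO: newest queued link first -> depth-first
    end
end
# crawl() would then call url = nextUrl(crawler) instead of pop!(crawler.urlsToCrawl)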
In [ ]:
# this is our arg parse settings so we can see if it works and write it up properly
function parse_commandline()
s = ArgParseSettings(prog="Julia Web Crawler",
description="A web crawler written in Julia",
commands_are_required=false,
version="0.0.1",
add_version=true)
@add_arg_table s begin
"--urls"
help="either a url to start at or a set of urls to start visiting"
required=true
"--breadth-first", "-b"
help="a flag for whether the crawler should search breadth first rather than depth first"
action=:store_true
"--num_iterations", "-i"
help="the number of iterations to run the crawler for"
arg_type=Int
default=10
end
return parse_args(s)
end
function setupCrawler(urls, b=true)
return Crawler(urls, b)
end
# at some point we should add functionality like a place to put the saved pages
# not sure where we should put the save files either, to be honest,
# as that seems quite important really (a rough sketch of saving follows below)
# and eventually a scheduler for having multiple parallel crawlers, as they don't need to interact
# julia should be good at that too, hopefully!
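# the save branch in crawl() is still empty, so here's a rough sketch of one way to dump
# crawler.content to disk. the directory name and the filename scheme are arbitrary choices,
# nothing in the notebook pins them down
function saveContent(crawler::Crawler, directory::AbstractString="crawled_pages")
    isdir(directory) || mkdir(directory)
    for (url, html) in crawler.content
        # sanitise the url into something the filesystem will accept
        fname = replace(url, r"[^A-Za-z0-9]", "_") * ".html"   # 0.6 syntax; pair syntax on 1.x
        open(joinpath(directory, fname), "w") do f
            write(f, html)
        end
    end
end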
function main()
#parsed_args= parse_commandline()
#now we get our args
# urls = parsed_args["urls"]
#breadth_first = parsed_args["breadth-first"]
#we generate the crawler, and then roll, baby
#num_iterations = parsed_args["num_iterations"]
#obviously in the notebook we have no arguments to parse, so we just do this
urls="http://stackoverflow.com"
breadth_first=true
crawler= Crawler(urls, breadth_first)
num_iterations = 20
crawl(crawler, num_iterations)
end
#main()
In [ ]:
a = 2
if a ==2
print("yes")
end
for i in 1:10
print(i)
print(typeof(i))
end
In [ ]:
Pkg.add("HTTP")
using HTTP
url="https://stackoverflow.com/questions/37360340/how-to-pass-dict-as-the-argument-to-method-julia"
response = HTTP.get(url)
println(String(response.body)) # response.body is raw bytes; String() decodes it for reading
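# a couple of other useful pokes at the response object
# (HTTP.header is assumed to be available in the installed HTTP.jl version):
println(response.status)                        # should be 200 on success, like successCode above
println(HTTP.header(response, "Content-Type"))  # e.g. "text/html; charset=utf-8"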
In [ ]: