In [1]:
download("http://www.gutenberg.org/cache/epub/100/pg100.txt", "shakespeare.txt")


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1050  100  1050    0     0   5326      0 --:--:-- --:--:-- --:--:--  5357
Out[1]:
"bookText.txt"

In [3]:
;ls -l shakespeare.txt


-rw-r--r-- 1 ec2-user ec2-user 5582295 Nov  9 06:18 shakespeare.txt

In [2]:
# "Map" function.
# Takes a string. Returns a HashTable with the number of times each word 
# appears in that string.
function wordcount(text)
    words=split(text,(' ','\n','\t','-','.',',',':','_','"',';','!'),false)
    counts=HashTable()
    for w = words
        counts[w]=get(counts,w,0)+1
    end
    return counts
end


Out[2]:
wordcount (generic function with 1 method)
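
As a quick sanity check, the map step can be tried on a short literal string (an illustrative cell, not part of the original session):

In [ ]:
# Expected result: Dict("to"=>2, "be"=>2, "or"=>1, "not"=>1)
wordcount("to be or not to be")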

In [4]:
# "Reduce" function.
# Takes a collection of HashTables in the format returned by wordcount()
# Returns a HashTable in which words that appear in multiple inputs
# have their totals added together.
function wcreduce(wcs)
    counts=HashTable()
    for c = wcs
        for (k,v)=c
            counts[k] = get(counts,k,0)+v
        end
    end
    return counts
end


Out[4]:
wcreduce (generic function with 1 method)
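
For example, reducing the counts from two chunks merges their totals (again an illustrative cell):

In [ ]:
# Expected result: Dict("to"=>2, "be"=>2, "not"=>1)
wcreduce([wordcount("to be"), wordcount("not to be")])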

In [6]:
# Splits the input string into nprocs() chunks of roughly equal numbers of
# lines (the last chunk may be shorter), and spawns wordcount() on each chunk
# so the chunks run in parallel on the available processes. Then fetch()es
# the results and combines them with wcreduce().
# Limitations: splitting the string and the reduction step run on one process.
using Distributed

function parallel_wordcount(text)
    lines = split(text, '\n'; keepempty=false)
    np = nprocs()
    unitsize = ceil(Int, length(lines) / np)
    wcounts = Dict{String,Int}[]
    rrefs = Future[]
    # spawn one wordcount task per chunk
    for i in 1:np
        lo = unitsize * (i - 1) + 1
        lo > length(lines) && break
        hi = min(unitsize * i, length(lines))
        subtext = join(lines[lo:hi], "\n")
        push!(rrefs, @spawnat :any wordcount(subtext))
    end
    # fetch results
    while !isempty(rrefs)
        push!(wcounts, fetch(pop!(rrefs)))
    end
    # reduce
    return wcreduce(wcounts)
end


Out[6]:
parallel_wordcount (generic function with 1 method)
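
Note that parallel_wordcount only pays off once worker processes exist and wordcount is defined on all of them. A minimal setup sketch (the worker count is arbitrary, and the file name is hypothetical):

In [ ]:
# Add workers, then make the map function visible on every process.
using Distributed
addprocs(4)                          # arbitrary worker count
@everywhere include("wordcount.jl")  # hypothetical file holding the definitions above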

In [9]:
# Takes the name of a result file and a list of input file names.
# Combines the contents of all files, then performs a parallel_wordcount
# on the resulting string. Writes the results to result_file.
# Limitation: performs all file IO on a single process.
function wordcount_files(result_file, input_file_names...)
    text = ""
    for f in input_file_names
        text = join([text, read(f, String)], "\n")
    end
    wc = parallel_wordcount(text)
    open(result_file, "w") do fo
        for (k, v) in wc
            println(fo, k, "=", v)
        end
    end
end


Out[9]:
wordcount_files (generic function with 1 method)
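
Putting it together on the downloaded file would look like this (the output file name is arbitrary):

In [ ]:
# End-to-end run; "counts.txt" is an arbitrary output file name.
wordcount_files("counts.txt", "shakespeare.txt")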

In [30]:
text=""
fh=open("bookText.txt")
text=join( {text,readall(fh)}, "\n" )
close(fh)

text


Out[30]:
"\n<!DOCTYPE HTML>\n<html><head><title>Error 403</title></head>\n<body>\n<h1>Error 403</h1>\n\n<p>To fix this error:</p>\n\n<ul>\n  <li>Don't use anonymizers, open proxies, VPNs, or TOR to access Project Gutenberg.</li>\n  <li>Don't access Project Gutenberg from hosted servers.</li>\n  <li>Don't use automated software to download lots of books. We have a limit on how fast you can go while using this site. If you surpass this limit you get blocked for 24h.</li>\n  <li>We have a daily limit on how many books you can download. If you exceeded this limit you get blocked for 24h.</li>\n</ul>\n\n<p>\nIf you are sure that none of the above applies to you, \nand wish us to investigate the problem,\nwe need to know your IP address.\nGo to <a href=\"http://www.whatismyip.com/\">this site</a>,\ndon't sign up, \njust copy the IP address \n(it looks like: 12.34.56.78 but your numbers will be different)\nand\n<a href=\"mailto:webmaster@gutenberg.org?subject=403%20help\">mail it to us</a>.\nIf that page also shows a proxy address, we need that one instead. \n</p>\n\n</body>\n</html>\n"
