#!/usr/bin/env python
# Written by Weitao Zhou <zhouwtlord@gmail.com>
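"""Multi-threaded URL crawler and word counter.

Reads a list of URLs from an input file, fetches each page with a pool of
worker threads, counts the alphanumeric words on every page, and writes one
"word=count"-per-line file for each URL (named by the SHA-256 hash of the
URL) into the target directory.
"""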
import hashlib
import urllib2
import optparse
import os
import Queue
import re
import socket
import sys
import threading
import time
__version__ = '0.1'
USAGE = "%prog [options] <urls_input_file> <target_dir> <thread_num> <thread_delay> <timeout>"
ARGS_MIN_LENGTH = 5
VERSION = "%prog v" + __version__
#RE_PATTERN = r'[\w]{1,}'
RE_PATTERN = r'[a-zA-Z0-9]{1,}'
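# Illustrative note: RE_PATTERN extracts maximal runs of ASCII letters and digits,
# e.g. re.findall(RE_PATTERN, "Hello, C++11 world!") yields ['Hello', 'C', '11', 'world'].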
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
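# (The desktop-browser User-Agent above is spoofed, presumably so that servers
# do not block or alter responses for the default 'Python-urllib' agent.)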
class CrawlCount(threading.Thread):
    def __init__(self, queue, target, delay):
        threading.Thread.__init__(self)
        self.queue = queue
        self.target = target
        self.delay = delay

    def run(self):
        queue_non_empty = True
        while queue_non_empty:
            try:
                url = self.queue.get(False)
                fetcher = Fetcher(url)
                if not fetcher.fetch():
                    # Fetch failed; move on to the next URL
                    continue
                content = fetcher.content
                wordcount = WordCount(content, RE_PATTERN)
                wordcount.count()
                compose = ['%s=%s' % item for item in wordcount.word_count.items()]
                # TODO
                # 1. How to avoid duplicate filenames caused by duplicate URLs?
                # 2. Linux does not allow '/' in a filename, so hash the URL instead.
                filename = hashlib.sha256(url).hexdigest()
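                # Note: the SHA-256 hex digest is a fixed 64-character string with no
                # '/' characters, so it is always a valid filename; identical URLs
                # hash to the same name and therefore overwrite the same output file.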
                with open(os.path.join(self.target, filename), 'w') as outf:
                    outf.write('\n'.join(compose))
                time.sleep(self.delay)
            except Queue.Empty:
                queue_non_empty = False
class Fetcher(object):
    def __init__(self, url):
        self.url = url
        self.content = None

    def _addHeaders(self, request):
        for key, value in HEADERS.items():
            request.add_header(key, value)

    def open(self):
        url = self.url
        try:
            print 'Fetching url %s' % url
            request = urllib2.Request(url)
            handle = urllib2.build_opener()
        except IOError:
            return None
        return (request, handle)
    def fetch(self):
        opened = self.open()
        if not opened:
            # Building the request failed
            return False
        request, handle = opened
        self._addHeaders(request)
        try:
            self.content = unicode(handle.open(request).read(), "utf-8", errors="replace")
            return True
        except urllib2.HTTPError, error:
            if error.code == 404:
                print >> sys.stderr, "ERROR: %s -> %s" % (error, error.url)
            else:
                print >> sys.stderr, "ERROR: %s" % error
            return False
        except urllib2.URLError, error:
            print >> sys.stderr, "ERROR: %s" % error
            return False
        except socket.timeout, error:
            print >> sys.stderr, "ERROR: %s" % error
            return False
class WordCount(object):
    def __init__(self, text, re_pattern):
        self.re_pattern = re_pattern
        self.text = text
        self.word_count = dict()

    def pre_process(self):
        # Pre-process the input text:
        # 1. remove punctuation; 2. lower-case everything.
        # Note: given the interview deadline, tokenization, stemming, stop-word
        # removal and other NLP pre-processing steps are not applied here.
        word_list = self._remove_punctuation()
        return self._lower_case(word_list)
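        # Illustrative example: for the text "Hello, World!" this returns
        # ['hello', 'world'] -- punctuation is dropped and case is folded.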
    def _remove_punctuation(self):
        print 'Removing punctuation'
        word_list = re.findall(self.re_pattern, self.text)
        return word_list

    def _lower_case(self, word_list):
        print 'Lowering case'
        return [word.lower() for word in word_list]

    def count(self):
        print 'Word Counting'
        word_list = self.pre_process()
        for word in word_list:
            self.word_count[word] = self.word_count.get(word, 0) + 1
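        # Illustrative example: counting "To be, or not to be" leaves
        # {'to': 2, 'be': 2, 'or': 1, 'not': 1} in self.word_count.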
def parse_options(args_min_length):
    """parse_options() -> opts, args

    Parse any command-line options given, returning both the parsed
    options and the positional arguments.
    """
    parser = optparse.OptionParser(usage=USAGE, version=VERSION)
    opts, args = parser.parse_args()
    if len(args) < args_min_length:
        parser.print_help()
        raise SystemExit, 1
    return opts, args
def main():
    opts, args = parse_options(ARGS_MIN_LENGTH)
    urls_inf = args[0]
    target_dir = args[1]
    thread_num = int(args[2])
    thread_delay = int(args[3])
    socket_timeout = int(args[4])
    # Set the global socket timeout
    socket.setdefaulttimeout(socket_timeout)
    separator = ['='] * 20
    separator[10] = 'START'
    start_time = time.time()
    print ''.join(separator)
    q = Queue.Queue()
    with open(urls_inf, 'r') as urls:
        for url in urls:
            # Strip the trailing newline so the URL and its hash are clean
            url = url.strip()
            if url:
                q.put(url)
    threads = []
    # Spawn the worker threads
    for i in range(thread_num):
        t = CrawlCount(q, target_dir, thread_delay)
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    end_time = time.time()
    duration = end_time - start_time
    print 'Elapsed time: %s' % duration
    separator[10] = 'END'
    print ''.join(separator)


if __name__ == "__main__":
    main()
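# Example invocation (file names and parameter values are hypothetical):
#   python crawl_wordcount.py urls.txt ./output 4 1 10
# i.e. crawl the URLs listed in urls.txt with 4 worker threads, sleep 1 second
# between fetches, use a 10-second socket timeout, and write one word-count
# file per URL into ./output.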