In [92]:
import fileinput
import json
import csv
import sys
import urllib
import urlparse
import time
class TopsyCrawler:
    """
    Crawls the Topsy Otter search API
    """
    API_KEY = ""
    base_url = "http://otter.topsy.com/search.json?q="
    base_params = {}
    csvHeadersWritten = False

    def __init__(self, api_key):
        self.API_KEY = api_key

    def queryBuilder(self, **kwargs):
        #Reference: http://stackoverflow.com/a/2506477/2762836
        """
        Builds the complete query string used to query the Topsy API.
        Parameters:
            window defines the time frame to be searched for tweets:
                window=h  (last 1 hour)
                window=d  (last 1 day)
                window=d5 (last 5 days)
                window=w  (last 7 days)
                window=m  (last month)
                window=a  (all time)
        """
        self.base_params = kwargs
        url_parts = list(urlparse.urlparse(self.base_url))
        query = dict(urlparse.parse_qsl(url_parts[4]))
        query.update(self.base_params)
        url_parts[4] = urllib.urlencode(query)
        initial_url = urlparse.urlunparse(url_parts)
        print initial_url
        sys.stdout.flush()
        return initial_url
    def crawlUrl(self, url):
        """
        Gets the parsed json results from querying the url
        """
        return json.load(urllib.urlopen(url))  #parse the whole response body, not just its first line
    def fetchTweets(self, maxTweetNumber, delayPerRequest, writeFileHandle, folderName, **kwargs):
        """
        Fetches tweets until the number of tweets fetched exceeds 'maxTweetNumber'.
        'delayPerRequest' is the time in seconds to wait before making the next request.
        """
        #Build the first query
        url = self.queryBuilder(**kwargs)
        #Fetch the first page of results
        resultObj = self.crawlUrl(url)
        processedResult = ResultsProcessor(resultObj)
        self.writeJsonToCsv(processedResult, resultObj, writeFileHandle, 0, kwargs['q'], folderName)
        self.csvHeadersWritten = True
        offset = processedResult.offset
        nextOffset = offset + 10
        noOfTweetsFetched = len(processedResult.response['list'])
        while True:
            #Check if a condition to exit the loop is met
            if noOfTweetsFetched > maxTweetNumber:
                break
            if len(processedResult.response['list']) == 0:
                break
            #Wait for some time before the next request
            time.sleep(delayPerRequest)
            #Query the next page of results
            url = self.queryBuilder(apikey=self.base_params['apikey'], type=self.base_params['type'],
                                    window=self.base_params['window'], q=self.base_params['q'], offset=nextOffset)
            resultObj = self.crawlUrl(url)
            processedResult = ResultsProcessor(resultObj)
            self.writeJsonToCsv(processedResult, resultObj, writeFileHandle, nextOffset, kwargs['q'], folderName)
            #Bookkeeping for processing the next result
            nextOffset = nextOffset + 10
            noOfTweetsFetched = len(processedResult.response['list']) + noOfTweetsFetched
    def writeJsonToCsv(self, jsonData, jsonRawData, writeFile, offset, queryTags, folderName):
        """
        Writes tweet data out.
        If 'writeFile' is falsy, tab-separated rows are printed to stdout;
        otherwise the raw json response is dumped to a file under 'folderName'.
        """
        if not writeFile:
            columnNames = ['hits', 'firstpost_date', 'title', 'url', 'trackback_date', 'trackback_total',
                           'url_expansions', 'target_birth_date', 'content', 'mytype', 'score',
                           'topsy_author_img', 'trackback_permalink', 'trackback_author_url', 'highlight',
                           'topsy_author_url', 'topsy_trackback_url', 'trackback_author_name', 'trackback_author_nick']
            if not self.csvHeadersWritten:
                #Write the column names at the top of the output
                print "\t".join(columnNames)
            for tweet in jsonData.response['list']:
                for column in columnNames:
                    if type(tweet[column]) == unicode:
                        if column == 'trackback_author_nick':
                            #This is the last column, so end the line here to start the next tweet on a new one
                            print repr(tweet[column].encode('utf-8')) + "\t"
                        else:
                            print repr(tweet[column].encode('utf-8')) + "\t",
                    else:
                        print str(tweet[column]) + "\t",
        else:
            #Dump the raw json output to a file named after the query tags and the offset
            myfp = open(folderName + "/" + "_".join(queryTags.lower().split()) + "_" + str(offset) + ".json", "w")
            json.dump(jsonRawData, myfp, indent=4, sort_keys=False, separators=(',', ': '))
            myfp.close()
class ResultsProcessor:
    """
    Performs operations on the json results returned by TopsyCrawler
    """
    resultsJsonDictionary = {}
    request = {}
    response = {}
    page = 0
    window = ""  #time frame the data was fetched from: last hour (h), day (d), 5 days (d5), week (w), month (m) or all time (a)
    offset = 0
    hidden = 0
    total = 0
    last_offset = 0

    def __init__(self, resultsJson):
        self.resultsJsonDictionary = resultsJson  #already-parsed json dictionary
        self.request = self.resultsJsonDictionary['request']
        self.response = self.resultsJsonDictionary['response']
        self.page = self.resultsJsonDictionary['response']['page']
        self.window = self.resultsJsonDictionary['response']['window']
        self.offset = self.resultsJsonDictionary['response']['offset']
        self.hidden = self.resultsJsonDictionary['response']['hidden']
        self.total = self.resultsJsonDictionary['response']['total']
        self.last_offset = self.resultsJsonDictionary['response']['last_offset']
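queryBuilder leans on the standard-library urlparse/urllib round trip to merge keyword arguments into the query string of base_url. The cell below is a minimal standalone sketch of that technique (Python 2; the parameter values are placeholders, not a real key):
In [ ]:
#Standalone sketch of the url-building technique used in TopsyCrawler.queryBuilder (placeholder values)
import urllib
import urlparse
base_url = "http://otter.topsy.com/search.json?q="
params = {'apikey': 'YOUR_API_KEY', 'type': 'tweet', 'window': 'w', 'q': '#happy'}
url_parts = list(urlparse.urlparse(base_url))    #split the url into its six components
query = dict(urlparse.parse_qsl(url_parts[4]))   #existing query string -> dict
query.update(params)                             #merge in the new parameters
url_parts[4] = urllib.urlencode(query)           #re-encode the merged query string
print urlparse.urlunparse(url_parts)             #e.g. http://otter.topsy.com/search.json?apikey=...&q=%23happy&...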
In [1]:
if __name__ == "__main__":
    API_KEY = "09C43A9B270A470B8EB8F2946A9369F3"
    """
    #Example to examine tweets
    crawlerObj = TopsyCrawler(API_KEY)
    url = crawlerObj.queryBuilder(apikey=API_KEY, type='tweet', window='a', q='#happy #surprised')
    jsonResults = crawlerObj.crawlUrl(url)
    resultsObj = ResultsProcessor(jsonResults)
    """
    #Example to fetch multiple tweets and dump the raw json to the current directory
    crawlerObj = TopsyCrawler(API_KEY)
    crawlerObj.fetchTweets(10, 5, True, "./", apikey=API_KEY, type='tweet', window='a', q='شوف')
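Passing a falsy value for writeFileHandle switches writeJsonToCsv to its stdout path, printing tab-separated rows instead of dumping raw json files. A minimal sketch of that mode (query, limit and delay chosen arbitrarily):
In [ ]:
#Sketch: print tab-separated tweet rows to stdout instead of writing json files
crawlerObj = TopsyCrawler(API_KEY)
crawlerObj.fetchTweets(10, 5, False, "./", apikey=API_KEY, type='tweet', window='w', q='#happy')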
In [ ]:
test = json.load(open("hash_happy_hash_suprised.json","r"))
In [ ]:
test.keys()
In [93]:
crawlerObj = TopsyCrawler(API_KEY)
crawlerObj.fetchTweets(10, 5, True, ".", apikey=API_KEY, type='tweet', window='a', q='#happy #surprised')
In [86]:
print "hello"
In [1]:
folders = ["#happily happy",
"happily #happy",
"happily happy",
"#happily #sad",
"happily #sad",
"#happily sad",
"happily sad",
"#happily #angry",
"#happily angry",
"happily #angry",
"happily angry",
"#happily #fearful",
"happily #fearful",
"#happily fearful",
"happily fearful",
"#happily #surprised",
"happily #surprised",
"#happily surprised",
"happily surprised",
"#happily #disgusted",
"happily #disgusted",
"#happily disgusted",
"happily disgusted",
"#surprisingly #happy",
"surprisingly #happy",
"#surprisingly happy",
"surprisingly happy",
"#surprisingly #sad",
"surprisingly #sad",
"#surprisingly sad",
"surprisingly sad",
"#surprisingly #angry",
"surprisingly angry",
"surprisingly #angry",
"surprisingly angry",
"#surprisingly #fearful",
"surprisingly #fearful",
"#surprisingly fearful",
"surprisingly fearful",
"#surprisingly #surprised",
"surprisingly #surprised",
"#surprisingly surprised",
"surprisingly surprised",
"#surprisingly #disgusted",
"surprisingly #disgusted",
"#surprisingly disgusted",
"surprisingly disgusted",
"#sadly #sad",
"sadly #sad",
"#sadly sad",
"sadly sad",
"#sadly #happy",
"sadly #happy",
"#sadly happy",
"sadly happy",
"#sadly #angry",
"sadly angry",
"sadly #angry",
"sadly angry",
"#sadly #fearful",
"sadly #fearful",
"#sadly fearful",
"sadly fearful",
"#sadly #surprised",
"sadly #surprised",
"#sadly surprised",
"sadly surprised",
"#sadly #disgusted",
"sadly #disgusted",
"#sadly disgusted",
"sadly disgusted",
"#angrily #angry",
"angrily angry",
"angrily #angry",
"angrily angry",
"#angrily #sad",
"angrily #sad",
"#angrily sad",
"angrily sad",
"#angrily #happy",
"angrily #happy",
"#angrily happy",
"angrily happy",
"#angrily #fearful",
"angrily #fearful",
"#angrily fearful",
"angrily fearful",
"#angrily #surprised",
"angrily #surprised",
"#angrily surprised",
"angrily surprised",
"#angrily #disgusted",
"angrily #disgusted",
"#angrily disgusted",
"angrily disgusted",
"#fearfully #fearful",
"fearfully #fearful",
"#fearfully fearful",
"fearfully fearful",
"#fearfully #angry",
"fearfully angry",
"fearfully #angry",
"fearfully angry",
"#fearfully #sad",
"fearfully #sad",
"#fearfully sad",
"fearfully sad",
"#fearfully #happy",
"fearfully #happy",
"#fearfully happy",
"fearfully happy",
"#fearfully #surprised",
"fearfully #surprised",
"#fearfully surprised",
"fearfully surprised",
"#fearfully #disgusted",
"fearfully #disgusted",
"#fearfully disgusted",
"fearfully disgusted",
"#disgusted #disgusted",
"disgusted #disgusted",
"#disgusted disgusted",
"disgusted disgusted",
"#disgusted #angry",
"disgusted angry",
"disgusted #angry",
"disgusted angry",
"#disgusted #sad",
"disgusted #sad",
"#disgusted sad",
"disgusted sad",
"#disgusted #happy",
"disgusted #happy",
"#disgusted happy",
"disgusted happy",
"#disgusted #fearful",
"disgusted #fearful",
"#disgusted fearful",
"disgusted fearful",
"#disgusted #surprised",
"disgusted #surprised",
"#disgusted surprised",
"disgusted surprised"]
In [13]:
for folder in folders:
    query = folder
    path = "'" + "_".join(folder.split()) + "'"
    #print query
    #Emit a shell command that runs the crawler for this query and logs its console output
    print "nice python topsy-crawler.py 1000 10 True CrawledData/" + path.strip("'") + " '" + query + "'" + " > logs/console_log_" + path.strip("'") + ".txt"
    #print path
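For the first entry in folders ("#happily happy") the loop prints:
nice python topsy-crawler.py 1000 10 True CrawledData/#happily_happy '#happily happy' > logs/console_log_#happily_happy.txt
Here topsy-crawler.py is assumed to be a thin command-line wrapper around TopsyCrawler.fetchTweets taking maxTweetNumber, delayPerRequest, writeFileHandle, the output folder and the query, and the CrawledData/ and logs/ directories are assumed to already exist.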
In [9]:
#imports
import sys
import json
import os
In [10]:
#Read file from directory
#dirName = sys.argv[1]
dirName = "./CrawledData-29May-1week/"
In [11]:
#Read the json contents of each crawled directory and convert them to tab-separated files
directories = os.listdir(dirName)
for directory in directories:
    #Collect the rows from every json file in the current directory
    listOfDataRows = []
    try:
        for jsonFile in os.listdir(dirName.strip("/") + "/" + directory):
            fileName = dirName.strip("/") + "/" + directory + "/" + jsonFile
            jsonData = json.load(open(fileName, "r"))
            tabData = tabSeparatedData(jsonData)
            if tabData:
                listOfDataRows.extend(tabData)
        #Now write this directory's data to file
        fh = open("./CrawledData-29May-1week-CSV/" + directory + ".csv", "w")
        fh.write("firstpost_date\turl\turl_expansion_data_url\turl_expansion_data_topsy_expanded_url\tcontent\ttrackback_author_nick\ttrackback_author_name\n")
        for row in listOfDataRows:
            fh.write(row.encode('utf-8'))
            fh.write("\n")
        fh.close()
    except:
        print "Error:", sys.exc_info()[1]
        continue
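To sanity-check the output, one of the generated tab-separated files can be read back with the csv module; the file name below is hypothetical and assumes the directory names follow the "#happily_happy" pattern used above:
In [ ]:
#Sketch: read back one generated file and count its rows (file name is hypothetical)
import csv
fh = open("./CrawledData-29May-1week-CSV/#happily_happy.csv", "rb")
reader = csv.reader(fh, delimiter="\t")
header = reader.next()    #first row holds the column names
rows = list(reader)
fh.close()
print header
print len(rows), "data rows"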
In [5]:
#Convert each json result into tab-separated rows
def tabSeparatedData(data):
    listOfDataRows = []
    try:
        for item in data['response']['list']:
            dataRow = str(item['firstpost_date'])
            dataRow = dataRow + "\t" + item['url']
            #if url expansions exist:
            try:
                url_expansion_data_url = item['url_expansions']['url'].encode("utf-8")
            except:
                url_expansion_data_url = "Not Available"
            try:
                url_expansion_data_topsy_expanded_url = item['url_expansions']['topsy_expanded_url'].encode("utf-8")
            except:
                url_expansion_data_topsy_expanded_url = "Not Available"
            dataRow = dataRow + "\t" + url_expansion_data_url.strip("\t")
            dataRow = dataRow + "\t" + url_expansion_data_topsy_expanded_url.strip("\t")
            try:
                dataRow = dataRow + "\t" + repr(item['content']).encode("utf-8")
            except:
                dataRow = dataRow + "\t" + repr(item['content'])
            try:
                dataRow = dataRow + "\t" + item['trackback_author_nick'].encode("utf-8")
            except:
                dataRow = dataRow + "\t" + item['trackback_author_nick']
            try:
                dataRow = dataRow + "\t" + item['trackback_author_name'].encode("utf-8")
            except:
                dataRow = dataRow + "\t" + item['trackback_author_name']
            listOfDataRows.append(dataRow)
    except:
        listOfDataRows = None
    return listOfDataRows
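A single-file usage sketch of the helper above; the path is hypothetical but follows the naming scheme that writeJsonToCsv uses for its dumps:
In [ ]:
#Sketch: convert one crawled json file to tab-separated rows (path is hypothetical)
sample = json.load(open("./CrawledData-29May-1week/#happily_happy/#happily_happy_0.json", "r"))
rows = tabSeparatedData(sample)
if rows:
    print rows[0]    #first tab-separated row
else:
    print "no rows parsed"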
In [ ]: