In [92]:
import fileinput
import json
import csv
import sys
import urllib
import urlparse
import time
class TopsyCrawler:
    """
    Crawls the Topsy Otter search API
    """
    API_KEY = ""
    base_url = "http://otter.topsy.com/search.json?q="
    base_params = {}
    csvHeadersWritten = False

    def __init__(self, api_key):
        self.API_KEY = api_key

    def queryBuilder(self, **kwargs):
        #Reference: http://stackoverflow.com/a/2506477/2762836
        """
        Builds the complete query string used to query the Topsy API.
        Parameters:
            window defines the time frame to be searched for tweets:
                window=h  (last 1 hour)
                window=d  (last 1 day)
                window=d5 (last 5 days)
                window=w  (last 7 days)
                window=m  (last month)
                window=a  (all time)
        """
        self.base_params = kwargs
        url_parts = list(urlparse.urlparse(self.base_url))
        query = dict(urlparse.parse_qsl(url_parts[4]))
        query.update(self.base_params)
        url_parts[4] = urllib.urlencode(query)
        initial_url = urlparse.urlunparse(url_parts)
        print initial_url
        sys.stdout.flush()
        return initial_url
    def crawlUrl(self, url):
        """
        Gets the parsed json results from querying the url
        """
        return json.load(urllib.urlopen(url))  #parse the whole response body, not just its first line
    def fetchTweets(self, maxTweetNumber, delayPerRequest, writeFileHandle, folderName, **kwargs):
        """
        Fetches tweets until the number of tweets fetched exceeds 'maxTweetNumber'.
        'delayPerRequest' is the time in seconds to wait before making the next request.
        """
        #Build the first query
        url = self.queryBuilder(**kwargs)
        #Fetch the first page of results
        resultObj = self.crawlUrl(url)
        processedResult = ResultsProcessor(resultObj)
        self.writeJsonToCsv(processedResult, resultObj, writeFileHandle, 0, kwargs['q'], folderName)
        self.csvHeadersWritten = True
        offset = processedResult.offset
        nextOffset = offset + 10
        noOfTweetsFetched = len(processedResult.response['list'])
        while True:
            #Check if a condition to exit the loop is met
            if noOfTweetsFetched > maxTweetNumber:
                break
            if len(processedResult.response['list']) == 0:
                break
            #Wait for some time before the next request
            time.sleep(delayPerRequest)
            #Query the next page of results
            url = self.queryBuilder(apikey=self.base_params['apikey'], type=self.base_params['type'],
                                    window=self.base_params['window'], q=self.base_params['q'], offset=nextOffset)
            resultObj = self.crawlUrl(url)
            processedResult = ResultsProcessor(resultObj)
            self.writeJsonToCsv(processedResult, resultObj, writeFileHandle, nextOffset, kwargs['q'], folderName)
            #Bookkeeping for processing the next result
            nextOffset = nextOffset + 10
            noOfTweetsFetched = len(processedResult.response['list']) + noOfTweetsFetched
    def writeJsonToCsv(self, jsonData, jsonRawData, writeFile, offset, queryTags, folderName):
        """
        Writes tweet data out.
        If 'writeFile' is falsy, tab-separated rows are printed to stdout;
        otherwise the raw json response is dumped to a file under 'folderName'.
        """
        if not writeFile:
            columnNames = ['hits', 'firstpost_date', 'title', 'url', 'trackback_date', 'trackback_total',
                           'url_expansions', 'target_birth_date', 'content', 'mytype', 'score',
                           'topsy_author_img', 'trackback_permalink', 'trackback_author_url', 'highlight',
                           'topsy_author_url', 'topsy_trackback_url', 'trackback_author_name', 'trackback_author_nick']
            if not self.csvHeadersWritten:
                #Write the column names at the top of the output
                print "\t".join(columnNames)
            for tweet in jsonData.response['list']:
                for column in columnNames:
                    if type(tweet[column]) == unicode:
                        if column == 'trackback_author_nick':
                            #This is the last column, so end the line here to start the next tweet on a new one
                            print repr(tweet[column].encode('utf-8')) + "\t"
                        else:
                            print repr(tweet[column].encode('utf-8')) + "\t",
                    else:
                        print str(tweet[column]) + "\t",
        else:
            #Dump the raw json output to a file named after the query tags and the offset
            myfp = open(folderName + "/" + "_".join(queryTags.lower().split()) + "_" + str(offset) + ".json", "w")
            json.dump(jsonRawData, myfp, indent=4, sort_keys=False, separators=(',', ': '))
            myfp.close()
class ResultsProcessor:
    """
    Performs operations on the json results returned by TopsyCrawler
    """
    resultsJsonDictionary = {}
    request = {}
    response = {}
    page = 0
    window = ""  #time frame the data was fetched from: last hour (h), day (d), 5 days (d5), week (w), month (m) or all time (a)
    offset = 0
    hidden = 0
    total = 0
    last_offset = 0

    def __init__(self, resultsJson):
        self.resultsJsonDictionary = resultsJson  #already-parsed json dictionary
        self.request = self.resultsJsonDictionary['request']
        self.response = self.resultsJsonDictionary['response']
        self.page = self.resultsJsonDictionary['response']['page']
        self.window = self.resultsJsonDictionary['response']['window']
        self.offset = self.resultsJsonDictionary['response']['offset']
        self.hidden = self.resultsJsonDictionary['response']['hidden']
        self.total = self.resultsJsonDictionary['response']['total']
        self.last_offset = self.resultsJsonDictionary['response']['last_offset']
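queryBuilder leans on the standard-library urlparse/urllib round trip to merge keyword arguments into the query string of base_url. The cell below is a minimal standalone sketch of that technique (Python 2; the parameter values are placeholders, not a real key):
In [ ]:
#Standalone sketch of the url-building technique used in TopsyCrawler.queryBuilder (placeholder values)
import urllib
import urlparse
base_url = "http://otter.topsy.com/search.json?q="
params = {'apikey': 'YOUR_API_KEY', 'type': 'tweet', 'window': 'w', 'q': '#happy'}
url_parts = list(urlparse.urlparse(base_url))    #split the url into its six components
query = dict(urlparse.parse_qsl(url_parts[4]))   #existing query string -> dict
query.update(params)                             #merge in the new parameters
url_parts[4] = urllib.urlencode(query)           #re-encode the merged query string
print urlparse.urlunparse(url_parts)             #e.g. http://otter.topsy.com/search.json?apikey=...&q=%23happy&...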
In [1]:
if __name__ == "__main__":
    API_KEY = "09C43A9B270A470B8EB8F2946A9369F3"
    """
    #Example to examine tweets
    crawlerObj = TopsyCrawler(API_KEY)
    url = crawlerObj.queryBuilder(apikey=API_KEY, type='tweet', window='a', q='#happy #surprised')
    jsonResults = crawlerObj.crawlUrl(url)
    resultsObj = ResultsProcessor(jsonResults)
    """
    #Example to fetch multiple tweets and dump the raw json to the current directory
    crawlerObj = TopsyCrawler(API_KEY)
    crawlerObj.fetchTweets(10, 5, True, "./", apikey=API_KEY, type='tweet', window='a', q='شوف')
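Passing a falsy value for writeFileHandle switches writeJsonToCsv to its stdout path, printing tab-separated rows instead of dumping raw json files. A minimal sketch of that mode (query, limit and delay chosen arbitrarily):
In [ ]:
#Sketch: print tab-separated tweet rows to stdout instead of writing json files
crawlerObj = TopsyCrawler(API_KEY)
crawlerObj.fetchTweets(10, 5, False, "./", apikey=API_KEY, type='tweet', window='w', q='#happy')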
In [ ]:
test = json.load(open("hash_happy_hash_suprised.json","r"))
In [ ]:
test.keys()
In [93]:
crawlerObj = TopsyCrawler(API_KEY)
crawlerObj.fetchTweets(10, 5, True, ".", apikey=API_KEY, type='tweet', window='a', q='#happy #surprised')
In [86]:
print "hello"
In [1]:
folders = ["#happily happy",
"happily #happy",
"happily happy",
"#happily #sad",
"happily #sad",
"#happily sad",
"happily sad",
"#happily #angry",
"#happily angry",
"happily #angry",
"happily angry",
"#happily #fearful",
"happily #fearful",
"#happily fearful",
"happily fearful",
"#happily #surprised",
"happily #surprised",
"#happily surprised",
"happily surprised",
"#happily #disgusted",
"happily #disgusted",
"#happily disgusted",
"happily disgusted",
"#surprisingly #happy",
"surprisingly #happy",
"#surprisingly happy",
"surprisingly happy",
"#surprisingly #sad",
"surprisingly #sad",
"#surprisingly sad",
"surprisingly sad",
"#surprisingly #angry",
"surprisingly angry",
"surprisingly #angry",
"surprisingly angry",
"#surprisingly #fearful",
"surprisingly #fearful",
"#surprisingly fearful",
"surprisingly fearful",
"#surprisingly #surprised",
"surprisingly #surprised",
"#surprisingly surprised",
"surprisingly surprised",
"#surprisingly #disgusted",
"surprisingly #disgusted",
"#surprisingly disgusted",
"surprisingly disgusted",
"#sadly #sad",
"sadly #sad",
"#sadly sad",
"sadly sad",
"#sadly #happy",
"sadly #happy",
"#sadly happy",
"sadly happy",
"#sadly #angry",
"sadly angry",
"sadly #angry",
"sadly angry",
"#sadly #fearful",
"sadly #fearful",
"#sadly fearful",
"sadly fearful",
"#sadly #surprised",
"sadly #surprised",
"#sadly surprised",
"sadly surprised",
"#sadly #disgusted",
"sadly #disgusted",
"#sadly disgusted",
"sadly disgusted",
"#angrily #angry",
"angrily angry",
"angrily #angry",
"angrily angry",
"#angrily #sad",
"angrily #sad",
"#angrily sad",
"angrily sad",
"#angrily #happy",
"angrily #happy",
"#angrily happy",
"angrily happy",
"#angrily #fearful",
"angrily #fearful",
"#angrily fearful",
"angrily fearful",
"#angrily #surprised",
"angrily #surprised",
"#angrily surprised",
"angrily surprised",
"#angrily #disgusted",
"angrily #disgusted",
"#angrily disgusted",
"angrily disgusted",
"#fearfully #fearful",
"fearfully #fearful",
"#fearfully fearful",
"fearfully fearful",
"#fearfully #angry",
"fearfully angry",
"fearfully #angry",
"fearfully angry",
"#fearfully #sad",
"fearfully #sad",
"#fearfully sad",
"fearfully sad",
"#fearfully #happy",
"fearfully #happy",
"#fearfully happy",
"fearfully happy",
"#fearfully #surprised",
"fearfully #surprised",
"#fearfully surprised",
"fearfully surprised",
"#fearfully #disgusted",
"fearfully #disgusted",
"#fearfully disgusted",
"fearfully disgusted",
"#disgusted #disgusted",
"disgusted #disgusted",
"#disgusted disgusted",
"disgusted disgusted",
"#disgusted #angry",
"disgusted angry",
"disgusted #angry",
"disgusted angry",
"#disgusted #sad",
"disgusted #sad",
"#disgusted sad",
"disgusted sad",
"#disgusted #happy",
"disgusted #happy",
"#disgusted happy",
"disgusted happy",
"#disgusted #fearful",
"disgusted #fearful",
"#disgusted fearful",
"disgusted fearful",
"#disgusted #surprised",
"disgusted #surprised",
"#disgusted surprised",
"disgusted surprised"]
In [13]:
for folder in folders:
    query = folder
    path = "'" + "_".join(folder.split()) + "'"
    #print query
    #Emit a shell command that runs the crawler for this query and logs its console output
    print "nice python topsy-crawler.py 1000 10 True CrawledData/" + path.strip("'") + " '" + query + "'" + " > logs/console_log_" + path.strip("'") + ".txt"
    #print path
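For the first entry in folders ("#happily happy") the loop prints:
nice python topsy-crawler.py 1000 10 True CrawledData/#happily_happy '#happily happy' > logs/console_log_#happily_happy.txt
Here topsy-crawler.py is assumed to be a thin command-line wrapper around TopsyCrawler.fetchTweets taking maxTweetNumber, delayPerRequest, writeFileHandle, the output folder and the query, and the CrawledData/ and logs/ directories are assumed to already exist.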
In [9]:
#imports
import sys
import json
import os
In [10]:
#Read file from directory
#dirName = sys.argv[1]
dirName = "./CrawledData-29May-1week/"
In [11]:
#Read the json contents of each crawled directory and convert them to tab-separated files
directories = os.listdir(dirName)
for directory in directories:
    #Collect the rows from every json file in the current directory
    listOfDataRows = []
    try:
        for jsonFile in os.listdir(dirName.strip("/") + "/" + directory):
            fileName = dirName.strip("/") + "/" + directory + "/" + jsonFile
            jsonData = json.load(open(fileName, "r"))
            tabData = tabSeparatedData(jsonData)
            if tabData:
                listOfDataRows.extend(tabData)
        #Now write this directory's data to file
        fh = open("./CrawledData-29May-1week-CSV/" + directory + ".csv", "w")
        fh.write("firstpost_date\turl\turl_expansion_data_url\turl_expansion_data_topsy_expanded_url\tcontent\ttrackback_author_nick\ttrackback_author_name\n")
        for row in listOfDataRows:
            fh.write(row.encode('utf-8'))
            fh.write("\n")
        fh.close()
    except:
        print "Error:", sys.exc_info()[1]
        continue
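To sanity-check the output, one of the generated tab-separated files can be read back with the csv module; the file name below is hypothetical and assumes the directory names follow the "#happily_happy" pattern used above:
In [ ]:
#Sketch: read back one generated file and count its rows (file name is hypothetical)
import csv
fh = open("./CrawledData-29May-1week-CSV/#happily_happy.csv", "rb")
reader = csv.reader(fh, delimiter="\t")
header = reader.next()    #first row holds the column names
rows = list(reader)
fh.close()
print header
print len(rows), "data rows"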
In [5]:
#Convert each json result into tab-separated rows
def tabSeparatedData(data):
    listOfDataRows = []
    try:
        for item in data['response']['list']:
            dataRow = str(item['firstpost_date'])
            dataRow = dataRow + "\t" + item['url']
            #if url expansions exist:
            try:
                url_expansion_data_url = item['url_expansions']['url'].encode("utf-8")
            except:
                url_expansion_data_url = "Not Available"
            try:
                url_expansion_data_topsy_expanded_url = item['url_expansions']['topsy_expanded_url'].encode("utf-8")
            except:
                url_expansion_data_topsy_expanded_url = "Not Available"
            dataRow = dataRow + "\t" + url_expansion_data_url.strip("\t")
            dataRow = dataRow + "\t" + url_expansion_data_topsy_expanded_url.strip("\t")
            try:
                dataRow = dataRow + "\t" + repr(item['content']).encode("utf-8")
            except:
                dataRow = dataRow + "\t" + repr(item['content'])
            try:
                dataRow = dataRow + "\t" + item['trackback_author_nick'].encode("utf-8")
            except:
                dataRow = dataRow + "\t" + item['trackback_author_nick']
            try:
                dataRow = dataRow + "\t" + item['trackback_author_name'].encode("utf-8")
            except:
                dataRow = dataRow + "\t" + item['trackback_author_name']
            listOfDataRows.append(dataRow)
    except:
        listOfDataRows = None
    return listOfDataRows
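A single-file usage sketch of the helper above; the path is hypothetical but follows the naming scheme that writeJsonToCsv uses for its dumps:
In [ ]:
#Sketch: convert one crawled json file to tab-separated rows (path is hypothetical)
sample = json.load(open("./CrawledData-29May-1week/#happily_happy/#happily_happy_0.json", "r"))
rows = tabSeparatedData(sample)
if rows:
    print rows[0]    #first tab-separated row
else:
    print "no rows parsed"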
In [ ]: