In [4]:
from mrjob.job import MRJob

class MRSortByString(MRJob):
    def mapper(self, _, line):
        """Emit the second column of each line as the key and the
        first column as the value, so records end up sorted by the
        second column."""
        l = line.split(' ')
        yield l[1], l[0]

    def reducer(self, key, val):
        yield key, [v for v in val][0]  # keep the first value for each key

if __name__ == '__main__':
    print "MRSortByString class stored in MRSortByString.py"
    #MRSortByString.run()
sortdata.txt
1 1
2 4
3 8
4 2
4 7
5 5
6 10
7 11
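As a quick sanity check (a sketch, not part of the original files), the mapper and reducer are plain generator methods, so they can be called directly on one line of sortdata.txt outside of any runner:

In [ ]:
from MRSortByString import MRSortByString

job = MRSortByString(args=[])
print list(job.mapper(None, '6 10'))        # [('10', '6')] -- the 2nd column becomes the key
print list(job.reducer('10', iter(['6'])))  # [('10', '6')] -- the first value is kept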
In [1]:
# -*- coding: utf-8 -*-
# Testing the MRSortByString job
from MRSortByString import MRSortByString
from mrjob.job import MRJob
'''
This is a simple wrapper that runs mrjob MapReduce jobs. The inputs are:
MRJobClass - the class of the job to be run
argsArr - an array of strings to be used when creating the MRJob
loc - where to run the job: 'local' (the default) or 'emr'
@author: Peter Harrington; if you have any questions: peter.b.harrington@gmail.com
'''
def runJob(MRJobClass, argsArr, loc='local'):
    if loc == 'emr':
        argsArr.extend(['-r', 'emr'])
    print "starting %s job on %s" % (MRJobClass.__name__, loc)
    mrJob = MRJobClass(args=argsArr)
    runner = mrJob.make_runner()
    runner.run()
    print "finished %s job" % MRJobClass.__name__
    return mrJob, runner
def runParallelJob(MRJobClass, argsArr):
    # TODO: add threading to allow jobs to run in parallel:
    # launch a new thread and call runJob(MRJobClass, argsArr) on it
    pass
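# One possible threaded version (a sketch, not from the original source):
# hand runJob to a worker thread from the standard threading module and
# return the Thread object so the caller can join() to wait for the job.
import threading

def runJobThreaded(MRJobClass, argsArr):
    t = threading.Thread(target=runJob, args=(MRJobClass, argsArr))
    t.start()
    return t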
if __name__ == '__main__':
    # pass in the file from outside
    # MRWordFrequencyCount.run()
    # set up the input file here; a raw string keeps the Windows path intact
    mr_job, runner = runJob(MRSortByString, [r"C:\data\sortdata.txt"], "local")
    print "Sorting sortdata.txt"
    for line in runner.stream_output():
        key, value = mr_job.parse_output_line(line)
        print "%s: %s" % (key, value)
Note that the keys (the second column of the data) are compared as strings, so '10' and '11' sort before '2'.
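The same job can be sent to Amazon Elastic MapReduce by passing loc='emr', which makes runJob append ['-r', 'emr'] to the argument list (this assumes AWS credentials are already configured for mrjob, e.g. in mrjob.conf):

In [ ]:
mr_job, runner = runJob(MRSortByString, [r"C:\data\sortdata.txt"], loc='emr')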