In [1]:
from pyspark import SparkConf, SparkContext

# Run Spark locally on a single thread and name the job for the Spark UI.
conf = SparkConf().setMaster("local").setAppName("MinTemperatures")
sc = SparkContext(conf=conf)
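A note on the master URL: "local" runs Spark in-process on a single thread; "local[*]" (standard Spark master syntax) would use every core on the machine.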
In [2]:
def parseLine(line):
    """Parse one CSV record into (stationID, entryType, temperature in degrees C)."""
    fields = line.split(',')
    stationID = fields[0]
    entryType = fields[2]
    temperature = float(fields[3]) * 0.1  # raw values are tenths of a degree Celsius
    return (stationID, entryType, temperature)
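A quick sanity check of the parser. The sample record below is illustrative only, assuming 1800.csv follows the GHCN-Daily layout of station ID, date, entry type, and a value in tenths of a degree:

In [ ]:
# Hypothetical sample record (GHCN-Daily style, not taken from the actual file)
sample = "ITE00100554,18000101,TMIN,-148,,,E,"
parseLine(sample)  # -> ('ITE00100554', 'TMIN', -14.8)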
In [5]:
lines = sc.textFile("file:///opt/ipython/work/example2/1800.csv")
parsedLines = lines.map(parseLine)  # RDD of (stationID, entryType, temperature)
In [6]:
parsedLines.take(5)
Out[6]:
In [7]:
minTemps = parsedLines.filter(lambda x: "TMIN" in x[1])  # keep only minimum-temperature records
minTemps.take(5)
Out[7]:
In [8]:
stationTemps = minTemps.map(lambda x: (x[0], x[2]))  # drop entryType, keep (stationID, temperature)
stationTemps.take(5)
Out[8]:
In [9]:
minTemps = stationTemps.reduceByKey(lambda x, y: min(x, y))  # lowest reading per station
results = minTemps.collect()
In [10]:
for result in results:
    print(result[0] + "\t{:.2f}C".format(result[1]))
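The same pipeline inverts naturally into a maximum-temperature query; a minimal sketch, assuming the file also carries TMAX entries (as GHCN-Daily files do):

In [ ]:
# Companion query: hottest reading per station, filtering on TMAX
# and reducing with max() instead of min().
maxTemps = parsedLines.filter(lambda x: x[1] == "TMAX") \
                      .map(lambda x: (x[0], x[2])) \
                      .reduceByKey(lambda x, y: max(x, y))
for stationID, temp in maxTemps.collect():
    print(stationID + "\t{:.2f}C".format(temp))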