In [3]:
# sc is already provided by this notebook session; creating it manually
# would look like the commented-out lines below.
# import pyspark
# from pyspark import SparkConf
# from pyspark import SparkContext
# sc = SparkContext(appName="Jon's PySpark")
sc
In [5]:
from utils import header
print(header.create_header(sc))
In [2]:
# Collect the 1970 observations for station 034700, one entry per month.
ob_rdds = []
for month in range(1, 13):
    dtg = "1970{:02}".format(month)
    obs = sc.textFile("/user/schiefjm/weather/gsod/1970")\
            .filter(lambda line: "STN" not in line)\
            .filter(lambda line: "034700" in line)\
            .filter(lambda line: dtg in line)
    # collect() materializes the filtered observations in a single pass;
    # obs.take(obs.count()) would evaluate the RDD twice
    ob_rdds.append([dtg, obs.collect()])
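A quick sanity check of the monthly results above (a sketch; assumes the previous cell has run):
In [ ]:
# Print how many observations station 034700 reported in each month of 1970,
# plus the start of the first line for that month as a spot check.
for dtg, lines in ob_rdds:
    sample = lines[0] if lines else "<no observations>"
    print("{}: {:3d} obs  e.g. {}".format(dtg, len(lines), sample[:60]))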
In [51]:
def get_stations(year):
    """Given a year from 1929 to 2009 inclusive, return the stations
    in the data set for that year.

    :input:
        year: a string or integer between 1929 and 2009 inclusive
    :output:
        a list of (station id, number of observations) tuples
    """
    # validate the year
    year = int(year)
    if 1929 <= year <= 2009:
        year = str(year)
    else:
        print("Please enter a valid year from 1929 to 2009.")
        return []
    # count the observations per station for the requested year
    return sc.textFile("/user/schiefjm/weather/gsod/" + year)\
             .filter(lambda line: "STN" not in line)\
             .map(lambda line: (line.split()[0], 1))\
             .reduceByKey(lambda x, y: x + y)\
             .collect()

for year in range(1929, 1932):
    stations = get_stations(year)
    total_obs = 0
    for station in stations:
        total_obs += int(station[1])
    print("\n{} has {} stations and {} total observations"
          .format(year, len(stations), total_obs))
    print("-" * 70)
    for station in stations[:5]:
        print("{}\t{}".format(station[0], station[1]))
In [ ]:
# classic word count; `file` is assumed to be an RDD of text lines,
# e.g. one created earlier with sc.textFile(...)
counts = file.flatMap(lambda line: line.split(" ")) \
             .map(lambda word: (word, 1)) \
             .reduceByKey(lambda a, b: a + b)
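One way to inspect the result (a sketch; assumes an RDD named file has been defined and the cell above has run):
In [ ]:
# Pull back the ten most frequent words without collecting the full RDD.
top_words = counts.takeOrdered(10, key=lambda pair: -pair[1])
for word, count in top_words:
    print("{}\t{}".format(word, count))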
In [1]:
# Easter egg: prints "The Zen of Python"
import this
In [2]:
# Easter egg: raises SyntaxError("not a chance") -- braces will never replace indentation
from __future__ import braces
In [ ]: