In [1]:
# set up environment
import numpy as np
import pandas as pd
Your task is as follows: find the time and value of the maximum load for each region in the 2013 ERCOT hourly load data, and write the result out as a pipe-delimited CSV file.
Please see the test function for the expected return format
In [2]:
# read data from local file system
data = pd.read_excel("2013_ERCOT_Hourly_Load_Data.xls")
data.head()
Out[2]:
In [3]:
data.dtypes
Out[3]:
In [4]:
data["COAST"].describe()
Out[4]:
In [5]:
print(data["COAST"].max(), data["COAST"].min(), np.mean(data["COAST"]))
In [6]:
# .ix is deprecated; use .loc with a boolean mask instead
coast_max = data[["Hour_End", "COAST"]].loc[data["COAST"] == np.max(data["COAST"])]
coast_max
Out[6]:
In [7]:
coast_min = data[["Hour_End", "COAST"]].loc[data["COAST"] == np.min(data["COAST"])]
coast_min
Out[7]:
In [8]:
coast_max.values
Out[8]:
In [9]:
coast_min.values
Out[9]:
In [10]:
# To experiment with this code freely you will have to run this code locally.
# Take a look at the main() function for an example of how to use the code.
# We have provided example json output in the other code editor tabs for you to
# look at, but you will not be able to run any queries through our UI.
import json
import requests
BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"
# query parameters are given to the requests.get function as a dictionary; this
# variable contains some starter parameters.
query_type = { "simple": {},
"atr": {"inc": "aliases+tags+ratings"},
"aliases": {"inc": "aliases"},
"releases": {"inc": "releases"}}
def query_site(url, params, uid="", fmt="json"):
    # This is the main function for making queries to the musicbrainz API.
    # A json document should be returned by the query.
    params["fmt"] = fmt
    r = requests.get(url + uid, params=params)
    print("requesting", r.url)
    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()

def query_by_name(url, params, name):
    # This adds an artist name to the query parameters before making
    # an API call to the function above.
    params["query"] = "artist:" + name
    return query_site(url, params)

def pretty_print(data, indent=4):
    # After we get our output, we can format it to be more readable
    # by using this function.
    if type(data) == dict:
        print(json.dumps(data, indent=indent, sort_keys=True))
    else:
        print(data)
def get_info(band):
    '''
    Modify the function calls and indexing below to answer the questions on
    the next quiz. HINT: Note how the output we get from the site is a
    multi-level JSON document, so try making print statements to step through
    the structure one level at a time or copy the output to a separate output
    file.
    '''
    results = query_by_name(ARTIST_URL, query_type["simple"], band)
    pretty_print(results)

    artist_id = results["artists"][1]["id"]
    print("\nARTIST:")
    pretty_print(results["artists"][1])

    artist_data = query_site(ARTIST_URL, query_type["releases"], artist_id)
    releases = artist_data["releases"]
    print("\nONE RELEASE:")
    pretty_print(releases[0], indent=2)

    release_titles = [r["title"] for r in releases]
    print("\nALL TITLES:")
    for t in release_titles:
        print(t)
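To experiment with the MusicBrainz helpers above, you can simply call get_info with an artist name; the band chosen below is an arbitrary example, not part of the exercise.
In [ ]:
# example call; any artist name can be substituted here
get_info("Nirvana")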
Your task is to process the supplied file using the csv module to extract data from it. The data comes from the NREL (National Renewable Energy Laboratory) website. Each file contains information from one meteorological station, in particular the amount of solar and wind energy for each hour of the day.
Note that the first line of the data file is neither a data entry nor a header; it describes the data source. You should extract the name of the station from it.
The data should be returned as a list of lists (not dictionaries). You can use the csv module's reader() function to get the data in that format; next() is also useful for advancing the reader one line at a time. You should only change the parse_file function.
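The cells below explore the file with pandas instead, but a minimal csv-module sketch of parse_file might look like the following; the position of the station name within the first line is an assumption here.
In [ ]:
import csv

def parse_file(datafile):
    # Sketch only: the first line describes the data source, the second line
    # is the real header, and everything after that is data rows.
    data = []
    with open(datafile, "r") as f:
        reader = csv.reader(f)
        first_line = next(reader)   # line describing the data source
        name = first_line[1]        # assumed position of the station name
        next(reader)                # skip the header row
        for row in reader:
            data.append(row)
    return (name, data)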
In [116]:
# parse_dates=[[0, 1]] combines the first two columns (date and time) into a single datetime column
data2 = pd.read_csv("745090.csv", header=1, parse_dates=[[0, 1]])
data2.head()
Out[116]:
In [12]:
data2.dtypes
Out[12]:
In [13]:
data2.rename(index=str, columns={"Date (MM/DD/YYYY)_Time (HH:MM)": "Date"}, inplace=True)
In [14]:
data3 = pd.read_excel("2013_ERCOT_Hourly_Load_Data.xls")
data3.head()
Out[14]:
In [58]:
data3.apply(np.max)
Out[58]:
In [60]:
data3[data3.columns[1:]].idxmax()
Out[60]:
In [90]:
data3["COAST"].max()
Out[90]:
In [89]:
data3["COAST"].idxmax()
Out[89]:
In [107]:
data3["Hour_End"].iloc[5391]
Out[107]:
In [108]:
def get_maxload_per_station(data):
    """
    Retrieve the maximum load per station and the corresponding timestamp.
    """
    # create empty list
    result = []
    # loop over columns
    for column in data.columns[1:]:
        # get max value and timestamp
        max_value = data[column].max()
        max_pos = data[column].idxmax()
        timestamp = data["Hour_End"].iloc[max_pos]
        # add values to list
        result.append([column, timestamp.year, timestamp.month, timestamp.day, timestamp.hour, max_value])
    # return result
    return pd.DataFrame(result, columns=["station", "year", "month", "day", "hour", "load"])
In [112]:
output = get_maxload_per_station(data3)
output.head()
Out[112]:
In [115]:
# write file to local file system
#output.to_csv("output.csv", sep="|")
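If you uncomment and run the to_csv call above, a quick way to sanity-check the pipe-delimited file is to read it back in; this is just a sketch using the same output.csv filename.
In [ ]:
# read the pipe-delimited file back in to verify it was written correctly
pd.read_csv("output.csv", sep="|", index_col=0).head()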
This exercise shows some important concepts that you should be aware of: authenticating against a web API with a key, paging through results with an offset parameter, and writing Unicode output with the codecs module.
To run this code locally you have to register at the NYTimes developer site and get your own API key. You will be able to complete this exercise in our UI without doing so, as we have provided a sample result.
Your task is to process the saved file that represents the most popular articles (by view count) from the last day, and return two things: a list of dictionaries mapping each article's section to its title, and a list of URLs for all media entries whose format is "Standard Thumbnail".
All your changes should be in the article_overview function. The rest of the functions are provided for your convenience, in case you want to access the API yourself.
In [ ]:
import json
import codecs
import requests
URL_MAIN = "http://api.nytimes.com/svc/"
URL_POPULAR = URL_MAIN + "mostpopular/v2/"
API_KEY = { "popular": "",
"article": ""}
def get_from_file(kind, period):
    filename = "popular-{0}-{1}.json".format(kind, period)
    with open(filename, "r") as f:
        return json.loads(f.read())

def article_overview(kind, period):
    data = get_from_file(kind, period)
    titles = []
    urls = []
    for article in data:
        section = article["section"]
        title = article["title"]
        titles.append({section: title})
        if "media" in article:
            for m in article["media"]:
                for mm in m["media-metadata"]:
                    if mm["format"] == "Standard Thumbnail":
                        urls.append(mm["url"])
    return (titles, urls)
def query_site(url, target, offset):
    # This will set up the query with the API key and offset.
    # Web services often use an offset parameter to return data in small chunks;
    # the NYTimes API returns 20 articles per request, so to get the next 20
    # you have to provide the offset parameter.
    if API_KEY["popular"] == "" or API_KEY["article"] == "":
        print("You need to register for a NYTimes Developer account to run this program.")
        print("See the instructor notes for information.")
        return False
    params = {"api-key": API_KEY[target], "offset": offset}
    r = requests.get(url, params=params)
    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()
def get_popular(url, kind, days, section="all-sections", offset=0):
    # This function constructs the query according to the requirements of the site
    # and returns the data, or prints an error message if called incorrectly.
    if days not in [1, 7, 30]:
        print("Time period can be 1, 7, or 30 days only")
        return False
    if kind not in ["viewed", "shared", "emailed"]:
        print("kind can only be one of viewed/shared/emailed")
        return False
    url += "most{0}/{1}/{2}.json".format(kind, section, days)
    data = query_site(url, "popular", offset)
    return data
def save_file(kind, period):
    # This will process all results by calling the API repeatedly with the supplied
    # offset value, combine the data, and then write all results to a file.
    data = get_popular(URL_POPULAR, kind, period)
    num_results = data["num_results"]
    full_data = []
    with codecs.open("popular-{0}-{1}.json".format(kind, period), encoding='utf-8', mode='w') as v:
        for offset in range(0, num_results, 20):
            data = get_popular(URL_POPULAR, kind, period, offset=offset)
            full_data += data["results"]
        v.write(json.dumps(full_data, indent=2))
def test():
    titles, urls = article_overview("viewed", 1)
    assert len(titles) == 20
    assert len(urls) == 30
    assert titles[2] == {'Opinion': 'Professors, We Need You!'}
    assert urls[20] == 'http://graphics8.nytimes.com/images/2014/02/17/sports/ICEDANCE/ICEDANCE-thumbStandard.jpg'
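With valid API keys filled in above, the intended flow is to fetch and save the results first and then summarize them; without keys, the provided sample file can be used directly. This is a sketch, assuming popular-viewed-1.json is available locally.
In [ ]:
# save_file("viewed", 1)                       # requires a registered NYTimes API key
titles, urls = article_overview("viewed", 1)   # reads the local popular-viewed-1.json file
test()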