Lesson 1

Data Wrangling


In [1]:
# set up environment
import numpy as np
import pandas as pd

Quizzes

Your task is as follows:

  • read the provided Excel file
  • find and return the min, max and average values for the COAST region
  • find and return the time value for the min and max entries
  • the time values should be returned as Python tuples

Please see the test function for the expected return format


In [2]:
# read data from local file system
data = pd.read_excel("2013_ERCOT_Hourly_Load_Data.xls")
data.head()


Out[2]:
Hour_End COAST EAST FAR_WEST NORTH NORTH_C SOUTHERN SOUTH_C WEST ERCOT
0 2013-01-01 01:00:00 7606.263544 1073.892438 1411.750567 784.978166 10369.094390 2206.675077 4368.490945 882.931901 28704.077028
1 2013-01-01 02:00:00 7388.082714 1035.021938 1403.472287 776.307387 10152.358518 2159.733208 4233.587967 872.404750 28020.968769
2 2013-01-01 03:00:00 7178.867878 1036.088697 1395.053150 768.125748 9988.051418 2065.114706 4082.862860 868.853938 27383.018395
3 2013-01-01 04:00:00 7038.822581 1032.648841 1395.508820 770.937969 9946.658655 1990.903699 4010.489608 865.701201 27051.671374
4 2013-01-01 05:00:00 6990.857940 1042.823044 1401.216842 779.089313 10096.664190 1954.807585 4038.655997 879.924249 27184.039160

In [3]:
data.dtypes


Out[3]:
Hour_End    datetime64[ns]
COAST              float64
EAST               float64
FAR_WEST           float64
NORTH              float64
NORTH_C            float64
SOUTHERN           float64
SOUTH_C            float64
WEST               float64
ERCOT              float64
dtype: object

In [4]:
data["COAST"].describe()


Out[4]:
count     7295.000000
mean     10976.933461
std       2641.539915
min       6602.113899
25%       9125.688014
50%      10265.538541
75%      12586.700769
max      18779.025510
Name: COAST, dtype: float64

In [5]:
print(data["COAST"].max(), data["COAST"].min(), np.mean(data["COAST"]))


18779.02551 6602.113899 10976.933460679751

In [6]:
coast_max = data[["Hour_End", "COAST"]].loc[data["COAST"] == np.max(data["COAST"])]
coast_max


Out[6]:
Hour_End COAST
5391 2013-08-13 17:00:00 18779.02551

In [7]:
coast_min = data[["Hour_End", "COAST"]].loc[data["COAST"] == np.min(data["COAST"])]
coast_min


Out[7]:
Hour_End COAST
795 2013-02-03 04:00:00 6602.113899

In [8]:
coast_max.values


Out[8]:
array([[Timestamp('2013-08-13 17:00:00'), 18779.025510000003]], dtype=object)

In [9]:
coast_min.values


Out[9]:
array([[Timestamp('2013-02-03 04:00:00'), 6602.113898999982]], dtype=object)
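
Putting the pieces above together, here is a minimal sketch of the quiz logic. The dictionary keys used below ("maxvalue", "maxtime", and so on) are assumptions for illustration; check the provided test function for the exact return format it expects.

In [ ]:
def coast_summary(df):
    # Minimal sketch of the quiz logic using the steps shown above.
    # The return structure is an assumption; adapt it to the grader's test function.
    coast = df["COAST"]
    max_time = df.loc[coast.idxmax(), "Hour_End"]
    min_time = df.loc[coast.idxmin(), "Hour_End"]
    return {
        "maxvalue": coast.max(),
        "minvalue": coast.min(),
        "avgcoast": coast.mean(),
        # time values returned as Python tuples, as the task asks
        "maxtime": (max_time.year, max_time.month, max_time.day, max_time.hour),
        "mintime": (min_time.year, min_time.month, min_time.day, min_time.hour),
    }

coast_summary(data)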

Playing around with web services and requests


In [10]:
# To experiment with this code freely you will have to run this code locally.
# Take a look at the main() function for an example of how to use the code.
# We have provided example json output in the other code editor tabs for you to
# look at, but you will not be able to run any queries through our UI.
import json
import requests


BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"

# query parameters are given to the requests.get function as a dictionary; this
# variable contains some starter parameters.
query_type = {  "simple": {},
                "atr": {"inc": "aliases+tags+ratings"},
                "aliases": {"inc": "aliases"},
                "releases": {"inc": "releases"}}


def query_site(url, params, uid="", fmt="json"):
    # This is the main function for making queries to the musicbrainz API.
    # A json document should be returned by the query.
    params["fmt"] = fmt
    r = requests.get(url + uid, params=params)
    print("requesting", r.url)

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def query_by_name(url, params, name):
    # This adds an artist name to the query parameters before making
    # an API call to the function above.
    params["query"] = "artist:" + name
    return query_site(url, params)


def pretty_print(data, indent=4):
    # After we get our output, we can format it to be more readable
    # by using this function.
    if type(data) == dict:
        print(json.dumps(data, indent=indent, sort_keys=True))
    else:
        print(data)


def get_info(band):
    '''
    Modify the function calls and indexing below to answer the questions on
    the next quiz. HINT: Note how the output we get from the site is a
    multi-level JSON document, so try making print statements to step through
    the structure one level at a time or copy the output to a separate output
    file.
    '''
    results = query_by_name(ARTIST_URL, query_type["simple"], band)
    pretty_print(results)

    artist_id = results["artists"][1]["id"]
    print("\nARTIST:")
    pretty_print(results["artists"][1])

    artist_data = query_site(ARTIST_URL, query_type["releases"], artist_id)
    releases = artist_data["releases"]
    print("\nONE RELEASE:")
    pretty_print(releases[0], indent=2)
    release_titles = [r["title"] for r in releases]

    print("\nALL TITLES:")
    for t in release_titles:
        print(t)

Problem sets

Handle CSV files

Your task is to process the supplied file and use the csv module to extract data from it. The data comes from the NREL (National Renewable Energy Laboratory) website. Each file contains information from one meteorological station, in particular the amount of solar and wind energy for each hour of the day.

Note that the first line of the data file is neither a data entry nor a header; it is a line describing the data source. You should extract the name of the station from it.

The data should be returned as a list of lists (not dictionaries). You can use the csv module's reader() function to get the data in that format. Another useful method is next(), which returns the next line from the iterator. You should only change the parse_file function; a sketch of this approach follows below.
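
The cells below explore the file with pandas instead, but a sketch of the csv-based approach described above might look like this (the position of the station name within the first line is an assumption about the file layout):

In [ ]:
import csv

def parse_file(datafile):
    # Sketch of the csv-module approach described above.
    # The exact position of the station name in the first line is an assumption;
    # adjust the index to the actual layout of the data file.
    data = []
    with open(datafile, "r") as f:
        reader = csv.reader(f)
        first_line = next(reader)   # line describing the data source
        name = first_line[1]        # assumed position of the station name
        header = next(reader)       # the real column header
        for row in reader:
            data.append(row)        # data returned as a list of lists
    return name, data

# station_name, rows = parse_file("745090.csv")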


In [116]:
data2 = pd.read_csv("745090.csv", header=1, parse_dates=[[0,1]])
data2.head()


Out[116]:
Date (MM/DD/YYYY)_Time (HH:MM) ETR (W/m^2) ETRN (W/m^2) GHI (W/m^2) GHI source GHI uncert (%) DNI (W/m^2) DNI source DNI uncert (%) DHI (W/m^2) ... AOD (unitless) AOD source AOD uncert (code) Alb (unitless) Alb source Alb uncert (code) Lprecip depth (mm) Lprecip quantity (hr) Lprecip source Lprecip uncert (code)
0 01/01/2005 01:00 0 0 0 2 0 0 2 0 0 ... 0.099 F 8 0.16 F 8 0 1 A 7
1 01/01/2005 02:00 0 0 0 2 0 0 2 0 0 ... 0.099 F 8 0.16 F 8 0 1 A 7
2 01/01/2005 03:00 0 0 0 2 0 0 2 0 0 ... 0.099 F 8 0.16 F 8 0 1 A 7
3 01/01/2005 04:00 0 0 0 2 0 0 2 0 0 ... 0.099 F 8 0.16 F 8 0 1 A 7
4 01/01/2005 05:00 0 0 0 2 0 0 2 0 0 ... 0.099 F 8 0.16 F 8 0 1 A 7

5 rows × 67 columns


In [12]:
data2.dtypes


Out[12]:
Date (MM/DD/YYYY)_Time (HH:MM)     object
ETR (W/m^2)                         int64
ETRN (W/m^2)                        int64
GHI (W/m^2)                         int64
GHI source                          int64
GHI uncert (%)                      int64
DNI (W/m^2)                         int64
DNI source                          int64
DNI uncert (%)                      int64
DHI (W/m^2)                         int64
DHI source                          int64
DHI uncert (%)                      int64
GH illum (lx)                       int64
GH illum source                     int64
Global illum uncert (%)             int64
DN illum (lx)                       int64
DN illum source                     int64
DN illum uncert (%)                 int64
DH illum (lx)                       int64
DH illum source                     int64
DH illum uncert (%)                 int64
Zenith lum (cd/m^2)                 int64
Zenith lum source                   int64
Zenith lum uncert (%)               int64
TotCld (tenths)                     int64
TotCld source                      object
TotCld uncert (code)                int64
OpqCld (tenths)                     int64
OpqCld source                      object
OpqCld uncert (code)                int64
                                   ...   
RHum source                        object
RHum uncert (code)                  int64
Pressure (mbar)                     int64
Pressure source                    object
Pressure uncert (code)              int64
Wdir (degrees)                      int64
Wdir source                        object
Wdir uncert (code)                  int64
Wspd (m/s)                        float64
Wspd source                        object
Wspd uncert (code)                  int64
Hvis (m)                            int64
Hvis source                        object
Hvis uncert (code)                  int64
CeilHgt (m)                         int64
CeilHgt source                     object
CeilHgt uncert (code)               int64
Pwat (cm)                         float64
Pwat source                        object
Pwat uncert (code)                  int64
AOD (unitless)                    float64
AOD source                         object
AOD uncert (code)                   int64
Alb (unitless)                    float64
Alb source                         object
Alb uncert (code)                   int64
Lprecip depth (mm)                  int64
Lprecip quantity (hr)               int64
Lprecip source                     object
Lprecip uncert (code)               int64
dtype: object

In [13]:
data2.rename(index=str, columns={"Date (MM/DD/YYYY)_Time (HH:MM)": "Date"}, inplace=True)

Handle Excel files

Find the time and value of the maximum load for each of the regions COAST, EAST, FAR_WEST, NORTH, NORTH_C, SOUTHERN, SOUTH_C, and WEST, and write the result out to a CSV file, using the pipe character | as the delimiter.

An example output can be seen in the "example.csv" file.


In [14]:
data3 = pd.read_excel("2013_ERCOT_Hourly_Load_Data.xls")
data3.head()


Out[14]:
Hour_End COAST EAST FAR_WEST NORTH NORTH_C SOUTHERN SOUTH_C WEST ERCOT
0 2013-01-01 01:00:00 7606.263544 1073.892438 1411.750567 784.978166 10369.094390 2206.675077 4368.490945 882.931901 28704.077028
1 2013-01-01 02:00:00 7388.082714 1035.021938 1403.472287 776.307387 10152.358518 2159.733208 4233.587967 872.404750 28020.968769
2 2013-01-01 03:00:00 7178.867878 1036.088697 1395.053150 768.125748 9988.051418 2065.114706 4082.862860 868.853938 27383.018395
3 2013-01-01 04:00:00 7038.822581 1032.648841 1395.508820 770.937969 9946.658655 1990.903699 4010.489608 865.701201 27051.671374
4 2013-01-01 05:00:00 6990.857940 1042.823044 1401.216842 779.089313 10096.664190 1954.807585 4038.655997 879.924249 27184.039160

In [58]:
data3.apply(np.max)


Out[58]:
Hour_End    2013-11-01 00:00:00
COAST                     18779
EAST                    2380.17
FAR_WEST                2281.27
NORTH                   1544.77
NORTH_C                 24415.6
SOUTHERN                5494.16
SOUTH_C                 11433.3
WEST                    1862.61
ERCOT                   67595.8
dtype: object

In [60]:
data3[data3.columns[1:]].idxmax()


Out[60]:
COAST       5391
EAST        5199
FAR_WEST    4239
NORTH       5247
NORTH_C     5248
SOUTHERN    5270
SOUTH_C     5272
WEST        5247
ERCOT       5247
dtype: int64
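
The row labels above can be mapped to timestamps with a single label-based lookup; this is a compact alternative (a sketch) to the per-column loop defined further below.

In [ ]:
# Look up the Hour_End timestamp for each region's peak row in one step.
peak_rows = data3[data3.columns[1:]].idxmax()
data3.loc[peak_rows, "Hour_End"]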

In [90]:
data3["COAST"].max()


Out[90]:
18779.025510000003

In [89]:
data3["COAST"].idxmax()


Out[89]:
5391

In [107]:
data3["Hour_End"].iloc[5391]


Out[107]:
Timestamp('2013-08-13 17:00:00')

In [108]:
def get_maxload_per_station(data):
    """
    Retrieve the maximum load per station and the corresponding timestamp
    """
    # create empty list
    result = []
    
    # loop over columns
    for column in data.columns[1:]:
        
        # get max value and timestamp
        max_value = data[column].max()
        max_pos = data[column].idxmax()
        timestamp = data["Hour_End"].iloc[max_pos]
        
        # add values to list
        result.append([column, timestamp.year, timestamp.month, timestamp.day, timestamp.hour, max_value])
    
    # return result
    return pd.DataFrame(result, columns=["station", "year", "month", "day", "hour", "load"])

In [112]:
output = get_maxload_per_station(data3)
output.head()


Out[112]:
station year month day hour load
0 COAST 2013 8 13 17 18779.025510
1 EAST 2013 8 5 17 2380.165409
2 FAR_WEST 2013 6 26 17 2281.272214
3 NORTH 2013 8 7 17 1544.770714
4 NORTH_C 2013 8 7 18 24415.570227

In [115]:
# write file to local file system
#output.to_csv("output.csv", sep="|")
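
Note that the commented-out call above would also write the DataFrame's integer index as an extra first column. If example.csv contains only the data columns (an assumption; compare with the example file), the index can be dropped:

In [ ]:
# Write the pipe-delimited result without the integer index column.
output.to_csv("output.csv", sep="|", index=False)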

Handle JSON

This exercise shows some important concepts that you should be aware of:

  • using the codecs module to write Unicode files
  • using authentication with web APIs
  • using an offset when accessing web APIs

To run this code locally you have to register at the NYTimes developer site and get your own API key. You will be able to complete this exercise in our UI without doing so, as we have provided a sample result.

Your task is to process the saved file that represents the most popular articles (by view count) from the last day, and return the following data:

  • a list of dictionaries, where each dictionary maps an article's "section" to its "title"
  • a list of URLs for all media entries with "format": "Standard Thumbnail"

All your changes should be in the article_overview function. The rest of the functions are provided for your convenience, in case you want to access the API yourself.


In [ ]:
import json
import codecs
import requests

URL_MAIN = "http://api.nytimes.com/svc/"
URL_POPULAR = URL_MAIN + "mostpopular/v2/"
API_KEY = { "popular": "",
            "article": ""}


def get_from_file(kind, period):
    filename = "popular-{0}-{1}.json".format(kind, period)
    with open(filename, "r") as f:
        return json.loads(f.read())


def article_overview(kind, period):
    data = get_from_file(kind, period)
    titles = []
    urls = []
    
    for article in data:
        section = article["section"]
        title = article["title"]
        titles.append({section: title})
        if "media" in article:
            for m in article["media"]:
                for mm in m["media-metadata"]:
                    if mm["format"] == "Standard Thumbnail":
                        urls.append(mm["url"])

    return (titles, urls)


def query_site(url, target, offset):
    # This sets up the query with the API key and offset.
    # Web services often use an offset parameter to return data in small chunks:
    # NYTimes returns 20 articles per request, so if you want the next 20
    # you have to provide the offset parameter.
    if API_KEY["popular"] == "" or API_KEY["article"] == "":
        print("You need to register for a NYTimes Developer account to run this program.")
        print("See the Instructor notes for information.")
        return False
    params = {"api-key": API_KEY[target], "offset": offset}
    r = requests.get(url, params=params)

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def get_popular(url, kind, days, section="all-sections", offset=0):
    # This function constructs the query according to the requirements of the site
    # and returns the data, or prints an error message if called incorrectly.
    if days not in [1, 7, 30]:
        print("Time period can be 1, 7, or 30 days only")
        return False
    if kind not in ["viewed", "shared", "emailed"]:
        print("kind can only be one of viewed/shared/emailed")
        return False

    url += "most{0}/{1}/{2}.json".format(kind, section, days)
    data = query_site(url, "popular", offset)

    return data


def save_file(kind, period):
    # This processes all results by calling the API repeatedly with the supplied
    # offset value, combines the data, and then writes all results to a file.
    data = get_popular(URL_POPULAR, kind, period)
    num_results = data["num_results"]
    full_data = []
    with codecs.open("popular-{0}-{1}.json".format(kind, period), encoding='utf-8', mode='w') as v:
        for offset in range(0, num_results, 20):        
            data = get_popular(URL_POPULAR, kind, period, offset=offset)
            full_data += data["results"]
        
        v.write(json.dumps(full_data, indent=2))


def test():
    titles, urls = article_overview("viewed", 1)
    assert len(titles) == 20
    assert len(urls) == 30
    assert titles[2] == {'Opinion': 'Professors, We Need You!'}
    assert urls[20] == 'http://graphics8.nytimes.com/images/2014/02/17/sports/ICEDANCE/ICEDANCE-thumbStandard.jpg'
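
To exercise the code locally against the saved sample file, one might add the usual entry point (the expected values in test() come from the provided sample data):

In [ ]:
# Run the provided test; requires the saved popular-viewed-1.json sample file.
if __name__ == "__main__":
    test()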