In [1]:
import pandas as pd
import numpy as np
from pandas.io import json
import requests
import os
import sys
import string

In [2]:
# NOAA CDO API token (request one at https://www.ncdc.noaa.gov/cdo-web/token).
# Read it from the NOAA_TOKEN environment variable when set so the real token
# never has to be saved in the notebook; otherwise fall back to the placeholder.
NOAA_Token_Here= os.environ.get('NOAA_TOKEN', 'enter as string')

Experiment with some basic helper functions, adapted from functions originally written for tide data.

Query Builder


In [5]:
def query_builder(start_dt, end_dt, station, offset= 1):

    """Build an NCDC daily-summaries (GHCND) query URL.

    Function accepts: a start and end date string in the form 'YYYY-MM-DD'
    which are <= 1 year apart, a station ID (e.g. 'GHCND:USW00023174'), and a
    record offset used for paging (defaults to 1, the first record).
    Function assembles a query parameters/arguments dict and returns an API query and the
    query dictionary (query_dict). The relevant base URL is the NCDC endpoint
    'http://www.ncdc.noaa.gov/cdo-web/api/v2/data?'. Each query is limited to
    1000 records."""

    # urlencode/unquote live in urllib.parse on Python 3 and directly in
    # urllib on Python 2; support both so the notebook runs on either.
    try:
        from urllib.parse import urlencode, unquote
    except ImportError:
        from urllib import urlencode, unquote

    # API endpoint
    base_url= 'http://www.ncdc.noaa.gov/cdo-web/api/v2/data?'

    # dict of NOAA query parameters/arguments
    query_dict = dict(startdate= start_dt, enddate= end_dt, stationid= station,
                      offset= offset, datasetid= 'GHCND', limit= 1000)

    # encode arguments into key=value pairs joined by '&'
    encoded_args = urlencode(query_dict)

    # decode url % escapes (e.g. restore the ':' in the station ID)
    query= unquote(base_url + encoded_args)

    return query, query_dict

In [6]:
# Build the first-page query (default offset of 1) for station GHCND:USW00023174
# and show the resulting URL.
query_1, query_dict= query_builder('2014-01-01', '2015-01-01', station= 'GHCND:USW00023174')
print(query_1)


http://www.ncdc.noaa.gov/cdo-web/api/v2/data?startdate=2014-01-01&stationid=GHCND:USW00023174&enddate=2015-01-01&offset=1&limit=1000&datasetid=GHCND

In [7]:
# Same request starting at record 1001, i.e. the next page of 1000 records.
# Note that this rebinds the global query_dict to the offset=1001 parameters.
query_2, query_dict= query_builder('2014-01-01', '2015-01-01', station= 'GHCND:USW00023174', offset= 1001)
print(query_2)


http://www.ncdc.noaa.gov/cdo-web/api/v2/data?startdate=2014-01-01&stationid=GHCND:USW00023174&enddate=2015-01-01&offset=1001&limit=1000&datasetid=GHCND

Offset Generator


In [8]:
def offsetter(response):

    """
    Work out the paging offsets still needed to pull a complete result set.

    Function accepts a decoded restful query response (JSON) and reads the
    total record count from response['metadata']['resultset']['count'].
    Function returns a tuple (offset_dict, count): offset_dict maps each page
    number after the first (1, 2, ...) to its record offset (1001, 2001, ...),
    at 1000 records per query; count is the total, for use in validation.
    """

    import math

    count= response['metadata']['resultset']['count']

    # number of 1000-record pages needed to cover every record
    pages= int(math.ceil(count/1000.))

    # page 0 (offset 1) is skipped: the call that produced `response` already fetched it
    offset_dict= {page: page * 1000 + 1 for page in range(1, pages)}

    return offset_dict, count # for quality control

Query Generator

TODO

  • refactor with a decorator
  • make key an attribute that can be hidden

In [13]:
def execute_query(query, timeout= 30):

    """
    Function accepts an NOAA query for daily summaries for a specific location
    and executes the query.

    Arguments:
        query: the full request URL (as built by query_builder).
        timeout: seconds to wait for the server before giving up (default 30),
            so a stalled connection cannot hang the notebook indefinitely.

    Function returns the decoded response (JSON).
    Raises requests.exceptions.HTTPError when the server answers with an
    error status (e.g. a missing/invalid token), instead of handing back an
    error payload that only fails later with a KeyError on 'results'.
    """
    # the token is sent as a request header; get one at
    # https://www.ncdc.noaa.gov/cdo-web/token and set NOAA_Token_Here to it
    headers = {'token': NOAA_Token_Here}
    response = requests.get(query, headers = headers, timeout = timeout)

    # fail loudly on 4xx/5xx before trying to decode the body
    response.raise_for_status()

    return response.json()

In [14]:
# Fetch the records for the two demo queries. This cell depends on query_1 and
# query_2 from the query_builder demo cells; running it before they are
# defined raises NameError.
working_1= execute_query(query_1)['results']
working_2 = execute_query(query_2)['results']


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-09badf1aeec9> in <module>()
----> 1 working_1= execute_query(query_1)['results']
      2 working_2 = execute_query(query_2)['results']

NameError: name 'query_1' is not defined

Extract Results


In [12]:
def extract_results(response):

    """
    Pull the records out of a NOAA query response (JSON).

    Returns a tuple of the value stored under the 'results' key and the
    number of records it holds (used to verify that all rows were retrieved).
    """
    records= response['results']

    return records, len(records)

In [13]:
def collator(results):

    """
    Reshape the results key of an NOAA query response (JSON) into a tidy
    PANDAS DataFrame.

    The 'attributes' and 'station' fields are dropped, then the long records
    are pivoted so that each row is one date and each datatype becomes its own
    column holding that record's value; 'date' is returned as a regular column.
    """

    tidy= (
        pd.DataFrame(results)
        .drop(['attributes','station'], axis=1)
        .pivot(index= 'date', columns= 'datatype', values= 'value')
        .reset_index()
    )

    return tidy

In [20]:
def get_ncdc(start_dt, end_dt, station):

    """
    Fetch every daily-summary record for one station and date range.

    Function accepts a start date ('YYYY-MM-DD'), an end date ('YYYY-MM-DD')
    and a NOAA station ID.  Date limit is 1 year.
    Function returns a tuple of: a tidy dataset in a PANDAS DataFrame where
    each row represents an observation about a day, a record count,
    and the query parameters dictionary of the initial (first-page) query.

    Each follow-up query URL is printed as it is executed.
    Raises AssertionError if the number of records retrieved differs from the
    count reported by the API.
    """

    # initial query
    query, query_dict= query_builder(start_dt, end_dt, station)
    response= execute_query(query)

    # extract results and start the count used to verify retrieval of all rows
    first_results, record_count= extract_results(response)
    # copy, so that accumulating later pages does not mutate the response's own list
    results= list(first_results)

    # get offsets for remaining queries
    off_d, count= offsetter(response)

    # execute remaining queries in page order (plain dict iteration order is
    # not guaranteed on older Python versions)
    for page in sorted(off_d):
        query, _= query_builder(start_dt, end_dt, station, off_d[page])
        print(query)
        response= execute_query(query)
        next_results, next_length= extract_results(response)

        record_count += next_length
        results.extend(next_results)

    # explicit raise rather than `assert`, so the check survives `python -O`
    if record_count != count:
        raise AssertionError('record count != count')

    collated_data= collator(results)

    return collated_data, record_count, query_dict

In [21]:
# Fetch calendar year 2014 for station GHCND:USW00023174: tidy DataFrame (test),
# retrieved record count (qc) and the query parameters dict (params).
test, qc, params = get_ncdc('2014-01-01', '2014-12-31', station= 'GHCND:USW00023174')


http://www.ncdc.noaa.gov/cdo-web/api/v2/data?startdate=2014-01-01&stationid=GHCND:USW00023174&enddate=2014-12-31&offset=1001&limit=1000&datasetid=GHCND
http://www.ncdc.noaa.gov/cdo-web/api/v2/data?startdate=2014-01-01&stationid=GHCND:USW00023174&enddate=2014-12-31&offset=2001&limit=1000&datasetid=GHCND
http://www.ncdc.noaa.gov/cdo-web/api/v2/data?startdate=2014-01-01&stationid=GHCND:USW00023174&enddate=2014-12-31&offset=3001&limit=1000&datasetid=GHCND
http://www.ncdc.noaa.gov/cdo-web/api/v2/data?startdate=2014-01-01&stationid=GHCND:USW00023174&enddate=2014-12-31&offset=4001&limit=1000&datasetid=GHCND

In [16]:
# First few dates, to check the data starts at the requested start date
test.date.head()


Out[16]:
0    2014-01-01T00:00:00
1    2014-01-02T00:00:00
2    2014-01-03T00:00:00
3    2014-01-04T00:00:00
4    2014-01-05T00:00:00
Name: date, dtype: object

In [17]:
# Last few dates, to check the data runs through the requested end date
test.date.tail()


Out[17]:
360    2014-12-27T00:00:00
361    2014-12-28T00:00:00
362    2014-12-29T00:00:00
363    2014-12-30T00:00:00
364    2014-12-31T00:00:00
Name: date, dtype: object

In [18]:
# Column dtypes and non-null counts for each datatype column
test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 16 columns):
date    365 non-null object
AWND    365 non-null float64
PRCP    365 non-null float64
SNOW    357 non-null float64
SNWD    360 non-null float64
TAVG    365 non-null float64
TMAX    365 non-null float64
TMIN    365 non-null float64
WDF2    365 non-null float64
WDF5    354 non-null float64
WSF2    365 non-null float64
WSF5    354 non-null float64
WT01    94 non-null float64
WT02    5 non-null float64
WT03    1 non-null float64
WT08    37 non-null float64
dtypes: float64(15), object(1)
memory usage: 45.7+ KB

In [19]:
# Rows whose date is missing; an empty frame means every row has a date
test[test.date.isnull()]


Out[19]:
datatype date AWND PRCP SNOW SNWD TAVG TMAX TMIN WDF2 WDF5 WSF2 WSF5 WT01 WT02 WT03 WT08

In [ ]:
# Three consecutive one-year windows for the same station, one call per window
# (each call stays within get_ncdc's 1-year date limit).
# Each call rebinds qc and params, so afterwards they describe only the y3 request.
y1, qc, params = get_ncdc('2014-05-03', '2015-05-02', station= 'GHCND:USW00023174')
y2, qc, params = get_ncdc('2015-05-03', '2016-05-02', station= 'GHCND:USW00023174')
y3, qc, params = get_ncdc('2016-05-03', '2017-05-02', station= 'GHCND:USW00023174')

In [ ]:
# Column dtypes and non-null counts for the first yearly frame
y1.info()

In [ ]:
# Stack the three yearly frames row-wise into a single frame
years= pd.concat([y1, y2, y3])

In [ ]:
# First few dates of the combined frame
years.date.head()

In [ ]:
# Last few dates of the combined frame
years.date.tail()

In [ ]:
# Write the combined frame to CSV in the working directory, without the index column
years.to_csv('LAX_3years.csv', index= False)

CSV Generator


In [84]:
def gen_csv(df, query_dict, out_dir= None):
    """
    Export a DataFrame to a CSV file named after the query parameters.

    Arguments:
        df: PANDAS DataFrame to export.
        query_dict: a query parameters dictionary; must contain the keys
            'stationid', 'startdate' and 'enddate'.
        out_dir: optional directory to write into (created if missing).
            When omitted the file is written to the working directory and the
            returned path is the bare file name.
    Returns: a tuple (my_csv, path). my_csv is whatever df.to_csv returns
        (None when pandas writes to a path); path is where the CSV of the df
        (with dropped index) was written, named '<station>_<start>_<end>.csv'.
    """

    # extract params
    station= query_dict['stationid']
    start= query_dict['startdate']
    end= query_dict['enddate']

    filename= station + '_' + start + '_' + end + '.' + 'csv'

    # replace characters that are not allowed in file names on Windows (the
    # ':' in station IDs such as 'GHCND:USW00023174' is the common case)
    for bad_char in '<>:"/\\|?*':
        filename= filename.replace(bad_char, "_")

    if out_dir:
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        path= os.path.join(out_dir, filename)
    else:
        path= filename

    # export to csv
    my_csv= df.to_csv(path, index= False)

    return my_csv, path

In [85]:
# Export the 2014 frame to CSV.
# NOTE(review): query_dict here is the global left over from the query_builder
# demo cells (enddate '2015-01-01'), not the parameters returned alongside
# `test` (enddate '2014-12-31'), so the end date in the file name does not
# match the data that was fetched -- confirm which dict was intended.
stuff, path= gen_csv(test, query_dict)

In [86]:
# Show the file name that gen_csv generated
path


Out[86]:
'GHCND_USW00023174_2014-01-01_2015-01-01.csv'

In [87]:
ls *csv


 Volume in drive C has no label.
 Volume Serial Number is CE97-BE73

 Directory of C:\Users\Andrew\Documents\noaa_requests

05/12/2017  12:35 AM            30,420 GHCND_USW00023174_2014-01-01_2015-01-01.csv
               1 File(s)         30,420 bytes
               0 Dir(s)  685,094,088,704 bytes free

In [18]:
#!/usr/bin/env python
# coding: utf-8


"""Python code for querying NOAA daily summary weather and returning a CSV per year
for a specific station.  Code is intended to be executed from CLI."""

import sys

# set path to tools library and import
sys.path.append(r'noaa_weather_tools')
import noaa_weather_tools

# NOTE(review): this binds the token in this script's namespace only; whether
# noaa_weather_tools picks it up depends on how that module reads its token -- confirm.
NOAA_Token_Here= 'enter token as string'

print("Check dt format ('YYYY-MM-DD'), and whether dates span <= 1 year from a current or past date")
print("If dates exceed one year, NCDC query returns a null object")
print("Need a token take a token, have a token, keep it to yourself @ https://www.ncdc.noaa.gov/cdo-web/token")
# Only echo the dates when both were supplied on the command line; indexing
# sys.argv unconditionally raises IndexError when they are missing.
if len(sys.argv) > 2:
    print('start_dt: {}\n end_dt: {}'.format(sys.argv[1], sys.argv[2]))


def noaa_dailysum_weather_processor(start_dt, end_dt, station):

    """Create a .csv file of NOAA (NCDC) Daily Summary data for a specific station.

    Function accepts beginning/end dates as strings and a station ID and passes
    them to noaa_weather_tools.get_ncdc; the example call in this notebook uses
    the 'YYYY-MM-DD' date format, with dates spanning <= 1 year. The resulting
    DataFrame and query parameters are handed to noaa_weather_tools.gen_csv.
    Progress messages are printed along the way.
    Function returns the first item returned by gen_csv."""

    leader= '.' * 15

    print(leader + "reticulating splines" + '.' * 5 + "getting records")
    df, _record_count, query_parameters= noaa_weather_tools.get_ncdc(start_dt, end_dt, station)

    print(leader + "exporting to csv")
    exported= noaa_weather_tools.gen_csv(df, query_parameters)
    my_csv, _my_path= exported

    print("spines reticulated")
    return my_csv


Check dt format('DD-MM-YYYY', and whether dates span <= 1 year from a current or past date
If dates exceed one year, NCDC query returns a null object
Need a token take a token, have a token, keep it to yourself @ https://www.ncdc.noaa.gov/cdo-web/token
start_dt: -f
 end_dt: C:\Users\vhim98198\AppData\Roaming\jupyter\runtime\kernel-9420aae1-29a1-4c51-ae89-41b7fd679e89.json

In [15]:
# Run the full fetch-and-export pipeline for one year at station GHCND:USW00023174
noaa_dailysum_weather_processor('2014-05-03', '2015-05-02', station= 'GHCND:USW00023174')


...............reticulating splines.....getting records
http://www.ncdc.noaa.gov/cdo-web/api/v2/data?startdate=2014-05-03&stationid=GHCND:USW00023174&enddate=2015-05-02&offset=1001&limit=1000&datasetid=GHCND
http://www.ncdc.noaa.gov/cdo-web/api/v2/data?startdate=2014-05-03&stationid=GHCND:USW00023174&enddate=2015-05-02&offset=2001&limit=1000&datasetid=GHCND
http://www.ncdc.noaa.gov/cdo-web/api/v2/data?startdate=2014-05-03&stationid=GHCND:USW00023174&enddate=2015-05-02&offset=3001&limit=1000&datasetid=GHCND
http://www.ncdc.noaa.gov/cdo-web/api/v2/data?startdate=2014-05-03&stationid=GHCND:USW00023174&enddate=2015-05-02&offset=4001&limit=1000&datasetid=GHCND
...............exporting to csv
spines reticulated

In [13]:
ls *csv


 Volume in drive C is Acer
 Volume Serial Number is 3829-CAE6

 Directory of C:\Users\vhim98198\Documents\noaa_requests

File Not Found

Discarded Functions

def collator(response):
    """
    Discarded draft: tidy one NOAA query response (JSON) on its own.

    Returns a tuple of the pivoted DataFrame (one row per date, one column
    per datatype) and the number of raw records in response['results'].
    """

    raw= pd.DataFrame(response['results'])
    # for quality control to verify retrieval of all rows
    length= len(raw)

    tidy= (
        raw.drop(['attributes','station'], axis=1)
        .pivot(index= 'date', columns= 'datatype', values= 'value')
        .reset_index()
    )

    return tidy, length

def get_ncdc(start_dt, end_dt, station):
    """
    Discarded draft: fetch all pages for a station/date range, collating each
    page separately and stacking the resulting DataFrames.

    Returns a tuple of the stacked DataFrame and the number of rows retrieved.
    Each follow-up query URL is printed as it is executed; an AssertionError
    is raised when the retrieved row count differs from the reported count.
    """

    # initial query
    query, query_dict= query_builder(start_dt, end_dt, station)
    first_response= execute_query(query)

    # collate the first page; its length seeds the row count used for verification
    collated_data, row_count= collator(first_response)

    # get offsets for remaining queries
    off_d, count= offsetter(first_response)

    # execute remaining queries and operations
    for next_offset in off_d.values():
        query, _= query_builder(start_dt, end_dt, station, next_offset)
        print(query)
        page_data, page_length= collator(execute_query(query))

        row_count= row_count + page_length

        # stack DataFrames
        collated_data= pd.concat([collated_data, page_data])

    assert row_count == count, 'row count != count'

    return collated_data, row_count