In [1]:
import pandas as pd
import numpy as np
from pandas.io import json
import requests
import os
import sys
import string
In [2]:
# NOAA CDO API token (request one at https://www.ncdc.noaa.gov/cdo-web/token).
# Read it from the NOAA_TOKEN environment variable when set, so the secret is
# never hardcoded in the notebook; fall back to the original placeholder.
NOAA_Token_Here= os.environ.get('NOAA_TOKEN', 'enter as string')
Experiment with some basic query functions, adapted from the tide-data functions
In [5]:
def query_builder(start_dt, end_dt, station, offset= 1):
    """Assemble an NCDC daily-summaries (GHCND) API query.

    Parameters
    ----------
    start_dt, end_dt : str
        Start/end dates as 'YYYY-MM-DD' strings (the format every call in
        this notebook uses) spanning <= 1 year.
    station : str
        NOAA station ID, e.g. 'GHCND:USW00023174'.
    offset : int, optional
        Record offset for paging (each response is capped at `limit`
        records); defaults to 1, i.e. the first record.

    Returns
    -------
    (str, dict)
        The query URL against the NCDC endpoint
        'http://www.ncdc.noaa.gov/cdo-web/api/v2/data?' and the parameter
        dict (query_dict) used to build it.
    """
    # urlencode/unquote moved into urllib.parse in Python 3; keep a
    # fallback so the function still runs on Python 2.
    try:
        from urllib.parse import urlencode, unquote
    except ImportError:  # Python 2
        from urllib import urlencode, unquote
    # API endpoint
    base_url= 'http://www.ncdc.noaa.gov/cdo-web/api/v2/data?'
    # dict of NOAA query parameters/arguments
    query_dict = dict(startdate= start_dt, enddate= end_dt, stationid= station,
                      offset= offset, datasetid= 'GHCND', limit= 1000)
    # percent-encode the arguments, then decode reserved characters back to
    # a readable utf8 string (original behavior)
    query= unquote(base_url + urlencode(query_dict))
    return query, query_dict
In [6]:
# Build the first-page query for station USW00023174 (LAX) over calendar 2014.
query_1, query_dict= query_builder('2014-01-01', '2015-01-01', station= 'GHCND:USW00023174')
print(query_1)
In [7]:
# Same query but offset past the first 1000 records (the per-call limit above).
query_2, query_dict= query_builder('2014-01-01', '2015-01-01', station= 'GHCND:USW00023174', offset= 1001)
print(query_2)
In [8]:
def offsetter(response):
    """
    Compute the paging offsets needed to pull an entire NOAA query set.

    Parameters
    ----------
    response : dict
        A RESTful query response (parsed JSON) whose
        response['metadata']['resultset']['count'] holds the total
        record count for the query.

    Returns
    -------
    (dict, int)
        A dict mapping page number -> record offset for every page AFTER
        the first (the first call, offset 1, has already been made to
        obtain the metadata), plus the total record count for use in
        validation. The API limits each page to 1000 records.
    """
    import math
    count= response['metadata']['resultset']['count']
    # number of 1000-record pages needed to cover `count` records
    pages= int(math.ceil(count/1000.))
    # page 1 is already retrieved; page i starts at record 1000*i + 1
    offset_dict= {page: 1000*page + 1 for page in range(1, pages)}
    return offset_dict, count # for quality control
In [13]:
def execute_query(query, timeout= 60):
    """
    Execute an NOAA query for daily summaries and return the decoded body.

    Parameters
    ----------
    query : str
        A fully assembled NCDC API query URL (see query_builder).
    timeout : float, optional
        Seconds to wait for the server (default 60); previously the
        request could hang indefinitely.

    Returns
    -------
    dict
        The JSON response payload.

    Raises
    ------
    requests.HTTPError
        If the API responds with an error status, instead of silently
        json-decoding an error page.
    """
    # token is issued per user: https://www.ncdc.noaa.gov/cdo-web/token
    headers = {'token': NOAA_Token_Here}
    response = requests.get(query, headers = headers, timeout = timeout)
    # fail loudly on HTTP errors (bad token, rate limit, bad params)
    response.raise_for_status()
    return response.json()
In [14]:
# Pull the 'results' payload for each page; requires a valid NOAA token.
working_1= execute_query(query_1)['results']
working_2 = execute_query(query_2)['results']
In [12]:
def extract_results(response):
    """
    Pull the 'results' records out of a NOAA query response (parsed JSON).

    Returns the list of result records together with its length; the
    length is kept for validating that every row was retrieved.
    """
    records= response['results']
    # length doubles as a quality-control row count
    return records, len(records)
In [13]:
def collator(results):
    """
    Reshape the 'results' records of an NOAA query response (JSON) into a
    tidy PANDAS DataFrame: one row per day, one column per datatype.
    """
    # drop bookkeeping fields, then widen datatype/value pairs into columns
    tidy= (pd.DataFrame(results)
           .drop(['attributes','station'], axis=1)
           .pivot(index= 'date', columns= 'datatype', values= 'value')
           .reset_index())
    return tidy
In [20]:
def get_ncdc(start_dt, end_dt, station):
"""
Function accepts a start date and an end date — the calls in this
notebook pass 'YYYY-MM-DD' strings, despite the original MM-DD-YYYY
note — and a NOAA station ID. Date limit is 1 year.
Function returns a tidy dataset in a PANDAS DataFrame where
each row represents an observation about a day, a record count
and a query parameters dictionary.
"""
# count for verifying retrieval of all rows
record_count= 0
# initial query
query, query_dict= query_builder(start_dt, end_dt, station)
response= execute_query(query)
# extract results and count
results, length= extract_results(response)
record_count += length
# get offsets for remaining queries
off_d, count= offsetter(response)
# execute remaining queries and operations
for offset in off_d:
# page `offset` starts at record off_d[offset]
query, _= query_builder(start_dt, end_dt, station, off_d[offset])
print(query)
response= execute_query(query)
next_results, next_length= extract_results(response)
record_count += next_length
# concat results lists
results += next_results
# every record advertised by the API metadata must have been fetched
assert record_count == count, 'record count != count'
collated_data= collator(results)
return collated_data, record_count, query_dict
In [21]:
# Smoke test: pull one full year for LAX and sanity-check the result.
test, qc, params = get_ncdc('2014-01-01', '2014-12-31', station= 'GHCND:USW00023174')
In [16]:
# first few observation dates
test.date.head()
Out[16]:
In [17]:
# last few observation dates
test.date.tail()
Out[17]:
In [18]:
test.info()
In [19]:
# verify no rows are missing a date
test[test.date.isnull()]
Out[19]:
In [ ]:
# Pull three consecutive station-years (the API limits each query to 1 year).
y1, qc, params = get_ncdc('2014-05-03', '2015-05-02', station= 'GHCND:USW00023174')
y2, qc, params = get_ncdc('2015-05-03', '2016-05-02', station= 'GHCND:USW00023174')
y3, qc, params = get_ncdc('2016-05-03', '2017-05-02', station= 'GHCND:USW00023174')
In [ ]:
y1.info()
In [ ]:
# stack the three years into a single frame
years= pd.concat([y1, y2, y3])
In [ ]:
years.date.head()
In [ ]:
years.date.tail()
In [ ]:
# export, dropping the meaningless integer index
years.to_csv('LAX_3years.csv', index= False)
In [84]:
def gen_csv(df, query_dict):
    """
    Export a DataFrame to a CSV named after its query parameters.

    Parameters
    ----------
    df : pandas.DataFrame
        The tidy weather data to export.
    query_dict : dict
        Query parameters; 'stationid', 'startdate' and 'enddate' are used
        to build the file name.

    Returns
    -------
    (None, str)
        df.to_csv(path) returns None when given a path, so the first
        element is always None (kept for backward compatibility with
        existing callers); the second element is the generated file path.
    """
    # extract params
    station= query_dict['stationid']
    start= query_dict['startdate']
    end= query_dict['enddate']
    # build the file name; ':' is a problem character in file names, so the
    # one in the station ID (e.g. 'GHCND:USW00023174') becomes '_'.
    # (original called os.path.join on a single pre-concatenated string,
    # which is a no-op; plain formatting is clearer)
    filename= '{}_{}_{}.csv'.format(station, start, end).replace(':', '_')
    # using os.path in case of future expansion to other directories
    path= os.path.join(filename)
    # export to csv, dropping the integer index
    my_csv= df.to_csv(path, index= False)
    return my_csv, path
In [85]:
# write the collated year of data to disk; `stuff` is None (to_csv side effect)
stuff, path= gen_csv(test, query_dict)
In [86]:
path
Out[86]:
In [87]:
# IPython magic: confirm the CSV landed in the working directory
ls *csv
In [18]:
#!/usr/bin/env python
# coding: utf-8
"""Python code for querying NOAA daily summary weather and returning a CSV per year
for a specific station. Code is intended to be executed from CLI."""
import sys
# set path to tools library and import
sys.path.append(r'noaa_weather_tools')
import noaa_weather_tools
# NOTE(review): hardcoded placeholder token — the real token should come from
# the environment or a config file, never from source
NOAA_Token_Here= 'enter token as string'
print("Check dt format('DD-MM-YYYY', and whether dates span <= 1 year from a current or past date")
print("If dates exceed one year, NCDC query returns a null object")
print("Need a token take a token, have a token, keep it to yourself @ https://www.ncdc.noaa.gov/cdo-web/token")
# expects the start and end dates as the first two CLI arguments
print('start_dt: {}\n end_dt: {}'.format(sys.argv[1], sys.argv[2]))
def noaa_dailysum_weather_processor(start_dt, end_dt, station):
    """Fetch NOAA (NCDC) Daily Summary data for one station and write a CSV.

    Accepts a station ID and beginning/end datetimes as strings formatted
    'MM-DD-YYYY', spanning <= 1 year from a current or past date; the
    fetching and export are delegated to noaa_weather_tools.
    """
    print('.' * 15 + "reticulating splines" + '.' * 5 + "getting records")
    daily_df, _record_count, params= noaa_weather_tools.get_ncdc(start_dt, end_dt, station)
    print('.' * 15 + "exporting to csv")
    csv_result, _path= noaa_weather_tools.gen_csv(daily_df, params)
    print("spines reticulated")
    return csv_result
In [15]:
# end-to-end run: one station-year fetched and written to CSV
noaa_dailysum_weather_processor('2014-05-03', '2015-05-02', station= 'GHCND:USW00023174')
In [13]:
# IPython magic: list the CSVs produced so far
ls *csv
# NOTE(review): this redefines the earlier `collator` with a DIFFERENT
# signature — it takes the full response (not just its 'results') and
# returns (DataFrame, length) instead of a DataFrame. Whichever cell runs
# last silently shadows the other definition in the kernel namespace.
def collator(response):
"""
Flatten the 'results' of a NOAA response into a tidy DataFrame
(one row per day, one column per datatype) and return it together
with the raw record count.
"""
data= pd.DataFrame(response['results'])
# for quality control to verify retrieval of all rows
length= len(data)
data= data.drop(['attributes','station'], axis=1)
# widen datatype/value pairs into one column per datatype
data= data.pivot(index= 'date',columns= 'datatype', values= 'value').reset_index()
return data, length
def get_ncdc(start_dt, end_dt, station):
    """
    Fetch every page of NOAA daily-summary data for one station/date span.

    NOTE(review): this redefines the earlier `get_ncdc` with a narrower
    return — (DataFrame, row_count), no query-parameters dict — and
    whichever cell runs last shadows the other definition.

    Parameters
    ----------
    start_dt, end_dt : str
        Date strings spanning <= 1 year (see query_builder).
    station : str
        NOAA station ID.

    Returns
    -------
    (pandas.DataFrame, int)
        The stacked per-page frames and the total row count, asserted
        against the count advertised by the API metadata.
    """
    # count for verifying retrieval of all rows
    row_count= 0
    # initial query
    query, query_dict= query_builder(start_dt, end_dt, station)
    response= execute_query(query)
    # collate and count the first page
    first_page, length= collator(response)
    row_count += length
    # get offsets for remaining queries
    off_d, count= offsetter(response)
    # collect per-page frames and concatenate ONCE at the end instead of
    # re-concatenating inside the loop (which is quadratic in page count);
    # the resulting frame is identical
    pages= [first_page]
    for offset in off_d:
        query, _= query_builder(start_dt, end_dt, station, off_d[offset])
        print(query)
        response= execute_query(query)
        next_data, next_length= collator(response)
        row_count += next_length
        pages.append(next_data)
    collated_data= pd.concat(pages)
    assert row_count == count, 'row count != count'
    return collated_data, row_count