Tools for NOAA TMAX Record Frequency Analysis


In [3]:
# <help>

In [1]:
# <api>
from IPython.display import display, Image
from IPython.html.widgets import interact_manual
import matplotlib.pyplot as plt
import os
import struct
import glob
import seaborn as sns
import pandas as pd
import datetime as dt

## Use this global variable to specify the path for station summary files.
NOAA_STATION_SUMMARY_PATH = "/resources/noaa-hdta/data/derived/15mar2015/summaries/"

## Build Station List
station_detail_colnames = ['StationID','State','Name',
                            'Latitude','Longitude','QueryTag']

station_detail_rec_template = {'StationID': "",
                                'State': "",
                                'Name': "",
                                'Latitude': "",
                                'Longitude': "",
                                'QueryTag': ""
                                }

STATION_DETAIL = '/resources/ghcnd-stations.txt'

def get_filename(pathname):
    '''Fetch filename portion of pathname.'''
    plist = pathname.split('/')
    fname, fext = os.path.splitext(plist[len(plist)-1])
    return fname

def fetch_station_list():
    station_list = []
    raw_files = os.path.join(NOAA_STATION_SUMMARY_PATH,'','*_sum.csv')
    for index, fname in enumerate(glob.glob(raw_files)):
        f = get_filename(fname).split('_')[0]
        station_list.append(str(f))
    return station_list

def process_station_detail(fname,stations): 
    '''Return dataframe of station detail.'''
    station_list = []
    with open(fname,'r') as f:
        lines = f.readlines()
        f.close()
        for line in lines:
            r = noaa_gather_station_detail(line,stations)
            station_list += r
    return pd.DataFrame(station_list,columns=station_detail_colnames)
      
def noaa_gather_station_detail(line,slist):
    '''Build a list of stattion tuples.'''
    station_tuple_list = []
    station_id_key = line[0:3]
    if station_id_key == 'USC' or station_id_key == 'USW': 
        fields = struct.unpack('12s9s10s7s2s30s', line[0:70])
        if fields[0].strip() in slist:
            station_tuple = dict(station_detail_rec_template)
            station_tuple['StationID'] = fields[0].strip()
            station_tuple['State'] = fields[4].strip()
            station_tuple['Name'] = fields[5].strip()
            station_tuple['Latitude'] = fields[1].strip()
            station_tuple['Longitude'] = fields[2].strip()
            qt = "{0} at {1} in {2}".format(fields[0].strip(),fields[5].strip(),fields[4].strip())
            station_tuple['QueryTag'] = qt
            station_tuple_list.append(station_tuple)
    return station_tuple_list

# Exploration Widget
month_abbrev = { 1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr',
                    5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug',
                    9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'
                }

def compute_years_of_station_data(df):
    yrs = dt.date.today().year-min(df['FirstYearOfRecord'])
    print("This weather station has been in service and collecting data for {0} years.").format(yrs)
    return
    
def compute_tmax_record_quantity(df,freq):
    threshold = int(freq)
    df_result = df.query('(TMaxRecordCount > @threshold)')
    return df_result
    
def fetch_station_data(stationid):
    fname = os.path.join(NOAA_STATION_SUMMARY_PATH,'',stationid+'_sum.csv')
    return pd.DataFrame.from_csv(fname)

def create_day_identifier(month,day):
    return str(day)+'-'+month_abbrev[int(month)]
    
def create_date_list(mlist,dlist):
    mv = mlist.values()
    dv = dlist.values()
    new_list = []
    for index, value in enumerate(mv):
        new_list.append(create_day_identifier(value,dv[index]))
    return new_list

def create_record_date_list(mlist,dlist,ylist):
    mv = mlist.values()
    dv = dlist.values()
    yv = ylist.values()
    new_list = []
    for index, value in enumerate(mv):      
        new_list.append(dt.date(yv[index],value,dv[index]))
    return new_list

def compute_max_record_durations(df):
    dates = create_date_list(df['Month'].to_dict(),df['Day'].to_dict())
    s_dates = pd.Series(dates)
    s_values = pd.Series(df['MaxDurTMaxRecord'].to_dict().values())
    df_new = pd.concat([s_dates, s_values], axis=1)
    df_new.columns = {"Duration","Date"}
    return df_new

def plot_tmax_record_results(df):
    dates = create_record_date_list(df['Month'].to_dict(),
                                    df['Day'].to_dict(),
                                    df['TMaxRecordYear'].to_dict()
                                   )
    s_dates = pd.Series(dates)
    s_tempvalues = pd.Series(df['TMax'].to_dict().values())
    df_new = pd.concat([s_dates,s_tempvalues], axis=1)
    df_new.columns = {"RecordDate","RecordHighTemp"}
    plt.figure(figsize = (9,9), dpi = 72)
    plt.xticks(rotation=90)
    sns.pointplot(df_new["RecordDate"],df_new["RecordHighTemp"])
    return df_new

def plot_duration_results(df):
    plt.figure(figsize = (9,9), dpi = 72)
    plt.xlabel('Day')
    plt.ylabel('Record Duration in Years')
    plt.title('Maximum Duration for TMax Records')
    ax = plt.gca()
    colors= ['r', 'b']
    df.plot(kind='bar',color=colors, alpha=0.75, ax=ax)
    ax.xaxis.set_ticklabels( ['%s'  % i for i in df.Date.values] )
    plt.grid(b=True, which='major', linewidth=1.0)
    plt.grid(b=True, which='minor')
    return

def explore_tmaxfreq(station,hirec):
    df_station_detail = fetch_station_data(station)
    df_station_address_detail = process_station_detail(STATION_DETAIL,fetch_station_list())
    df_station_name = df_station_address_detail.query("(StationID == @station)")
    qt = df_station_name.iloc[0]["QueryTag"]
    print("Historical high temperature record analysis for weather station {0}.").format(qt)
    display(df_station_name)
    print("Station detail, quick glimpse.")
    display(df_station_detail.head())    
    compute_years_of_station_data(df_station_detail)
    df_record_days = compute_tmax_record_quantity(df_station_detail,hirec)
    if not df_record_days.empty:
        print("This station has experienced {0} days of new record highs where a new record has been set more than {1} times throughout the operation of the station.").format(len(df_record_days),hirec)
        display(df_record_days.head(10))
        print("Displayed above are the details for up to the first 10 new record high events. All records are ploted below.")
        df_rec_results = plot_tmax_record_results(df_record_days)
        df_durations = compute_max_record_durations(df_record_days)
        plot_duration_results(df_durations)
    else:
        print("This weather station has not experienced any days with greater than {0} new record highs.").format(hirec)
    return 
        
def noaaquery(renderer=lambda station,hirec : explore_tmaxfreq(station,hirec)):
    '''
    Creates an interactive query widget with an optional custom renderer.
    
    station: Weather Station ID
    hirec: Query indicator for the maximum number of TMAX records for a given day.
    '''
    df_station_detail = process_station_detail(STATION_DETAIL,fetch_station_list())
    station_vals = tuple(df_station_detail.StationID)
    hirec_vals = tuple(map(str, range(1,51)))

    @interact_manual(station=station_vals, hirec=hirec_vals)
    def noaaquery(station, hirec):
        '''Inner function that gets called when the user interacts with the widgets.'''
        try:
            station_id = station.strip()
            high_rec_freq = hirec
        except Exception as e:
            print("Widget Error: {0}").format(e.message)
        renderer(station_id, high_rec_freq)


:0: FutureWarning: IPython widgets are experimental and may change in the future.