In [1]:
import pandas as pd
import datetime
import os

DATA_DIR = '../data/raw/'
#The files are given as fixed-width files.
#See Documentation in ../data/

#All columns in the WY2 files.  Most of this information isn't useful to me
#WY2_colspecs = [(0, 5), (5, 6), (6, 16), (16, 20), (20, 24),
#                (24, 26), (26, 30), (30, 32), (32, 36), (36, 38),
#                (38, 42), (42, 43), (43, 47), (47, 48), (48, 52),
#                (52, 53), (53, 57), (57, 58), (58, 60), (60, 61),
#                (61, 65), (65, 66), (66, 70), (70, 71), (71, 75),
#                (75, 76), (76, 84), (84, 85), (85, 90), (90, 91),
#                (91, 92), (92, 95), (95, 96), (96, 100), (100, 101),
#                (101, 104), (104, 105), (105, 109), (109, 110),
#                (110, 112), (112, 113), (113, 115), (115, 116),
#                (116, 117), (117, 118)]

There is a metadata file that contains the latitude/longitude coordinates of the stations, identified by their WBAN number.

Further, the time recordings at each station are given in "local standard time" (LST), and the mlong column (the station's standard meridian, in degrees west) provides a means to convert to UTC (universal time): LST = UTC - mlong/15, or UTC = LST + mlong/15 (all units in hours). I will convert all of the times to UTC.
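As a quick check of that arithmetic (a minimal illustration using the datetime module imported above; the 105° meridian matches the Alberta stations in the table below, and the date is arbitrary):

#A station on the 105 degree (west) meridian is 105/15 = 7 hours behind UTC,
#so adding mlong/15 hours to an LST timestamp recovers the UTC time.
lst = datetime.datetime(1953, 1, 1, 0)           #midnight, local standard time
utc = lst + datetime.timedelta(hours=105 / 15)   #1953-01-01 07:00:00 UTC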


In [2]:
col_names = ['Name',  'WBAN',   'lat',    'long',  'mlong', 'first_year', 'last_year']
loc_cols =  [(0,24), (24, 30), (45, 52), (52, 58), (59, 65),  (74, 76),     (77, 79)]

In [3]:
D_loc = pd.read_fwf(DATA_DIR + 'locations.txt', colspecs = loc_cols, comment = '#',
                    header = None, names = col_names)

In [4]:
len(D_loc) #Total number of stations (one time series record each).


Out[4]:
235

In [5]:
D_loc.head()


Out[5]:
Name WBAN lat long mlong first_year last_year
0 CALGARY INT'L. A 25110 51.10 114.02 105.0 53 5
1 COLD LAKE A 25129 54.42 110.28 105.0 54 5
2 CORONATION 25113 52.10 111.45 105.0 53 94
3 COWLEY A CAN43 49.63 114.08 105.0 53 59
4 EDMONTON INT'L. A 25142 53.32 113.58 105.0 61 5

In [6]:
#The year is specified with only the last 2 digits,
#but data collection started after 1950 and ended before 2050
def fix_year(yr : int):
    if yr > 50:
        yr += 1900
    else:
        yr += 2000
    return yr

In [7]:
D_loc.loc[:, ['first_year', 'last_year']] = D_loc.loc[:, ['first_year', 'last_year']].applymap(fix_year)

In [8]:
D_loc.head()


Out[8]:
Name WBAN lat long mlong first_year last_year
0 CALGARY INT'L. A 25110 51.10 114.02 105.0 1953 2005
1 COLD LAKE A 25129 54.42 110.28 105.0 1954 2005
2 CORONATION 25113 52.10 111.45 105.0 1953 1994
3 COWLEY A CAN43 49.63 114.08 105.0 1953 1959
4 EDMONTON INT'L. A 25142 53.32 113.58 105.0 1961 2005

In [9]:
def time_correction(mlong : float):
    '''The time delta to add to an LST time to yield a UTC time,
    given the station's standard meridian mlong in degrees (west).'''
    return datetime.timedelta(hours = mlong / 15)

In [10]:
D_loc['time_correction'] = D_loc.loc[:, 'mlong'].apply(time_correction)

In [11]:
D_loc.head()


Out[11]:
Name WBAN lat long mlong first_year last_year time_correction
0 CALGARY INT'L. A 25110 51.10 114.02 105.0 1953 2005 07:00:00
1 COLD LAKE A 25129 54.42 110.28 105.0 1954 2005 07:00:00
2 CORONATION 25113 52.10 111.45 105.0 1953 1994 07:00:00
3 COWLEY A CAN43 49.63 114.08 105.0 1953 1959 07:00:00
4 EDMONTON INT'L. A 25142 53.32 113.58 105.0 1961 2005 07:00:00

In [12]:
del D_loc['mlong'] #No longer needed

In [13]:
D_loc.head()


Out[13]:
Name WBAN lat long first_year last_year time_correction
0 CALGARY INT'L. A 25110 51.10 114.02 1953 2005 07:00:00
1 COLD LAKE A 25129 54.42 110.28 1954 2005 07:00:00
2 CORONATION 25113 52.10 111.45 1953 1994 07:00:00
3 COWLEY A CAN43 49.63 114.08 1953 1959 07:00:00
4 EDMONTON INT'L. A 25142 53.32 113.58 1961 2005 07:00:00

In [14]:
def wban_fname(wban):
    '''Find the path of the WY2 file whose name starts with this WBAN identifier.'''
    for root, dirs, files in os.walk(DATA_DIR):
        for f in files:
            if f.endswith('WY2') and f.startswith(wban):
                return os.path.join(root, f)
    return None #No matching file was found
%timeit wban_fname('xxx') #check time to traverse whole directory structure


2.91 ms ± 11.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [15]:
D_loc['WBAN_file'] = D_loc['WBAN'].apply(wban_fname)

In [16]:
D_loc.head()


Out[16]:
Name WBAN lat long first_year last_year time_correction WBAN_file
0 CALGARY INT'L. A 25110 51.10 114.02 1953 2005 07:00:00 ../data/raw/ALBERTA/CalgaryInt'l.A_1953-2005/2...
1 COLD LAKE A 25129 54.42 110.28 1954 2005 07:00:00 ../data/raw/ALBERTA/ColdLakeA_1954-2005/25129.WY2
2 CORONATION 25113 52.10 111.45 1953 1994 07:00:00 ../data/raw/ALBERTA/Coronation_1953-1994/25113...
3 COWLEY A CAN43 49.63 114.08 1953 1959 07:00:00 ../data/raw/ALBERTA/CowleyA_1953-1959/CAN43.WY2
4 EDMONTON INT'L. A 25142 53.32 113.58 1961 2005 07:00:00 ../data/raw/ALBERTA/EdmontonInt'l.A_1961-2005/...

In [17]:
WY2_cols = [(6, 16), (91, 95), (95, 96)] #Time, temperature, temp flag
col_names = ['Time', 'T', 'T_flag'] #Flags indicate missing or estimated data

In [18]:
#Hours are from 01...24, so we must subtract 1 from the hour

#Parsing the date takes by far the most time
def process_wy2f(file_path, time_correction):
    #0: Observed
    #-1: Missing data
    #1: Algorithmically adjusted
    #2: Hand estimate
    #3: Interpolated
    #4: Determined from a model
    #5: Derived
    Tflags = {'': 0,'9': -1, 'A': 1, 'E': 2, 'I': 3, 'M': 4, 'Q': 5}
    convert_Tflag = lambda tf: Tflags[tf]
    #date_parser = lambda d: pd.to_datetime(str(int(d) - 1), format = '%Y%m%d%H') + td_1hr
    #This is about 5x faster
    date_parser = lambda d: datetime.datetime(int(d[0:4]), int(d[4:6]), int(d[6:8]), int(d[8:10]) - 1)
    D = pd.read_fwf(file_path, colspecs = WY2_cols, nrows = 100, #nrows=100: only the first 100 rows are read here
                header = None, names = col_names, parse_dates = ['Time'],
                converters = {'T_flag' : convert_Tflag},
                date_parser = date_parser)
    D['T_flag'] = D['T_flag'].fillna(0) #fillna returns a copy, so assign it back
    D.loc[:, 'T'] = D.loc[:, 'T'] / 10 #Raw temperatures are stored in tenths of a degree
    D.loc[:, 'Time'] = D.loc[:, 'Time'] + time_correction
    D.index = pd.DatetimeIndex(D.loc[:, 'Time'])
    del D['Time']
    return D

In [23]:
D = process_wy2f(D_loc['WBAN_file'][0], D_loc['time_correction'][0])
%timeit process_wy2f(D_loc['WBAN_file'][0], D_loc['time_correction'][0])


4.24 ms ± 19.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [24]:
D.head()


Out[24]:
T T_flag
Time
1953-01-01 07:00:00 -1.1 0
1953-01-01 08:00:00 -0.6 0
1953-01-01 09:00:00 0.0 0
1953-01-01 10:00:00 0.0 0
1953-01-01 11:00:00 -1.7 0

In [25]:
D.to_hdf('test.hdf', key = 'test')
D = pd.read_hdf('test.hdf', key = 'test')
D.head()


Out[25]:
T T_flag
Time
1953-01-01 07:00:00 -1.1 0
1953-01-01 08:00:00 -0.6 0
1953-01-01 09:00:00 0.0 0
1953-01-01 10:00:00 0.0 0
1953-01-01 11:00:00 -1.7 0

In [26]:
#Generator yielding each row as a pd.Series.
#This will be used with map to process
#all of the files in parallel.
(r for i, r in D_loc.iterrows())


Out[26]:
<generator object <genexpr> at 0x7fc1a84a5048>
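
Below is a sketch (not part of the original run) of how that generator could feed a parallel map, here via concurrent.futures.ProcessPoolExecutor; process_row is a hypothetical wrapper around process_wy2f, and on platforms that spawn rather than fork workers, functions defined in a notebook may not pickle cleanly.

from concurrent.futures import ProcessPoolExecutor

def process_row(row):
    '''Hypothetical helper: parse one station's WY2 file and shift its timestamps to UTC.'''
    return process_wy2f(row['WBAN_file'], row['time_correction'])

with ProcessPoolExecutor() as pool:
    #Map the per-station helper over the row generator, one DataFrame per station
    frames = list(pool.map(process_row, (r for i, r in D_loc.iterrows())))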

In [ ]: