The input data format is just a date and an extent value for each day for which we have data.
Year, Month, Day, Extent, Missing, Source Data
YYYY, MM, DD, 10^6 sq km, 10^6 sq km, Source data product web site: http://nsidc.org/d....
1978, 10, 26, 10.231, 0.000, ftp://sidads.colorado.edu/pub/DATASETS/nsidc0051....
1978, 10, 28, 10.420, 0.000, ftp://sidads.colorado.edu/pub/DATASETS/nsidc0051....
1978, 10, 30, 10.557, 0.000, ftp://sidads.colorado.edu/pub/DATASETS/nsidc0051....
....
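A minimal sketch of how those sample rows parse, with a placeholder string standing in for the truncated ftp source URLs shown above:

import io
import pandas as pd

# 'nsidc0051' is just a placeholder for the truncated source URL in the sample rows
sample = io.StringIO("1978, 10, 26, 10.231, 0.000, nsidc0051\n"
                     "1978, 10, 28, 10.420, 0.000, nsidc0051\n")
peek = pd.read_csv(sample, header=None, skipinitialspace=True,
                   names=["year", "mm", "dd", "extent", "missing", "source"])
print(peek)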
Start by downloading the daily sea ice extent data from NSIDC's website.
In [207]:
!mkdir -p ../data
!wget -P ../data -qN ftp://sidads.colorado.edu/pub/DATASETS/NOAA/G02135/north/daily/data/NH_seaice_extent_final.csv
!wget -P ../data -qN ftp://sidads.colorado.edu/pub/DATASETS/NOAA/G02135/north/daily/data/NH_seaice_extent_nrt.csv
!wget -P ../data -qN ftp://sidads.colorado.edu/pub/DATASETS/NOAA/G02135/south/daily/data/SH_seaice_extent_final.csv
!wget -P ../data -qN ftp://sidads.colorado.edu/pub/DATASETS/NOAA/G02135/south/daily/data/SH_seaice_extent_nrt.csv
In [208]:
import datetime as dt
import numpy as np
import os
import pandas as pd
from pandas import ExcelWriter
Code to read the CSV files.
In [209]:
def parse_the_date(year, mm, dd):
    return dt.date(int(year), int(mm), int(dd))

def slurp_csv(filename):
    data = pd.read_csv(filename, header=None, skiprows=2,
                       names=["year", "mm", "dd", "extent", "missing", "source"],
                       parse_dates={'date': ['year', 'mm', 'dd']},
                       date_parser=parse_the_date, index_col='date')
    data = data.drop('missing', axis=1)
    return data

def read_a_hemisphere(hemisphere):
    the_dir = '../data'
    final_prod_filename = os.path.join(
        the_dir, '{hemi}H_seaice_extent_final.csv'.format(hemi=hemisphere[0:1].upper()))
    nrt_prod_filename = os.path.join(
        the_dir, '{hemi}H_seaice_extent_nrt.csv'.format(hemi=hemisphere[0:1].upper()))
    final = slurp_csv(final_prod_filename)
    nrt = slurp_csv(nrt_prod_filename)
    all_data = pd.concat([final, nrt])
    return all_data
Read CSV data
In [210]:
north = read_a_hemisphere('north')
south = read_a_hemisphere('south')
south.head()
Out[210]:
Add columns for year and month: we have to do this because when we read the CSV files we converted the existing year/month/day columns into a single Python datetime index. Also drop the source column, because we don't care where the data came from (near-real-time or final production).
In [211]:
def add_year_month_columns(df):
    a = df.copy()
    a = a.drop('source', axis=1)
    a = a.reset_index()
    a['year'] = pd.to_datetime(a.date).dt.year
    a['month'] = pd.to_datetime(a.date).dt.month
    a = a.set_index('date')
    return a
In [212]:
north = add_year_month_columns(north)
south = add_year_month_columns(south)
In [213]:
north.head()
Out[213]:
In [214]:
south.head()
Out[214]:
Add a 5-day rolling mean to the timeseries.
In [215]:
def add_rolling_mean(df, window=5, min_periods=2):
    copy = df.copy()
    # create an empty daily ts covering the full record to align our extent data with
    ts = pd.Series(np.nan, index=pd.date_range('1978-10-25', dt.date.today().strftime('%Y-%m-%d')))
    copy.index = copy.index.to_datetime()
    # align onto the complete daily index so the rolling window sees every calendar day
    copy = copy.align(ts, axis=0, join='right')[0]
    df['5day-Avg'] = pd.rolling_mean(copy['extent'], window=window, min_periods=min_periods)
    return df
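As a side note, min_periods=2 means a mean is reported as soon as two observations fall inside the 5-day window, which matters for the sparse every-other-day records at the start of the series. A minimal sketch of that behavior, written against the newer Series.rolling API (the equivalent of the pd.rolling_mean call above):

import pandas as pd

s = pd.Series([10.2, None, 10.4, None, 10.6],
              index=pd.date_range('1978-10-26', periods=5))
# with min_periods=2 the mean appears once two values are inside the window,
# even though the 5-day window is not yet full
print(s.rolling(window=5, min_periods=2).mean())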
We want the date back in the columns rather than in the index.
In [216]:
north = add_rolling_mean(north)
south = add_rolling_mean(south)
In [217]:
north.head(1)
Out[217]:
In [218]:
north = north.reset_index()
south = south.reset_index()
north.head(1)
Out[218]:
Use a groupby to compute the row locations of the minimum and maximum extent and grab those rows into new variables. In other words, filter out everything but the minimum/maximum extent for each month and year.
In [219]:
def select_min_and_max_variable_rows_by_year_and_month(df, variable):
    min_groups = df.loc[df.groupby(['year', 'month'])[variable].idxmin()][['date', variable, 'year', 'month']]
    max_groups = df.loc[df.groupby(['year', 'month'])[variable].idxmax()][['date', variable, 'year', 'month']]
    return {'min': min_groups, 'max': max_groups}
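As a quick illustration with toy numbers (not the sea ice data): groupby(...).idxmin() returns the index label of the minimum row within each group, and .loc then pulls those full rows back out.

import pandas as pd

toy = pd.DataFrame({'year':   [1979, 1979, 1980, 1980],
                    'month':  [9, 9, 9, 9],
                    'extent': [7.2, 7.0, 7.5, 7.1]})
# idxmin gives one index label per (year, month) group -- here labels 1 and 3
labels = toy.groupby(['year', 'month'])['extent'].idxmin()
print(toy.loc[labels])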
Create dictionaries of max and min values for each hemisphere, for both the daily and the rolling-mean data.
In [220]:
n = select_min_and_max_variable_rows_by_year_and_month(north, 'extent')
navg = select_min_and_max_variable_rows_by_year_and_month(north, '5day-Avg')
s = select_min_and_max_variable_rows_by_year_and_month(south, 'extent')
savg = select_min_and_max_variable_rows_by_year_and_month(south, '5day-Avg')
Show that we have actually selected different rows for the daily and the 5-day-average data.
In [221]:
n['max'][3:5]
Out[221]:
In [222]:
navg['max'][3:5]
Out[222]:
In [223]:
def add_rank(df, rank_by, ascending):
    df['rank'] = df.groupby('month')[rank_by].rank(ascending=ascending, method='first')
    return df
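Note that method='first' breaks ties by order of appearance, so rows with equal extents still receive distinct ranks within a month. A quick sketch with toy values:

import pandas as pd

toy = pd.DataFrame({'month':  [9, 9, 9],
                    'extent': [7.1, 7.1, 6.9]})
# the two 7.1 values tie, but method='first' assigns them ranks 2 and 3 in row order
print(toy.groupby('month')['extent'].rank(ascending=True, method='first'))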
Add a rank column for each month and each hemisphere's max and min:
In [224]:
n['max'] = add_rank(n['max'], 'extent', ascending=False)
n['min'] = add_rank(n['min'], 'extent', ascending=True)
s['max'] = add_rank(s['max'], 'extent', ascending=False)
s['min'] = add_rank(s['min'], 'extent', ascending=True)
navg['max'] = add_rank(navg['max'], '5day-Avg', ascending=False)
navg['min'] = add_rank(navg['min'], '5day-Avg', ascending=True)
savg['max'] = add_rank(savg['max'], '5day-Avg', ascending=False)
savg['min'] = add_rank(savg['min'], '5day-Avg', ascending=True)
In [225]:
def do_annual_min_max_ranking(df, field):
    min_index = df.groupby(['year'])[field].idxmin()
    max_index = df.groupby(['year'])[field].idxmax()
    mindata = df.loc[min_index][['date', field]]
    mindata['rank'] = mindata[field].rank(ascending=True, method='first')
    maxdata = df.loc[max_index][['date', field]]
    maxdata['rank'] = maxdata[field].rank(ascending=False, method='first')
    mindata = mindata.set_index(pd.to_datetime(mindata.date).dt.year)
    maxdata = maxdata.set_index(pd.to_datetime(maxdata.date).dt.year)
    maxdata = maxdata.add_prefix('max_')
    mindata = mindata.add_prefix('min_')
    data = pd.concat([mindata, maxdata], axis=1)
    return data
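The add_prefix/concat step at the end simply places the annual minimum and maximum columns side by side, aligned on the year index. A toy sketch of that pattern:

import pandas as pd

mins = pd.DataFrame({'extent': [6.9, 7.0]}, index=[1979, 1980]).add_prefix('min_')
maxs = pd.DataFrame({'extent': [15.6, 15.4]}, index=[1979, 1980]).add_prefix('max_')
# columns from both frames end up in one row per year
print(pd.concat([mins, maxs], axis=1))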
We also want annual min/max rank data, so revisit the north and south dataframes.
In [226]:
north_annual_by_day = do_annual_min_max_ranking(north, 'extent')
north_annual_averaged = do_annual_min_max_ranking(north, '5day-Avg')
In [227]:
south_annual_by_day = do_annual_min_max_ranking(south, 'extent')
south_annual_averaged = do_annual_min_max_ranking(south, '5day-Avg')
In [228]:
south_annual_averaged.head(3)
Out[228]:
Write out the data frames in a nice format
In [229]:
a = navg['min'].copy()
In [230]:
a.columns
Out[230]:
In [231]:
a.set_index(['rank', 'month']).unstack('month').head(3)
Out[231]:
In [232]:
import calendar
month_names = [calendar.month_name[x] for x in range(1,13)]
def swap_column_level_and_sort(df):
    df.columns = df.columns.swaplevel(1, 0)
    df = df.sortlevel(0, axis=1)
    return df

# set index to rank and month and then broadcast month back across the columns.
# next swap and sort so that you have the data grouped under the month.
def prepare_for_csv(df):
    df = df.set_index(['rank', 'month']).unstack('month')
    df = swap_column_level_and_sort(df)
    df.columns = df.columns.set_levels(month_names, level=0)
    return df

def write_to_xls(df_list, writer, is_monthly=True):
    for df, sheet in df_list:
        if is_monthly:
            df = prepare_for_csv(df)
        df.to_excel(writer, '{sheet}'.format(sheet=sheet), float_format="%.3f")
In [233]:
writer = ExcelWriter('../output/Sea_Ice_MinMax_Statistics.xls')
In [234]:
monthly_dataframelist = [(navg['min'], 'Northern 5day Min'),
                         (navg['max'], 'Northern 5day Max'),
                         (savg['min'], 'Southern 5day Min'),
                         (savg['max'], 'Southern 5day Max'),
                         (n['min'], 'Northern Daily Min'),
                         (n['max'], 'Northern Daily Max'),
                         (s['min'], 'Southern Daily Min'),
                         (s['max'], 'Southern Daily Max')]
annual_dataframelist = [(north_annual_averaged, 'North Annual 5day-avg'),
                        (north_annual_by_day, 'North Annual Daily'),
                        (south_annual_averaged, 'South Annual 5day-avg'),
                        (south_annual_by_day, 'South Annual Daily')]
In [235]:
write_to_xls(monthly_dataframelist, writer, is_monthly=True)
In [236]:
write_to_xls(annual_dataframelist, writer, is_monthly=False)
writer.save()
In [237]:
b = prepare_for_csv(a)
In [238]:
b
Out[238]:
Clean up the downloaded CSV files.
In [239]:
!cd ../data ; rm -f NH_seaice_extent_final.csv NH_seaice_extent_nrt.csv SH_seaice_extent_final.csv SH_seaice_extent_nrt.csv