In [ ]:
fn = '/Users/savoie/projects/monthly_sea_ice_extent_tools/source/data/csvify/Dec/N_12_area.txt'
#fn = '/Users/savoie/projects/monthly_sea_ice_extent_tools/source/data/csvify/Jun/N_06_area.txt'
#fn = '/Users/savoie/projects/monthly_sea_ice_extent_tools/source/data/csvify/Sep/N_09_area.txt'

In [ ]:
%matplotlib inline
import matplotlib.pylab
import pandas as pd
import numpy as np
import matplotlib as mpl
import datetime as dt
from scipy.stats import linregress
pd.options.display.mpl_style = 'default'

In [ ]:
def slurp_csv(filename):
    with open(filename, 'r') as fp:
        data = pd.read_csv(fp,
                           error_bad_lines=False,
                           warn_bad_lines=False,
                           skipinitialspace=True,
                           delimiter='\s+' )
    data.rename(columns={'mo': 'month', "region":"hemisphere"}, inplace=True)
    return data.dropna()

In [ ]:
data = slurp_csv(fn)

Set missing data to None Type.


In [ ]:
data.loc[ data.extent <= 0, ['extent', 'area', 'data_type']] = None
data = data.convert_objects(convert_numeric=True)

In [ ]:
data.dtypes

In [ ]:
a = data.copy()

Create an column of Date Periods and set it as the index.


In [ ]:
a['dates']= [pd.Period(dt.date(int(x[0]), int(x[1]), 1), "M") for x in zip(a['year'], a['month'])]
a = a.set_index('dates')

a['rank'] = pd.DataFrame(data=a['extent'].rank(ascending=1))

In [ ]:
a.head()

create new dataframe for the (rank sorted stuff)


In [ ]:
b = pd.DataFrame(index=a.index)

In [ ]:
#filler
b[' reordered => '] = " "

b['ordered_rank'] = pd.DataFrame(data=a['extent'].rank(ascending=1))

In [ ]:
b['ranked_year'] = a['year']
b['ranked_extent'] = a['extent']

In [ ]:
b.sort('ordered_rank', ascending=True, inplace=True)

compute extent anomaly


In [ ]:
climatological_mean = a[(a.year >= 1981) & (a.year <= 2010)].extent.mean()

In [ ]:
a['extent_anomaly'] = a.extent - climatological_mean

In [ ]:
a.head()

I can't get expanding apply to work with two values, and I can't cram them together so I'm going to have to loop over the indices.


In [ ]:
columns = ['trend_through_year_km2_per_year', 'p_value', 'r_value',
           'stderr', 'Significant', '% trend_through_year']

for x in columns:
    a[x] = None
    a[x] = a[x].astype(np.float64)

In [ ]:
a.head()

Article on operational definition of Statistically Meaningful Trend (may or may not be correct in our case) http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3084280/

If one or several regressions concerning time and values in a time series, or time and mean values from intervals into which the series has been divided, yields r2≥0.65 and p≤0.05, then the time series is statistically meaningful.


In [ ]:
for i in range(0,len(a)):
    cum_df = a.iloc[:i+1]
    (slope, intercept, r_value, p_value, stderr) = linregress(cum_df['year'], cum_df['extent'])
    a.ix[i, ['trend_through_year_km2_per_year', 'r_value', 'p_value', 'stderr']] = round(slope, 4) * 1000000, r_value, p_value, stderr
    a.ix[i, '% trend_through_year'] =  slope / climatological_mean * 10 * 100.
    a.ix[i, 'Significant'] = (np.square(r_value) >= .65) & (p_value <= .05)

In [ ]:
# cram the two parts together 
a = a.reset_index().join(b.reset_index(drop=True))

Fake the standard header stuff.


In [ ]:


In [ ]:
the_max = a[a['rank'] == a['rank'].max()]
the_min = a[a['rank'] == a['rank'].min()]
#print a[['dates', 'extent']][a['rank'] == a['rank'].min()]
this_year =  a[a['year'] == 2014]

In [ ]:
this_date = this_year['dates'].values[0].strftime("%B %Y")
max_date = the_max['dates'].values[0].strftime("%B %Y")
min_date = the_min['dates'].values[0].strftime("%B %Y")

In [ ]:
the_extent = this_year['extent'].values[0]
the_rank = this_year['rank'].values[0]
the_trend = this_year['trend_through_year_km2_per_year'].values[0]
the_pct_trend = this_year['% trend_through_year'].values[0]

In [ ]:
print the_extent- the_max.extent.values[0]

In [ ]:
print('{0} extent, {1:.2f} Mkm^2'.format(this_date, the_extent))
print('June 1981-2010 mean extent, {0:.2f} Mkm^2'.format(climatological_mean))
print('{0} - June 1981-2010, {1:0.0f} km^2'.format(this_date, (the_extent - round(climatological_mean,2))*1000000))
print('{0} (rank), {1:.1f},  {2} higher, {3} lower'.format(this_date, the_rank, len(a)-the_rank, the_rank-1))
print('{0} (max), {1:.2f} Mkm^2, diff, {2:.0f} km^2'.format(max_date, the_max.extent.values[0], (the_extent - the_max.extent.values[0])*1000000))
print('{0} (min), {1:.2f} Mkm^2, diff, {2:.0f} km^2'.format(min_date, the_min.extent.values[0], (the_extent - the_min.extent.values[0])*1000000))
print('{0} trend {1:.2f} percent/decade'.format(this_date, the_pct_trend))
print('{0} trend {1:.0f} percent/decade'.format(this_date, the_trend))

In [ ]:
with open('../output/test.csv', 'w') as fp:
    fp.write('{0} extent, {1:.2f} Mkm^2\n'.format(this_date, the_extent))
    fp.write('June 1981-2010 mean extent, {0:.2f} Mkm^2\n'.format(climatological_mean))
    fp.write('{0} - June 1981-2010, {1:0.0f} km^2\n'.format(this_date, (the_extent - round(climatological_mean,2))*1000000))
    fp.write('{0} (rank), {1:.1f},  {2} higher, {3} lower\n'.format(this_date, the_rank, len(a)-the_rank, the_rank-1))
    fp.write('{0} (max), {1:.2f} Mkm^2, diff, {2:.0f} km^2\n'.format(max_date, the_max.extent.values[0], (the_extent - the_max.extent.values[0])*1000000))
    fp.write('{0} (min), {1:.2f} Mkm^2, diff, {2:.0f} km^2\n'.format(min_date, the_min.extent.values[0], (the_extent - the_min.extent.values[0])*1000000))
    fp.write('{0} trend, {1:.2f} percent/decade\n'.format(this_date, the_pct_trend))
    fp.write('{0} trend, {1:.0f} percent/decade\n'.format(this_date, the_trend))
    a.to_csv(fp, header=True, float_format="%3.2f", index=False )

In [ ]:
a.reset_index(drop=True).head()

In [ ]: