In [ ]:
fn = '/Users/savoie/projects/monthly_sea_ice_extent_tools/source/data/csvify/Dec/N_12_area.txt'
#fn = '/Users/savoie/projects/monthly_sea_ice_extent_tools/source/data/csvify/Jun/N_06_area.txt'
#fn = '/Users/savoie/projects/monthly_sea_ice_extent_tools/source/data/csvify/Sep/N_09_area.txt'
In [ ]:
%matplotlib inline
import matplotlib.pylab
import pandas as pd
import numpy as np
import matplotlib as mpl
import datetime as dt
from scipy.stats import linregress
pd.options.display.mpl_style = 'default'
In [ ]:
def slurp_csv(filename):
with open(filename, 'r') as fp:
data = pd.read_csv(fp,
error_bad_lines=False,
warn_bad_lines=False,
skipinitialspace=True,
delimiter='\s+' )
data.rename(columns={'mo': 'month', "region":"hemisphere"}, inplace=True)
return data.dropna()
In [ ]:
data = slurp_csv(fn)
Set missing data to None Type.
In [ ]:
data.loc[ data.extent <= 0, ['extent', 'area', 'data_type']] = None
data = data.convert_objects(convert_numeric=True)
In [ ]:
data.dtypes
In [ ]:
a = data.copy()
Create an column of Date Periods and set it as the index.
In [ ]:
a['dates']= [pd.Period(dt.date(int(x[0]), int(x[1]), 1), "M") for x in zip(a['year'], a['month'])]
a = a.set_index('dates')
a['rank'] = pd.DataFrame(data=a['extent'].rank(ascending=1))
In [ ]:
a.head()
create new dataframe for the (rank sorted stuff)
In [ ]:
b = pd.DataFrame(index=a.index)
In [ ]:
#filler
b[' reordered => '] = " "
b['ordered_rank'] = pd.DataFrame(data=a['extent'].rank(ascending=1))
In [ ]:
b['ranked_year'] = a['year']
b['ranked_extent'] = a['extent']
In [ ]:
b.sort('ordered_rank', ascending=True, inplace=True)
compute extent anomaly
In [ ]:
climatological_mean = a[(a.year >= 1981) & (a.year <= 2010)].extent.mean()
In [ ]:
a['extent_anomaly'] = a.extent - climatological_mean
In [ ]:
a.head()
I can't get expanding apply to work with two values, and I can't cram them together so I'm going to have to loop over the indices.
In [ ]:
columns = ['trend_through_year_km2_per_year', 'p_value', 'r_value',
'stderr', 'Significant', '% trend_through_year']
for x in columns:
a[x] = None
a[x] = a[x].astype(np.float64)
In [ ]:
a.head()
Article on operational definition of Statistically Meaningful Trend (may or may not be correct in our case) http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3084280/
If one or several regressions concerning time and values in a time series, or time and mean values from intervals into which the series has been divided, yields r2≥0.65 and p≤0.05, then the time series is statistically meaningful.
In [ ]:
for i in range(0,len(a)):
cum_df = a.iloc[:i+1]
(slope, intercept, r_value, p_value, stderr) = linregress(cum_df['year'], cum_df['extent'])
a.ix[i, ['trend_through_year_km2_per_year', 'r_value', 'p_value', 'stderr']] = round(slope, 4) * 1000000, r_value, p_value, stderr
a.ix[i, '% trend_through_year'] = slope / climatological_mean * 10 * 100.
a.ix[i, 'Significant'] = (np.square(r_value) >= .65) & (p_value <= .05)
In [ ]:
# cram the two parts together
a = a.reset_index().join(b.reset_index(drop=True))
Fake the standard header stuff.
In [ ]:
In [ ]:
the_max = a[a['rank'] == a['rank'].max()]
the_min = a[a['rank'] == a['rank'].min()]
#print a[['dates', 'extent']][a['rank'] == a['rank'].min()]
this_year = a[a['year'] == 2014]
In [ ]:
this_date = this_year['dates'].values[0].strftime("%B %Y")
max_date = the_max['dates'].values[0].strftime("%B %Y")
min_date = the_min['dates'].values[0].strftime("%B %Y")
In [ ]:
the_extent = this_year['extent'].values[0]
the_rank = this_year['rank'].values[0]
the_trend = this_year['trend_through_year_km2_per_year'].values[0]
the_pct_trend = this_year['% trend_through_year'].values[0]
In [ ]:
print the_extent- the_max.extent.values[0]
In [ ]:
print('{0} extent, {1:.2f} Mkm^2'.format(this_date, the_extent))
print('June 1981-2010 mean extent, {0:.2f} Mkm^2'.format(climatological_mean))
print('{0} - June 1981-2010, {1:0.0f} km^2'.format(this_date, (the_extent - round(climatological_mean,2))*1000000))
print('{0} (rank), {1:.1f}, {2} higher, {3} lower'.format(this_date, the_rank, len(a)-the_rank, the_rank-1))
print('{0} (max), {1:.2f} Mkm^2, diff, {2:.0f} km^2'.format(max_date, the_max.extent.values[0], (the_extent - the_max.extent.values[0])*1000000))
print('{0} (min), {1:.2f} Mkm^2, diff, {2:.0f} km^2'.format(min_date, the_min.extent.values[0], (the_extent - the_min.extent.values[0])*1000000))
print('{0} trend {1:.2f} percent/decade'.format(this_date, the_pct_trend))
print('{0} trend {1:.0f} percent/decade'.format(this_date, the_trend))
In [ ]:
with open('../output/test.csv', 'w') as fp:
fp.write('{0} extent, {1:.2f} Mkm^2\n'.format(this_date, the_extent))
fp.write('June 1981-2010 mean extent, {0:.2f} Mkm^2\n'.format(climatological_mean))
fp.write('{0} - June 1981-2010, {1:0.0f} km^2\n'.format(this_date, (the_extent - round(climatological_mean,2))*1000000))
fp.write('{0} (rank), {1:.1f}, {2} higher, {3} lower\n'.format(this_date, the_rank, len(a)-the_rank, the_rank-1))
fp.write('{0} (max), {1:.2f} Mkm^2, diff, {2:.0f} km^2\n'.format(max_date, the_max.extent.values[0], (the_extent - the_max.extent.values[0])*1000000))
fp.write('{0} (min), {1:.2f} Mkm^2, diff, {2:.0f} km^2\n'.format(min_date, the_min.extent.values[0], (the_extent - the_min.extent.values[0])*1000000))
fp.write('{0} trend, {1:.2f} percent/decade\n'.format(this_date, the_pct_trend))
fp.write('{0} trend, {1:.0f} percent/decade\n'.format(this_date, the_trend))
a.to_csv(fp, header=True, float_format="%3.2f", index=False )
In [ ]:
a.reset_index(drop=True).head()
In [ ]: