Notebook to work with Altimetry and Lake Surface Area



In [698]:

    
% matplotlib inline
import pandas as pd
import glob
import matplotlib.pyplot as plt



In [699]:

    
GRLM = "345_GRLM10.txt"; print GRLM
df_grlm = pd.read_csv(GRLM, skiprows=43, delim_whitespace=True, names="mission,cycle,date,hour,minute,lake_height,error,mean(decibels),IonoCorrection,TropCorrection".split(","), engine='python', index_col=False)
df_grlm.head(5)









    



345_GRLM10.txt






    Out[699]:







  
    
      
      mission
      cycle
      date
      hour
      minute
      lake_height
      error
      mean(decibels)
      IonoCorrection
      TropCorrection
    
  
  
    
      0
      OSTM
      1
      20080712
      23
      24
      999.99
      99.999
      999.99
      AMR
      GIM
    
    
      1
      OSTM
      2
      20080722
      21
      23
      -0.49
      0.054
      47.27
      AMR
      GIM
    
    
      2
      OSTM
      3
      20080801
      19
      21
      -0.53
      0.055
      43.11
      AMR
      GIM
    
    
      3
      OSTM
      4
      20080811
      17
      20
      -0.83
      0.054
      43.55
      AMR
      GIM
    
    
      4
      OSTM
      5
      20080821
      15
      18
      -1.05
      0.061
      42.62
      AMR
      GIM

GRLM Altimetry data from July 22 2008 to September 3, 2016

Create new columns of year, month, day in a convenient format



In [700]:

    
df_grlm = pd.read_csv(GRLM, skiprows=43, delim_whitespace=True, names="mission,cycle,date,hour,minute,lake_height,error,mean(decibels),IonoCorrection,TropCorrection".split(","), engine='python', index_col=False)
def get_year(date): return int(str(date)[0:4])
def get_month(date): return int(str(date)[4:6])
def get_day(date): return int(str(date)[6:])
df_grlm['year'] = df_grlm['date'].apply(get_year)
df_grlm['month'] = df_grlm['date'].apply(get_month)
df_grlm['day'] = df_grlm['date'].apply(get_day)
df_grlm = df_grlm.where(df_grlm.minute < 61 ) # remove lines that do not have time
df_grlm = df_grlm.where(df_grlm.lake_height < 900 ) # remove entries that do not have lake-height



In [701]:

    
df_grlm.lake_height.plot(); plt.title("Actual data without resampling"); plt.ylabel("Variation (m)")









    Out[701]:





<matplotlib.text.Text at 0x7f134b6dec10>

Interpolate the missing data points



In [702]:

    
df_grlm.lake_height.interpolate().plot(); plt.title("Interpolated Actual data without resampling"); plt.ylabel("Variation (m)")









    Out[702]:





<matplotlib.text.Text at 0x7f134b52a650>

Add time information to the dataframe



In [703]:

    
df = df_grlm
df[["year", "month", "day", "hour", "minute"]] = df[["year", "month", "day", "hour", "minute"]].fillna(0).astype(int)
df['Time'] = df.year.astype(str).str.cat(df.month.astype(str).astype(str), sep='-').str.cat(df.day.astype(str), sep='-')\
                .str.cat(df.hour.astype(str).astype(str), sep='-').str.cat(df.minute.astype(str).astype(str), sep='-')
df = df.where(df.year>10) # to ger rid of all the nan values
df.index = pd.to_datetime(pd.Series(df["Time"]), format="%Y-%m-%d-%H-%M"); 
print df.index[0:3], df.index[-3:]









    



DatetimeIndex(['NaT', '2008-07-22 21:23:00', '2008-08-01 19:21:00'], dtype='datetime64[ns]', name=u'Time', freq=None) DatetimeIndex(['2016-09-03 16:02:00', 'NaT', 'NaT'], dtype='datetime64[ns]', name=u'Time', freq=None)

Resample the data to get monthly and annual variation in lake height



In [704]:

    
df["lake_height"].resample("M").mean().plot(); plt.title("Mean Monthly Altimetry"); plt.ylabel("Variation (m)")









    Out[704]:





<matplotlib.text.Text at 0x7f134a5edb90>



In [705]:

    
df["lake_height"].resample("A").mean().plot(); plt.title("Mean Annual Altimetry"); plt.ylabel("Variation (m)")









    Out[705]:





<matplotlib.text.Text at 0x7f134a526b50>

MODIS data Lake Surface Area (Feb 18, 2000 to Aug 13, 2015)



In [706]:

    
df_modis = pd.read_csv('MODIS_t.txt', names=["Area"], engine='python', index_col=False)
df_time = pd.read_csv('DV.txt', sep = "\t", names=["Year", "Month", "Day", "", "", ""], engine='python', index_col=False)
df_time['Time'] = df_time.Year.astype(str).str.cat(df_time.Month.astype(str).astype(str), sep='-').str.cat(df_time.Day.astype(str), sep='-')
df_time = df_time.where(df_time.Year>10) # to ger rid of all the nan values
df_modis.index = pd.to_datetime(pd.Series(df_time["Time"]), format="%Y-%m-%d")#df.index[0:3]



In [707]:

    
df_modis.plot(); plt.title("MODIS data - Surface Area"); plt.ylabel("Surface Area (sq.m.?)")









    Out[707]:





<matplotlib.text.Text at 0x7f134a7f00d0>

Create subsets of both vectors (altimetry and surface area) for the overlapping period



In [708]:

    
df_glrm_subset = df["lake_height"].resample("D").mean().interpolate()
df_glrm_subset = df_glrm_subset[(df_glrm_subset.index > '2008-07-22') & (df_glrm_subset.index <= '2015-08-13')]
df_glrm_subset.plot(); plt.legend(); plt.title("Subset of Altimetry"); plt.ylabel("Variation (m)")
df_glrm_subset.index









    Out[708]:





DatetimeIndex(['2008-07-23', '2008-07-24', '2008-07-25', '2008-07-26',
               '2008-07-27', '2008-07-28', '2008-07-29', '2008-07-30',
               '2008-07-31', '2008-08-01',
               ...
               '2015-08-04', '2015-08-05', '2015-08-06', '2015-08-07',
               '2015-08-08', '2015-08-09', '2015-08-10', '2015-08-11',
               '2015-08-12', '2015-08-13'],
              dtype='datetime64[ns]', name=u'Time', length=2578, freq=None)



In [709]:

    
df_modis_daily = df_modis["Area"].resample("D").mean().interpolate()
df_modis_subset = df_modis_daily[(df_modis_daily.index > '2008-07-22') & (df_modis_daily.index <= '2015-08-13')]
df_modis_subset.plot()
df_modis_subset.index









    Out[709]:





DatetimeIndex(['2008-07-23', '2008-07-24', '2008-07-25', '2008-07-26',
               '2008-07-27', '2008-07-28', '2008-07-29', '2008-07-30',
               '2008-07-31', '2008-08-01',
               ...
               '2015-08-04', '2015-08-05', '2015-08-06', '2015-08-07',
               '2015-08-08', '2015-08-09', '2015-08-10', '2015-08-11',
               '2015-08-12', '2015-08-13'],
              dtype='datetime64[ns]', name=u'Time', length=2578, freq='D')



In [710]:

    
# QA: Create a time series of time alone, to check the number of data points that we should have for days. 
#Note the vaiable called length
print pd.date_range('22/07/2008', periods=len(df_modis_subset), freq='D')









    



DatetimeIndex(['2008-07-22', '2008-07-23', '2008-07-24', '2008-07-25',
               '2008-07-26', '2008-07-27', '2008-07-28', '2008-07-29',
               '2008-07-30', '2008-07-31',
               ...
               '2015-08-03', '2015-08-04', '2015-08-05', '2015-08-06',
               '2015-08-07', '2015-08-08', '2015-08-09', '2015-08-10',
               '2015-08-11', '2015-08-12'],
              dtype='datetime64[ns]', length=2578, freq='D')



In [711]:

    
# Check if the two vectors are of the same length
print len(df_glrm_subset.tolist()), len(df_modis_subset.tolist())

Compute correlation coefficient



In [712]:

    
import numpy
cor = numpy.corrcoef(df_glrm_subset.resample("W").mean().interpolate().tolist(), 
                     df_modis_subset.resample("W").mean().interpolate().tolist())
print "correlation coefficient is: " , cor[0][1]









    



correlation coefficient is:  -0.612126394148

	mission	cycle	date	hour	minute	lake_height	error	mean(decibels)	IonoCorrection	TropCorrection
0	OSTM	1	20080712	23	24	999.99	99.999	999.99	AMR	GIM
1	OSTM	2	20080722	21	23	-0.49	0.054	47.27	AMR	GIM
2	OSTM	3	20080801	19	21	-0.53	0.055	43.11	AMR	GIM
3	OSTM	4	20080811	17	20	-0.83	0.054	43.55	AMR	GIM
4	OSTM	5	20080821	15	18	-1.05	0.061	42.62	AMR	GIM