Water Quality CSV Transects joined by TIME/DATE


In [1]:
import os
import pandas
from scripts import wqt_timestamp_match
from datetime import datetime
from scripts import wq_gain

In [2]:
# set wd to Arcproject-wq-processing folder
wd = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname("__file__"))))

# example gain file
wq_file = os.path.join(wd, "scripts", "tests", "testfiles", "Arc_040413\\Arc_040413_WQ\\Arc_040413_wqp_cc1.csv")


print(wq_file)


C:\Users\Andy\Documents\arcproject-wq-processing\scripts\tests\testfiles\Arc_040413\Arc_040413_WQ\Arc_040413_wqp_cc1.csv

Import CSV file from the Sonde as Pandas Dataframe


In [3]:
wq_gain_df = wqt_timestamp_match.wq_from_file(wq_file)
print(wq_gain_df.head())


            Date_Time   Temp    pH SpCond DO_PCT    DO  DEP25  PAR RPAR  \
1 2013-04-04 11:49:52  17.24  7.82    523   79.3  6.32  -0.06  477  753   
2 2013-04-04 11:49:53  17.24  7.82    523   77.7  6.20  -0.04  482  759   
3 2013-04-04 11:49:54  17.24  7.82    523   77.7  6.20  -0.04  482  759   
4 2013-04-04 11:49:55  17.24  7.82    523   76.3  6.09  -0.03  469  766   
5 2013-04-04 11:49:56  17.24  7.82    523   76.3  6.09  -0.03  469  766   

  TurbSC   CHL CHL_VOLTS               WQ_SOURCE  
1   19.0  3.17   0.03520  Arc_040413_wqp_cc1.csv  
2   16.1  3.67   0.03882  Arc_040413_wqp_cc1.csv  
3   16.1  3.67   0.03882  Arc_040413_wqp_cc1.csv  
4   16.4  4.04   0.04266  Arc_040413_wqp_cc1.csv  
5   16.4  4.04   0.04266  Arc_040413_wqp_cc1.csv  

In [4]:
# Convert water quality columns to dtype floats

# if field is dtype string convert to numeric
for column in list(wq_gain_df.columns.values): 
    if wq_gain_df[column].dtype == object:
        wq_gain_df[column] = wq_gain_df[column].convert_objects(convert_numeric=True)

average top 1m of gain vertical profile


In [5]:
# Create variable with TRUE if depth is greater than 0 and less than 1
depth1m = (wq_gain_df['DEP25'] > 0)  & (wq_gain_df['DEP25'] < 1 ) 

# Select all cases where depth1m is TRUE
wq_gain_1m = wq_gain_df[depth1m]

print(wq_gain_1m)


             Date_Time   Temp    pH  SpCond  DO_PCT    DO  DEP25  PAR  RPAR  \
7  2013-04-04 11:49:58  17.27  7.83     523    75.5  6.02   0.07  470   772   
8  2013-04-04 11:49:59  17.27  7.83     523    75.5  6.02   0.07  530   780   
9  2013-04-04 11:50:00  17.28  7.82     523    74.4  5.93   0.16  399   786   
10 2013-04-04 11:50:01  17.28  7.82     523    74.4  5.93   0.16  399   786   
11 2013-04-04 11:50:02  17.29  7.81     523    74.3  5.92   0.24  199   795   
12 2013-04-04 11:50:03  17.29  7.81     523    74.3  5.92   0.24  199   795   
13 2013-04-04 11:50:04  17.28  7.82     523    74.3  5.92   0.45  143   802   
14 2013-04-04 11:50:05  17.28  7.82     523    74.3  5.92   0.45  143   802   
15 2013-04-04 11:50:06  17.26  7.81     523    74.3  5.93   0.63  105   805   
16 2013-04-04 11:50:07  17.26  7.81     523    74.3  5.93   0.63  105   805   
17 2013-04-04 11:50:08  17.26  7.82     523    74.4  5.93   0.77   72   816   
18 2013-04-04 11:50:09  17.26  7.82     523    74.4  5.93   0.77   72   816   
19 2013-04-04 11:50:10  17.25  7.81     523    74.4  5.94   0.92   52   824   
42 2013-04-04 11:50:33  17.24  7.80     524    71.7  5.72   0.31  278   782   
43 2013-04-04 11:50:34  17.24  7.80     524    71.7  5.72   0.31  278   782   
44 2013-04-04 11:50:35  17.25  7.81     524    72.1  5.75   0.14  352   776   
45 2013-04-04 11:50:36  17.25  7.81     524    72.1  5.75   0.14  352   776   
46 2013-04-04 11:50:37  17.25  7.81     524    72.5  5.79   0.13  380   774   
47 2013-04-04 11:50:38  17.25  7.81     524    72.5  5.79   0.13  380   774   
48 2013-04-04 11:50:39  17.26  7.82     524    72.9  5.81   0.13  400   769   

    TurbSC   CHL  CHL_VOLTS               WQ_SOURCE  
7     18.2  4.53    0.04352  Arc_040413_wqp_cc1.csv  
8     17.8  4.53    0.04368  Arc_040413_wqp_cc1.csv  
9     16.8  4.55    0.04337  Arc_040413_wqp_cc1.csv  
10    16.8  4.55    0.04337  Arc_040413_wqp_cc1.csv  
11    16.7  4.52    0.04339  Arc_040413_wqp_cc1.csv  
12    16.7  4.52    0.04339  Arc_040413_wqp_cc1.csv  
13    16.6  4.52    0.04383  Arc_040413_wqp_cc1.csv  
14    16.6  4.52    0.04383  Arc_040413_wqp_cc1.csv  
15    17.5  4.57    0.04405  Arc_040413_wqp_cc1.csv  
16    17.5  4.57    0.04405  Arc_040413_wqp_cc1.csv  
17    17.6  4.59    0.04409  Arc_040413_wqp_cc1.csv  
18    17.6  4.59    0.04409  Arc_040413_wqp_cc1.csv  
19    18.1  4.59    0.04342  Arc_040413_wqp_cc1.csv  
42    98.5  4.61    0.04550  Arc_040413_wqp_cc1.csv  
43    98.5  4.61    0.04550  Arc_040413_wqp_cc1.csv  
44    19.2  4.74    0.05918  Arc_040413_wqp_cc1.csv  
45    19.2  4.74    0.05918  Arc_040413_wqp_cc1.csv  
46    16.2  6.16    0.05678  Arc_040413_wqp_cc1.csv  
47    16.2  6.16    0.05678  Arc_040413_wqp_cc1.csv  
48    16.0  5.91    0.05523  Arc_040413_wqp_cc1.csv  

In [6]:
# get mean for depth1m
avg_1m = wq_gain_1m.mean()

print(avg_1m)

# convert series to dataframe
avg_1m_df = avg_1m.to_frame().transpose()
print(avg_1m_df)


Temp          17.263500
pH             7.814500
SpCond       523.350000
DO_PCT        73.715000
DO             5.878500
DEP25          0.342500
PAR          265.400000
RPAR         790.850000
TurbSC        25.415000
CHL            4.804000
CHL_VOLTS      0.047311
dtype: float64
      Temp      pH  SpCond  DO_PCT      DO   DEP25    PAR    RPAR  TurbSC  \
0  17.2635  7.8145  523.35  73.715  5.8785  0.3425  265.4  790.85  25.415   

     CHL  CHL_VOLTS  
0  4.804   0.047311  

Add gain setting and site to new column


In [7]:
# get date from the first row of the dataframe
date = wq_gain_df["Date_Time"][1]

print(date)
length = len(wq_gain_df.index)
end = wq_gain_df["Date_Time"][length]

print(end)


2013-04-04 11:49:52
2013-04-04 11:50:39

In [8]:
# add gain and site columns to results dataframe

avg_1m_df['Start_Time'] = date
avg_1m_df['End_Time'] = end
avg_1m_df['Site'] = "CC1"
avg_1m_df['Gain'] = "g0"

avg_1m_df


Out[8]:
Temp pH SpCond DO_PCT DO DEP25 PAR RPAR TurbSC CHL CHL_VOLTS Start_Time End_Time Site Gain
0 17.2635 7.8145 523.35 73.715 5.8785 0.3425 265.4 790.85 25.415 4.804 0.047311 2013-04-04 11:49:52 2013-04-04 11:50:39 CC1 g0

In [9]:
# example gain file
gain_shp = os.path.join(wd, "scripts", "tests", "testfiles", "Arc_040413\\Arc_040413_GPS\\040413_ZoopChlW.shp")

gain_df = wqt_timestamp_match.wqtshp2pd(gain_shp)

print(gain_df)


   Site       Date        Time  Sample_Num Zooplankto  Number_of_ Chlorophyl  \
0   bk1 2013-04-04  11:23:53am           0        Yes           0        Yes   
1   cc1 2013-04-04  11:48:19am           0        Yes           0        Yes   
2  lnca 2013-04-04  01:35:01pm           0        Yes           1        Yes   
3   ca3 2013-04-04  01:48:55pm           0        Yes           0        Yes   
4   hs1 2013-04-04  04:24:08pm           0        Yes           0        Yes   
5   ul1 2013-04-04  05:21:35pm           0        Yes           0        Yes   
6   ca1 2013-04-04  05:56:49pm           0        Yes           2        Yes   

  WaterQuali Flow_Meter Flow_Mete2    GPS_Date    GPS_Time     POINT_X  \
0        Yes    08027.0    08061.2  2013-04-04  11:23:42am -121.796677   
1        Yes    08062.5    08088.8  2013-04-04  11:48:08am -121.799515   
2        Yes     8088.8     8142.8  2013-04-04  01:34:51pm -121.689085   
3        Yes    08144.7    08219.1  2013-04-04  01:48:45pm -121.698867   
4        Yes    08217.9    08227.8  2013-04-04  04:23:58pm -121.740307   
5        Yes     8228.0     8233.9  2013-04-04  05:21:25pm -121.767254   
6        Yes    08232.8    08237.5  2013-04-04  05:56:39pm -121.760051   

     POINT_Y           GPS_SOURCE           Date_Time  
0  38.275293  040413_ZoopChlW.shp 2013-04-04 11:23:42  
1  38.259457  040413_ZoopChlW.shp 2013-04-04 11:48:08  
2  38.244226  040413_ZoopChlW.shp 2013-04-04 13:34:51  
3  38.266168  040413_ZoopChlW.shp 2013-04-04 13:48:45  
4  38.314881  040413_ZoopChlW.shp 2013-04-04 16:23:58  
5  38.291483  040413_ZoopChlW.shp 2013-04-04 17:21:25  
6  38.307271  040413_ZoopChlW.shp 2013-04-04 17:56:39  

Joins!


In [10]:
ex_avg = avg_1m_df

t1 = wqt_timestamp_match.wqtshp2pd(gain_shp)
print(t1)


   Site       Date        Time  Sample_Num Zooplankto  Number_of_ Chlorophyl  \
0   bk1 2013-04-04  11:23:53am           0        Yes           0        Yes   
1   cc1 2013-04-04  11:48:19am           0        Yes           0        Yes   
2  lnca 2013-04-04  01:35:01pm           0        Yes           1        Yes   
3   ca3 2013-04-04  01:48:55pm           0        Yes           0        Yes   
4   hs1 2013-04-04  04:24:08pm           0        Yes           0        Yes   
5   ul1 2013-04-04  05:21:35pm           0        Yes           0        Yes   
6   ca1 2013-04-04  05:56:49pm           0        Yes           2        Yes   

  WaterQuali Flow_Meter Flow_Mete2    GPS_Date    GPS_Time     POINT_X  \
0        Yes    08027.0    08061.2  2013-04-04  11:23:42am -121.796677   
1        Yes    08062.5    08088.8  2013-04-04  11:48:08am -121.799515   
2        Yes     8088.8     8142.8  2013-04-04  01:34:51pm -121.689085   
3        Yes    08144.7    08219.1  2013-04-04  01:48:45pm -121.698867   
4        Yes    08217.9    08227.8  2013-04-04  04:23:58pm -121.740307   
5        Yes     8228.0     8233.9  2013-04-04  05:21:25pm -121.767254   
6        Yes    08232.8    08237.5  2013-04-04  05:56:39pm -121.760051   

     POINT_Y           GPS_SOURCE           Date_Time  
0  38.275293  040413_ZoopChlW.shp 2013-04-04 11:23:42  
1  38.259457  040413_ZoopChlW.shp 2013-04-04 11:48:08  
2  38.244226  040413_ZoopChlW.shp 2013-04-04 13:34:51  
3  38.266168  040413_ZoopChlW.shp 2013-04-04 13:48:45  
4  38.314881  040413_ZoopChlW.shp 2013-04-04 16:23:58  
5  38.291483  040413_ZoopChlW.shp 2013-04-04 17:21:25  
6  38.307271  040413_ZoopChlW.shp 2013-04-04 17:56:39  

Attempt to join gain df to shapefile df using site name


In [11]:
# convert both site columns to UPPER
t1['Site'] = t1['Site'].str.upper()
ex_avg['Site'] = ex_avg['Site'].str.upper()
joined = pandas.merge(t1, ex_avg, how="inner", on="Site")

print(joined)


  Site       Date        Time  Sample_Num Zooplankto  Number_of_ Chlorophyl  \
0  CC1 2013-04-04  11:48:19am           0        Yes           0        Yes   

  WaterQuali Flow_Meter Flow_Mete2  ...       DO   DEP25    PAR    RPAR  \
0        Yes    08062.5    08088.8  ...   5.8785  0.3425  265.4  790.85   

   TurbSC    CHL  CHL_VOLTS          Start_Time            End_Time  Gain  
0  25.415  4.804   0.047311 2013-04-04 11:49:52 2013-04-04 11:50:39    g0  

[1 rows x 30 columns]

Attempt to join gain df to shapefile df using the closest time stamp to start or end time


In [12]:
# calculate the difference between the start time and the end time
mid_time = (ex_avg['Start_Time'] + (ex_avg['Start_Time'] - ex_avg['End_Time'])/2)[0]
print(mid_time)

# add column with difference between mid time and time associated with the sampling point dataframe
td = t1

td["TimeDelta"]=abs(td["Date_Time"]-mid_time) # absolute diff of time difference

print(td)


2013-04-04 11:49:28.500000
   Site       Date        Time  Sample_Num Zooplankto  Number_of_ Chlorophyl  \
0   BK1 2013-04-04  11:23:53am           0        Yes           0        Yes   
1   CC1 2013-04-04  11:48:19am           0        Yes           0        Yes   
2  LNCA 2013-04-04  01:35:01pm           0        Yes           1        Yes   
3   CA3 2013-04-04  01:48:55pm           0        Yes           0        Yes   
4   HS1 2013-04-04  04:24:08pm           0        Yes           0        Yes   
5   UL1 2013-04-04  05:21:35pm           0        Yes           0        Yes   
6   CA1 2013-04-04  05:56:49pm           0        Yes           2        Yes   

  WaterQuali Flow_Meter Flow_Mete2    GPS_Date    GPS_Time     POINT_X  \
0        Yes    08027.0    08061.2  2013-04-04  11:23:42am -121.796677   
1        Yes    08062.5    08088.8  2013-04-04  11:48:08am -121.799515   
2        Yes     8088.8     8142.8  2013-04-04  01:34:51pm -121.689085   
3        Yes    08144.7    08219.1  2013-04-04  01:48:45pm -121.698867   
4        Yes    08217.9    08227.8  2013-04-04  04:23:58pm -121.740307   
5        Yes     8228.0     8233.9  2013-04-04  05:21:25pm -121.767254   
6        Yes    08232.8    08237.5  2013-04-04  05:56:39pm -121.760051   

     POINT_Y           GPS_SOURCE           Date_Time       TimeDelta  
0  38.275293  040413_ZoopChlW.shp 2013-04-04 11:23:42 00:25:46.500000  
1  38.259457  040413_ZoopChlW.shp 2013-04-04 11:48:08 00:01:20.500000  
2  38.244226  040413_ZoopChlW.shp 2013-04-04 13:34:51 01:45:22.500000  
3  38.266168  040413_ZoopChlW.shp 2013-04-04 13:48:45 01:59:16.500000  
4  38.314881  040413_ZoopChlW.shp 2013-04-04 16:23:58 04:34:29.500000  
5  38.291483  040413_ZoopChlW.shp 2013-04-04 17:21:25 05:31:56.500000  
6  38.307271  040413_ZoopChlW.shp 2013-04-04 17:56:39 06:07:10.500000  

In [13]:
# join - using concat - the closest match with the water quality average df 

# reset index
td_closest = td.reset_index(drop=True)
ex_avg = ex_avg.reset_index(drop=True)

result = pandas.concat([td_closest, ex_avg ], axis=1, join='inner')
print(result) # there might be duplicate columns


   Site       Date        Time  Sample_Num Zooplankto  Number_of_ Chlorophyl  \
0   BK1 2013-04-04  11:23:53am           0        Yes           0        Yes   

  WaterQuali Flow_Meter Flow_Mete2  ...    DEP25    PAR    RPAR  TurbSC  \
0        Yes    08027.0    08061.2  ...   0.3425  265.4  790.85  25.415   

     CHL CHL_VOLTS          Start_Time            End_Time  Site  Gain  
0  4.804  0.047311 2013-04-04 11:49:52 2013-04-04 11:50:39   CC1    g0  

[1 rows x 32 columns]

In [15]:
diff = wq_gain.gain_gps_timediff(avg_1m_df, gain_df)

wq_gain.gain_gps_join_closest_timestamp(avg_1m_df, diff)



ValueErrorTraceback (most recent call last)
<ipython-input-15-060ca0e39118> in <module>()
      1 diff = wq_gain.gain_gps_timediff(avg_1m_df, gain_df)
      2 
----> 3 wq_gain.gain_gps_join_closest_timestamp(avg_1m_df, diff)

C:\Users\Andy\Documents\arcproject-wq-processing\scripts\wq_gain.py in gain_gps_join_closest_timestamp(gain_avg_df, gps_timediff)
     98 
     99 
--> 100         join = pd.concat([gps_closest_row, gain_avg_df], axis=1, join='inner', verify_integrity=True)
    101 
    102         # there might be duplicate columns

C:\Anaconda2\envs\esri104\lib\site-packages\pandas\tools\merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
    752                        keys=keys, levels=levels, names=names,
    753                        verify_integrity=verify_integrity,
--> 754                        copy=copy)
    755     return op.get_result()
    756 

C:\Anaconda2\envs\esri104\lib\site-packages\pandas\tools\merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
    884         self.copy = copy
    885 
--> 886         self.new_axes = self._get_new_axes()
    887 
    888     def get_result(self):

C:\Anaconda2\envs\esri104\lib\site-packages\pandas\tools\merge.pyc in _get_new_axes(self)
    957                 new_axes[i] = ax
    958 
--> 959         new_axes[self.axis] = self._get_concat_axis()
    960         return new_axes
    961 

C:\Anaconda2\envs\esri104\lib\site-packages\pandas\tools\merge.pyc in _get_concat_axis(self)
   1014                                                   self.levels, self.names)
   1015 
-> 1016         self._maybe_check_integrity(concat_axis)
   1017 
   1018         return concat_axis

C:\Anaconda2\envs\esri104\lib\site-packages\pandas\tools\merge.pyc in _maybe_check_integrity(self, concat_index)
   1023                 overlap = concat_index.get_duplicates()
   1024                 raise ValueError('Indexes have overlapping values: %s'
-> 1025                                 % str(overlap))
   1026 
   1027 

ValueError: Indexes have overlapping values: ['Site']

Examples - wq_gain.main()


In [19]:
e1 = wq_gain.main(wq_file, gain_shp, "CC1", "G1")
print(e1)

e2 = wq_gain.main(wq_file, gain_shp, gain="g1")
print(e2)


  Site       Date        Time  Sample_Num Zooplankto  Number_of_ Chlorophyl  \
0  CC1 2013-04-04  11:48:19am           0        Yes           0        Yes   

  WaterQuali Flow_Meter Flow_Mete2  ...    DEP25    PAR    RPAR  TurbSC  \
0        Yes    08062.5    08088.8  ...   0.3425  265.4  790.85  25.415   

     CHL CHL_VOLTS          Start_Time            End_Time  \
0  4.804  0.047311 2013-04-04 11:49:52 2013-04-04 11:50:39   

                WQ_SOURCE  Gain  
0  Arc_040413_wqp_cc1.csv    G1  

[1 rows x 31 columns]
Joining using the closest timestamp.
  Site       Date        Time  Sample_Num Zooplankto  Number_of_ Chlorophyl  \
0  cc1 2013-04-04  11:48:19am           0        Yes           0        Yes   

  WaterQuali Flow_Meter Flow_Mete2  ...    DEP25    PAR    RPAR  TurbSC  \
0        Yes    08062.5    08088.8  ...   0.3425  265.4  790.85  25.415   

     CHL CHL_VOLTS          Start_Time            End_Time  \
0  4.804  0.047311 2013-04-04 11:49:52 2013-04-04 11:50:39   

                WQ_SOURCE  Gain  
0  Arc_040413_wqp_cc1.csv    g1  

[1 rows x 32 columns]