Created on Fri May 12 14:09:25 2017

Demonstrate how to loop (sequentially) through the Argo mono-profile files listed in the detailed index CSV file.

Traverse the index file by file:

for index, row in ai.iterrows():
    print(row['file'])

Traverse the index in chunks of N files:

N = 10
for k,rows in ai.groupby(np.arange(len(ai))//N):
    print(rows['file'])

Traverse the index in N chunks of files:

N = 3
for rows in np.array_split(ai,N):
    print(rows['file'])


ARGO DETAILED INDEX HEADER AND FIRST LINES:

# Title : Profile directory file of the Argo Global Data Assembly Center
# Description : The directory file describes all individual profile files of the argo GDAC ftp site.
# Project : ARGO
# Format version : 2.1
# Date of update : 20160206015702
# FTP root number 1 : ftp://ftp.ifremer.fr/ifremer/argo/dac
# FTP root number 2 : ftp://usgodae.usgodae.org/pub/outgoing/argo/dac
# GDAC node : CORIOLIS
file,date,latitude,longitude,ocean,profiler_type,institution,date_update,profile_temp_qc,profile_psal_qc,profile_doxy_qc,ad_psal_adjustment_mean,ad_psal_adjustment_deviation,gdac_date_creation,gdac_date_update,n_levels
aoml/13857/profiles/R13857_001.nc,19970729200300,0.267,-16.032,A,845,AO,20080918131927,A,,,,,20080919024522,20080919024522,112
aoml/13857/profiles/R13857_002.nc,19970809192112,0.072,-17.659,A,845,AO,20080918131929,A,,,,,20080919024518,20080919024518,112
aoml/13857/profiles/R13857_003.nc,19970820184545,0.543,-19.622,A,845,AO,20080918131931,A,,,,,20080919024518,20080919024518,111

@author: gmaze """

Imports and definitions


In [2]:
import os
import pandas as pd
import numpy as np
from netCDF4 import Dataset, num2date
import multiprocessing
# Number of CPUs reported by the system. Presumably intended for processing
# the index chunks in parallel; it is not referenced by the cells visible here.
num_processes = multiprocessing.cpu_count()

# In[]:
def read_argoindex(index_file):
	"""
		Read the Argo detailed index text file and return it as a pandas DataFrame.

		Parameters
		----------
		index_file : str or file-like
			Anything accepted by ``pandas.read_csv``. The first 8 lines are
			skipped and the following line is used as the column header.

		Returns
		-------
		pandas.DataFrame
			One row per entry of the index, with a default integer index.
			Columns 1, 7, 13 and 14 (``date``, ``date_update``,
			``gdac_date_creation``, ``gdac_date_update`` in the sample header)
			are passed to ``parse_dates``.
	"""
	# Builtin ``str`` / ``int`` are used instead of ``np.str`` / ``np.int``:
	# those aliases were deprecated in NumPy 1.20 and removed in NumPy 1.24,
	# and they were plain aliases of the builtins, so behaviour is unchanged.
	column_types = {
		'latitude': np.float32,
		'longitude': np.float32,
		'profiler_type': str,
		'profile_temp_qc': str,
		'profile_psal_qc': str,
		'profile_doxy_qc': str,
		'ad_psal_adjustment_mean': np.float32,
		'ad_psal_adjustment_deviation': np.float32,
		'n_levels': int,
	}
	return pd.read_csv(index_file,
	                   sep=',', index_col=None, header=0, skiprows=8,
	                   parse_dates=[1, 7, 13, 14],
	                   dtype=column_types)

def load_argoindex(droot,ifile):
	"""
		Load the Argo detailed index, caching it locally as an HDF5 file.

		On the first call the text index ``droot/ifile`` is parsed with
		``read_argoindex`` and saved under the key ``'index'`` in an HDF5
		store; later calls read that store instead of re-parsing the text.

		Parameters
		----------
		droot : str
			Directory holding the index file (``~`` is expanded).
		ifile : str
			Name of the index file inside ``droot``.

		Returns
		-------
		pandas.DataFrame
			The index, one row per Argo profile file.
	"""
	index = os.path.expanduser(os.path.join(droot, ifile))
	# The store name is derived from ``ifile`` only (``droot`` is not
	# prepended), so it resolves relative to the current working directory.
	# NOTE(review): an existing store is always reused; it is not compared
	# against the text index, so delete it by hand if the index changes.
	store = os.path.splitext(ifile)[0] + '.h5'
	if os.path.isfile(store):
		ai = pd.read_hdf(store, key='index')
		source = store
	else:
		ai = read_argoindex(index)
		ai.to_hdf(store, key='index')
		source = index
	# Format inside the call: ``print(...) % x`` only works as a Python 2
	# print statement and raises TypeError under Python 3.
	print("Using Argo index file:\n\t%s" % source)
	print(">> Loaded index for %i Argo files" % ai.shape[0])
	return ai

Load the Argo index as a DataFrame


In [12]:
# Location of the full Argo dataset and the name of its detailed index file
# (only used by the commented-out call at the end of this cell):
droot = '~/data/ARGO/DOI/10.12770/9d8ac2dc-6f4d-4379-8df2-714cab4a9ae7'
ifile = 'argo_profile_detailled_index.txt'

# Load the Argo index (here the small sample file in the current directory):
ai = load_argoindex('.',"argo_index_sample.txt") # Sample index with 20 rows
#ai = load_argoindex(droot,ifile) # Real index with about 1.5 million rows


Using Argo index file:
	argo_index_sample.h5
>> Loaded index for 20 Argo files

Option 1: Traverse the list of files 1 by 1


In [15]:
# Visit the index one row (i.e. one profile file) at a time.
for k, row in ai.iterrows():
	# A single pre-formatted string prints "<k> <file>" identically under
	# Python 2 and Python 3 (the print statement is a SyntaxError on Python 3).
	print("%s %s" % (k, row['file']))


0 aoml/13857/profiles/R13857_001.nc
1 aoml/13857/profiles/R13857_002.nc
2 aoml/13857/profiles/R13857_003.nc
3 aoml/13857/profiles/R13857_004.nc
4 aoml/13857/profiles/R13857_005.nc
5 aoml/13857/profiles/R13857_006.nc
6 aoml/13857/profiles/R13857_007.nc
7 aoml/13857/profiles/R13857_008.nc
8 aoml/13857/profiles/R13857_009.nc
9 aoml/13857/profiles/R13857_010.nc
10 aoml/13857/profiles/R13857_011.nc
11 aoml/13857/profiles/R13857_012.nc
12 aoml/13857/profiles/R13857_013.nc
13 aoml/13857/profiles/R13857_014.nc
14 aoml/13857/profiles/R13857_015.nc
15 aoml/13857/profiles/R13857_016.nc
16 aoml/13857/profiles/R13857_017.nc
17 aoml/13857/profiles/R13857_018.nc
18 aoml/13857/profiles/R13857_019.nc
19 aoml/13857/profiles/R13857_020.nc

Option 2: Traverse the index in chunks of N files


In [18]:
N = 3  # number of files per chunk
# Integer-dividing the row positions by N gives every run of N consecutive
# rows the same group key, so each group is one chunk (the last may be shorter).
for k, rows in ai.groupby(np.arange(len(ai)) // N):
    # print() with a single argument behaves the same on Python 2 and 3.
    print(k)
    print(rows['file'])


0
0    aoml/13857/profiles/R13857_001.nc
1    aoml/13857/profiles/R13857_002.nc
2    aoml/13857/profiles/R13857_003.nc
Name: file, dtype: object
1
3    aoml/13857/profiles/R13857_004.nc
4    aoml/13857/profiles/R13857_005.nc
5    aoml/13857/profiles/R13857_006.nc
Name: file, dtype: object
2
6    aoml/13857/profiles/R13857_007.nc
7    aoml/13857/profiles/R13857_008.nc
8    aoml/13857/profiles/R13857_009.nc
Name: file, dtype: object
3
9     aoml/13857/profiles/R13857_010.nc
10    aoml/13857/profiles/R13857_011.nc
11    aoml/13857/profiles/R13857_012.nc
Name: file, dtype: object
4
12    aoml/13857/profiles/R13857_013.nc
13    aoml/13857/profiles/R13857_014.nc
14    aoml/13857/profiles/R13857_015.nc
Name: file, dtype: object
5
15    aoml/13857/profiles/R13857_016.nc
16    aoml/13857/profiles/R13857_017.nc
17    aoml/13857/profiles/R13857_018.nc
Name: file, dtype: object
6
18    aoml/13857/profiles/R13857_019.nc
19    aoml/13857/profiles/R13857_020.nc
Name: file, dtype: object

Option 3: Traverse the index in N chunks of files


In [20]:
N = 3  # number of chunks
# Split the row positions rather than the DataFrame itself: calling
# np.array_split directly on a DataFrame goes through DataFrame.swapaxes,
# which recent pandas versions deprecate. The chunk sizes are the same.
for positions in np.array_split(np.arange(len(ai)), N):
    rows = ai.iloc[positions]
    # print() with a single argument behaves the same on Python 2 and 3.
    print(rows['file'])


0    aoml/13857/profiles/R13857_001.nc
1    aoml/13857/profiles/R13857_002.nc
2    aoml/13857/profiles/R13857_003.nc
3    aoml/13857/profiles/R13857_004.nc
4    aoml/13857/profiles/R13857_005.nc
5    aoml/13857/profiles/R13857_006.nc
6    aoml/13857/profiles/R13857_007.nc
Name: file, dtype: object
7     aoml/13857/profiles/R13857_008.nc
8     aoml/13857/profiles/R13857_009.nc
9     aoml/13857/profiles/R13857_010.nc
10    aoml/13857/profiles/R13857_011.nc
11    aoml/13857/profiles/R13857_012.nc
12    aoml/13857/profiles/R13857_013.nc
13    aoml/13857/profiles/R13857_014.nc
Name: file, dtype: object
14    aoml/13857/profiles/R13857_015.nc
15    aoml/13857/profiles/R13857_016.nc
16    aoml/13857/profiles/R13857_017.nc
17    aoml/13857/profiles/R13857_018.nc
18    aoml/13857/profiles/R13857_019.nc
19    aoml/13857/profiles/R13857_020.nc
Name: file, dtype: object

In [ ]: