A (climate) data manager example notebook

  • connect to (management) databases
  • work with files on the file system
  • interact with databases
  • look into and check climate data
  • change metadata and associate data with identifiers
  • ...

Task 0: keep track of data ingest; calculate and verify checksums


In [3]:
%load_ext sql


/home/stephan/anaconda/envs/scientific1/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.
  "You should import from traitlets.config instead.", ShimWarning)
/home/stephan/anaconda/envs/scientific1/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.
  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")

In [4]:
# Connect the sql extension to the SQLite database file ingest.db
# (a relative path, resolved against the current working directory).
%sql sqlite:///ingest.db


Out[4]:
u'Connected: None@ingest.db'

In [17]:
%%bash
# Print the current working directory, then list the contents of the
# data directory (including hidden entries) in long format.
pwd
ls -al ~stephan/Repos/Data


/home/stephan/Repos/test
insgesamt 2893888
drwxrwxrwx 2 stephan stephan      4096 Sep  2 11:40 .
drwxr-xr-x 8 stephan stephan      4096 Sep  2 09:32 ..
-rwxr-x--- 1 stephan stephan 269357180 Sep  2 09:32 pr_day_MPI-ESM-LR_1pctCO2_r1i1p1_18500101-18591231.nc
-rwxr-x--- 1 stephan stephan 269430932 Sep  2 09:32 pr_day_MPI-ESM-LR_1pctCO2_r1i1p1_18600101-18691231.nc
-rwxr-x--- 1 stephan stephan 269357180 Sep  2 09:32 pr_day_MPI-ESM-LR_1pctCO2_r1i1p1_18700101-18791231.nc
-rwxr-x--- 1 stephan stephan 269430672 Sep  2 09:32 psl_day_MPI-ESM-LR_1pctCO2_r1i1p1_18600101-18691231.nc
-rwxr-x--- 1 stephan stephan 269356920 Sep  2 09:32 psl_day_MPI-ESM-LR_1pctCO2_r1i1p1_18700101-18791231.nc
-rwxr-x--- 1 stephan stephan 269430672 Sep  2 09:33 psl_day_MPI-ESM-LR_1pctCO2_r1i1p1_19800101-19891231.nc
-rwxr-x--- 1 stephan stephan 269356920 Sep  2 09:33 psl_day_MPI-ESM-LR_1pctCO2_r1i1p1_19900101-19991231.nc
-rwxr-x--- 1 stephan stephan 269357248 Sep  2 09:33 tasmin_day_MPI-ESM-LR_1pctCO2_r1i1p1_19700101-19791231.nc
-rwxr-x--- 1 stephan stephan 269431000 Sep  2 09:33 tasmin_day_MPI-ESM-LR_1pctCO2_r1i1p1_19800101-19891231.nc
-rwxr-x--- 1 stephan stephan 269357248 Sep  2 09:33 tasmin_day_MPI-ESM-LR_1pctCO2_r1i1p1_19900101-19991231.nc
-rwxr-x--- 1 stephan stephan 269430932 Sep  2 11:39 _tmp_file_999

In [12]:
%time chksum = generate_file_chksum('/home/stephan/Repos/Data/','_tmp_file_999',"md5")
print chksum


CPU times: user 458 ms, sys: 53.4 ms, total: 511 ms
Wall time: 522 ms
a9dceb7c95dc715cec503ae58b184298

In [16]:
result = %sql select * from ingest_files where checksum='md5:a9dceb7c95dc715cec503ae58b184298'

#print result
(aggregation_name,filename,ingest_date,checksum) = result[0]
print filename


Done.
pr_day_MPI-ESM-LR_1pctCO2_r1i1p1_18600101-18691231.nc

In [ ]:
# Time the md5sum command-line tool on the matching data file
# (presumably as a cross-check against the Python implementation).
%time !md5sum /home/stephan/Repos/Data/pr_day_MPI-ESM-LR_1pctCO2_r1i1p1_18600101-18691231.nc

In [ ]:
# Time the sha256sum command-line tool on the same data file.
%time !sha256sum /home/stephan/Repos/Data/pr_day_MPI-ESM-LR_1pctCO2_r1i1p1_18600101-18691231.nc

In [ ]:
%time chksum = generate_file_chksum('/home/stephan/Repos/Data/','pr_day_MPI-ESM-LR_1pctCO2_r1i1p1_18600101-18691231.nc',"sha256")
print chksum

In [ ]:
# docu: https://code.zmaw.de/projects/cdo/embedded/index.html
# Import only the Cdo wrapper class; a wildcard import would pull every
# public name of the cdo module into the notebook namespace.
from cdo import Cdo
# One Cdo instance is shared by all CDO operator calls in this notebook.
cdo = Cdo()

In [ ]:


In [ ]:
# Two consecutive ten-year pr_day files (1850-1859 and 1860-1869) used as
# inputs for the CDO operator calls.
# NOTE(review): these paths point to /home/stephan/Data/, while the checksum
# cells read the same file names from /home/stephan/Repos/Data/ -- confirm
# which directory is intended.
input1="/home/stephan/Data/pr_day_MPI-ESM-LR_1pctCO2_r1i1p1_18500101-18591231.nc"
input2="/home/stephan/Data/pr_day_MPI-ESM-LR_1pctCO2_r1i1p1_18600101-18691231.nc"
# CDO operator `showname`: show the variable names in the first file.
cdo.showname(input=input1)

In [ ]:
# CDO operator `griddes`: print the grid description of the first file.
cdo.griddes(input=input1)

In [ ]:
# CDO operator `pardes`: print the parameter description of the first file.
cdo.pardes(input=input1)

In [ ]:
# CDO operator `showstdname`: show the standard names of the variables.
cdo.showstdname(input=input1)

In [ ]:
# CDO operator `showname`: show the variable names (same call as in the
# cell that defines input1 and input2).
cdo.showname(input=input1)

In [ ]:
# CDO operator `sinfo`: print a short summary of the dataset structure.
cdo.sinfo(input=input1)

In [ ]:
# CDO operator `infon`: print per-field information, listed by parameter name.
cdo.infon(input=input1)

In [ ]:
# CDO operator `diff`: compare the two input files field by field.
cdo.diff(input=(input1,input2))

In [ ]:
# CDO operator `eca_cdd` (consecutive dry days index): the result is written
# to the drydays.nc file given as `output`.
cdo.eca_cdd(input=input1,output="/home/stephan/Data/drydays.nc")

In [ ]:


In [11]:
import hashlib, os
def generate_file_chksum(rootdir, filename, cksumtype, blocksize=2**20):
    """Return the hexadecimal checksum of a file, read block by block.

    Parameters
    ----------
    rootdir : str
        Directory that contains the file.
    filename : str
        Name of the file; joined onto ``rootdir`` with ``os.path.join``.
    cksumtype : str
        Name of the hash algorithm, e.g. ``"md5"`` or ``"sha256"``. Any
        fixed-length digest algorithm accepted by ``hashlib.new`` works.
    blocksize : int, optional
        Number of bytes read per iteration (default ``2**20``), so the
        whole file never has to be held in memory at once.

    Returns
    -------
    str
        The digest of the file content as a hexadecimal string.

    Raises
    ------
    ValueError
        If ``cksumtype`` does not name a hash algorithm known to hashlib.
    IOError, OSError
        If the file cannot be opened or read.
    """
    # Resolve the algorithm up front so an unknown name fails with a clear
    # error instead of an unbound local variable further down.
    try:
        digest = hashlib.new(cksumtype)
    except (TypeError, ValueError):
        raise ValueError("unsupported checksum type: %r" % (cksumtype,))

    with open(os.path.join(rootdir, filename), "rb") as f:
        # iter() with a sentinel stops at the first empty read (end of file).
        for buf in iter(lambda: f.read(blocksize), b""):
            digest.update(buf)
    return digest.hexdigest()

In [5]:
%%sql
-- Bookkeeping tables for ingested files and the aggregations they belong to.
-- IF NOT EXISTS keeps this cell re-runnable against an existing ingest.db.
CREATE TABLE IF NOT EXISTS ingest_files(aggregation_name, filename, date, checksum);
CREATE TABLE IF NOT EXISTS ingest_aggregations(origin,contact,location);


Done.
Done.
Out[5]:
[]

In [9]:
%%sql
-- Register one ingested file and the aggregation it belongs to.
-- All values use single quotes, the standard SQL string literal syntax
-- (double quotes denote identifiers in standard SQL).
-- NOTE(review) the aggregation path ends in psl while the file is a pr file, confirm the intended value.
INSERT INTO ingest_files VALUES('CMIP5/MPI-M/day/psl/','pr_day_MPI-ESM-LR_1pctCO2_r1i1p1_18600101-18691231.nc','2012-11-02','md5:a9dceb7c95dc715cec503ae58b184298');
INSERT INTO ingest_aggregations VALUES('CMIP5/MPI-M/day/psl/','ho@mpi-m.de','/gpfs750/transfer/CMIP5');


1 rows affected.
1 rows affected.
Out[9]:
[]