In [1]:
# Import modules
import numpy as np
import gdal
import matplotlib.pyplot as plt
%matplotlib inline
import warnings 
warnings.filterwarnings('ignore')
import os, sys
import gc
gc.collect()


Out[1]:
121

In [2]:
def raster2array(geotif_file):
    
    metadata = {}
    dataset = gdal.Open(geotif_file)
    metadata['array_rows'] = dataset.RasterYSize
    metadata['array_cols'] = dataset.RasterXSize
    metadata['bands'] = dataset.RasterCount
    metadata['driver'] = dataset.GetDriver().LongName
    metadata['projection'] = dataset.GetProjection()
    metadata['geotransform'] = dataset.GetGeoTransform()
    
    mapinfo = dataset.GetGeoTransform()
    metadata['pixelWidth'] = mapinfo[1]
    metadata['pixelHeight'] = mapinfo[5]

    metadata['ext_dict'] = {}
    metadata['ext_dict']['xMin'] = mapinfo[0]
    metadata['ext_dict']['xMax'] = mapinfo[0] + dataset.RasterXSize/mapinfo[1]
    metadata['ext_dict']['yMin'] = mapinfo[3] + dataset.RasterYSize/mapinfo[5]
    metadata['ext_dict']['yMax'] = mapinfo[3]
    
    metadata['extent'] = (metadata['ext_dict']['xMin'],metadata['ext_dict']['xMax'],
                          metadata['ext_dict']['yMin'],metadata['ext_dict']['yMax'])
    
    if metadata['bands'] == 1:
        raster = dataset.GetRasterBand(1)
        metadata['noDataValue'] = raster.GetNoDataValue()
        metadata['scaleFactor'] = raster.GetScale()
        
        # band statistics
        metadata['bandstats'] = {} #make a nested dictionary to store band stats in same 
        stats = raster.GetStatistics(True,True)
        metadata['bandstats']['min'] = round(stats[0],2)
        metadata['bandstats']['max'] = round(stats[1],2)
        metadata['bandstats']['mean'] = round(stats[2],2)
        metadata['bandstats']['stdev'] = round(stats[3],2)
        
        array = dataset.GetRasterBand(1).ReadAsArray(0,0,metadata['array_cols'],metadata['array_rows']).astype(np.float)
        array[array==metadata['noDataValue']]=np.nan
        array = array/metadata['scaleFactor']
        array = array[::-1] #inverse array because Python is column major
        return array, metadata
    
    elif metadata['bands'] > 1:
        print('More than one band ... need to modify function for case of multiple bands')
        
def array2raster(newRasterfn,rasterOrigin,pixelWidth,pixelHeight,array,epsg):

    cols = array.shape[1]
    rows = array.shape[0]
    originX = rasterOrigin[0]
    originY = rasterOrigin[1]

    driver = gdal.GetDriverByName('GTiff')
    outRaster = driver.Create(newRasterfn, cols, rows, 1, gdal.GDT_Byte)
    outRaster.SetGeoTransform((originX, pixelWidth, 0, originY, 0, pixelHeight))
    outband = outRaster.GetRasterBand(1)
    outband.WriteArray(array)
    outRasterSRS = osr.SpatialReference()
    outRasterSRS.ImportFromEPSG(epsg)
    outRaster.SetProjection(outRasterSRS.ExportToWkt())
    outband.FlushCache()

In [51]:
# Read in files as arrays
lidar_f = '/media/annie/AOP/DI_data/CHM.tif'
hs_f = '/media/annie/AOP/DI_data/NDVI.tif'

lidar_array, lidar_md = raster2array(lidar_f)
hs_array, hs_md = raster2array(hs_f)

In [52]:
# Flatten arrays for cdf function
lidar_array = lidar_array.flatten()
hs_array = hs_array.flatten()

In [53]:
lidar_samp = np.random.choice(lidar_array,500,replace=False)
hs_samp = np.random.choice(hs_array,500,replace=False)

In [54]:
# Calculate cumulative distribution
def cdf(array,out_name):
    # method 1
    test = array[~np.isnan(array)]

    H,X1 = np.histogram(test, bins = 100, normed=True)
    N = test.shape[0]
    dx = X1[1] - X1[0]
    F1 = np.cumsum(H)*dx
    #method 2
    X2 = np.sort(test) #variable sorted from lowest to highest
    F2 = np.array(range(N))/float(N) #cumulative percent of variable
    
    # output
    output_array = np.vstack((X2,F2))
    np.save(out_name,output_array)
        
    return(X1,F1,X2,F2)

In [63]:
# Run function and plot
X1,F1,X2,F2 = cdf(hs_samp,'testing_cdf')
X1b,F1b,X2b,F2b = cdf(lidar_samp,'testing_cdf')
X1n = (X1[1:]- np.min(X1))/(np.max(X1) - np.min(X1))
X1bn = (X1b[1:]- np.min(X1b))/(np.max(X1b) - np.min(X1b))
plt.plot(F1,X1n)
plt.plot(F1b,X1bn)
#plt.plot(F2, X2)
plt.show()



In [ ]:
# Combine four outputs to single array, each row is a variable