GenFeatures Test


For the speedup

Extracting District Names into 9 Dictionaries

Because there are only nine different scene locations in the downloaded dataset.

Or if it was way more than 9, then could have sorted the file names first, then only the change of scene location, construct a new dictionary ...


In [1]:
from osgeo import ogr, osr, gdal

import fiona
from shapely.geometry import Point, shape

import numpy as np
import pandas as pd

import os
import sys
import tarfile
import timeit

In [2]:
# Change this for Win7,macOS
bases = "C:\Users\deepak\Desktop\Repo\Maps\Districts\Census\Dist.shp"
# base_ = "/Users/macbook/Documents/BTP/Satellite/Data/Maps/Districts/Census_2011"
fc = fiona.open(bases)

In [3]:
def reverse_geocode(pt):
    for feature in fc:
        if shape(feature['geometry']).contains(pt):
            return feature['properties']['DISTRICT']
    return "NRI"

In [4]:
base2 = "G:\BTP\Satellite\Data\Test"  # Win7
base = "G:\BTP\Satellite\Data\Test2"  # Win7

In [5]:
def extract(filename, force=False):
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    if os.path.isdir(os.path.join(base,root)) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s' % (root, filename))
    else:
        print('Extracting data for %s' % root)
        tar = tarfile.open(os.path.join(base,filename))
        sys.stdout.flush()
        tar.extractall(os.path.join(base,root))
        tar.close()

In [6]:
# extracting for Test2
for directory, subdirList, fileList in os.walk(base):
    for filename in fileList:
        if filename.endswith(".tar.gz"): 
            d = extract(filename)


LE07_L1GT_146039_20050702_20170115_01_T2 already present - Skipping extraction of LE07_L1GT_146039_20050702_20170115_01_T2.tar.gz
LE07_L1GT_146040_20040512_20170120_01_T2 already present - Skipping extraction of LE07_L1GT_146040_20040512_20170120_01_T2.tar.gz
LE07_L1GT_146041_20041222_20170117_01_T2 already present - Skipping extraction of LE07_L1GT_146041_20041222_20170117_01_T2.tar.gz
LE07_L1GT_147039_20050725_20170114_01_T2 already present - Skipping extraction of LE07_L1GT_147039_20050725_20170114_01_T2.tar.gz
LE07_L1GT_147040_20050506_20170116_01_T2 already present - Skipping extraction of LE07_L1GT_147040_20050506_20170116_01_T2.tar.gz
LE07_L1GT_147041_20040807_20170119_01_T2 already present - Skipping extraction of LE07_L1GT_147041_20040807_20170119_01_T2.tar.gz
LE07_L1GT_148039_20050918_20170113_01_T2 already present - Skipping extraction of LE07_L1GT_148039_20050918_20170113_01_T2.tar.gz
LE07_L1GT_148040_20040627_20170120_01_T2 already present - Skipping extraction of LE07_L1GT_148040_20040627_20170120_01_T2.tar.gz
LE07_L1GT_149039_20050128_20170117_01_T2 already present - Skipping extraction of LE07_L1GT_149039_20050128_20170117_01_T2.tar.gz

In [7]:
directories = [os.path.join(base, d) for d in sorted(os.listdir(base)) if os.path.isdir(os.path.join(base, d))]
# print directories

In [8]:
ds = gdal.Open(base2 + "\LE07_L1TP_146039_20101223_20161211_01_T1\LE07_L1TP_146039_20101223_20161211_01_T1_B1.TIF")

In [9]:
type(ds)


Out[9]:
osgeo.gdal.Dataset

Prepare one ds variable here itself, for the transformation of the coordinate system below.


In [10]:
# create the new coordinate system
wgs84_wkt = """
GEOGCS["WGS 84",
    DATUM["WGS_1984",
        SPHEROID["WGS 84",6378137,298.257223563,
            AUTHORITY["EPSG","7030"]],
        AUTHORITY["EPSG","6326"]],
    PRIMEM["Greenwich",0,
        AUTHORITY["EPSG","8901"]],
    UNIT["degree",0.01745329251994328,
        AUTHORITY["EPSG","9122"]],
    AUTHORITY["EPSG","4326"]]"""
new_cs = osr.SpatialReference()
new_cs.ImportFromWkt(wgs84_wkt)


Out[10]:
0

In [11]:
def func(dsx):
    old_cs= osr.SpatialReference()
    old_cs.ImportFromWkt(dsx.GetProjectionRef())
    trs = osr.CoordinateTransformation(old_cs,new_cs) 
    return trs

In [12]:
def pixel2coord(x, y, xoff, a, b, yoff, d, e):
    """Returns global coordinates from coordinates x,y of the pixel"""
    xp = a * x + b * y + xoff
    yp = d * x + e * y + yoff
    return(xp, yp)

In [ ]:


In [28]:
dicts = [[],[],[],[],[],[],[],[],[]]
dictr = [(0,0),(0,0),(0,0),(0,0),(0,0),(0,0),(0,0),(0,0),(0,0)]
dicti = {"146039":0, "146040":1, "146041":2, "147039":3, "147040":4, "147041":5, "148039":6, "148040":7, "149039":8}

In [29]:
k = 50

In [30]:
stx = timeit.default_timer()

for directory in directories:
    
    """ Identifying Month, Year, Spacecraft ID """
    date = directory.split('\\')[-1].split('_')[3] # Change for Win7
    satx = directory.split('\\')[-1][3]
    month = date[4:6]
    year = date[0:4]
    
    scene = directory.split('\\')[-1].split('_')[2]
    index = dicti[scene]
    
    #if index != 1: continue
    
    """ Visiting every GeoTIFF file """ 
    for _,_,files in os.walk(directory):
        for filename in files:
            
            if filename.endswith(".TIF"):
                
                if filename[-5] == '2': break
                
                print os.path.join(directory,filename)
                
                ds = gdal.Open(os.path.join(directory,filename))
                if ds == None: continue
                col, row, _ = ds.RasterXSize, ds.RasterYSize, ds.RasterCount
                xoff, a, b, yoff, d, e = ds.GetGeoTransform()
                #--------------------
                transform = func(ds)
                #--------------------
                
                dictr[index] = (col,row)
                
                """ Now go to each pixel, find its lat,lon. Hence its district, and the pixel value """
                for i in range(0,col,col/k):
                    for j in range(0,row,row/k):
                        
                        ########### fetching the lat and lon coordinates 
                        x,y = pixel2coord(i, j, xoff, a, b, yoff, d, e)
                        lonx, latx, z = transform.TransformPoint(x,y)
                        
                        
                        ########### fetching the name of district
                        
                        point = Point(lonx,latx)
                        district = reverse_geocode(point)
                        
                        dicts[index].append(district)
                        
#                         #----------------------------------------------------------
#                         if filename[-5] == '1':
#                             point = Point(lonx,latx)
#                             district = reverse_geocode(point)
#                             dicts[i][str(lonx)+str(latx)] = district
#                         else:
#                             print "-------------------- !!!!!!! -------------------"
#                             district = dictx[str(lonx)+str(latx)]
#                         #----------------------------------------------------------
                        
            
                            
elapsed = timeit.default_timer() - stx
print (elapsed)
print "Seconds"


G:\BTP\Satellite\Data\Test2\LE07_L1GT_146039_20050702_20170115_01_T2\LE07_L1GT_146039_20050702_20170115_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_146040_20040512_20170120_01_T2\LE07_L1GT_146040_20040512_20170120_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_146041_20041222_20170117_01_T2\LE07_L1GT_146041_20041222_20170117_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_147039_20050725_20170114_01_T2\LE07_L1GT_147039_20050725_20170114_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_147040_20050506_20170116_01_T2\LE07_L1GT_147040_20050506_20170116_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_147041_20040807_20170119_01_T2\LE07_L1GT_147041_20040807_20170119_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_148039_20050918_20170113_01_T2\LE07_L1GT_148039_20050918_20170113_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_148040_20040627_20170120_01_T2\LE07_L1GT_148040_20040627_20170120_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_149039_20050128_20170117_01_T2\LE07_L1GT_149039_20050128_20170117_01_T2_B1.TIF
22027.7967823
Seconds

In [31]:
print dictr


[(7981, 7271), (7741, 7001), (7761, 7021), (7811, 7051), (7801, 7071), (7821, 7051), (7871, 7111), (7861, 7131), (7941, 7181)]

In [32]:
print latx
print lonx
print district


29.3540173718
74.8772487691
Hanumangarh

In [34]:
for i in range(9):
    print len(dicts[i])


2601
2601
2601
2601
2601
2601
2601
2601
2601

In [47]:
# print dicts[0]

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [80]:
base = "G:\BTP\Satellite\Data\Bulk\Landsat"

In [69]:
# base = "G:\BTP\Satellite\Data\Test"

In [81]:
directories = [os.path.join(base, d) for d in sorted(os.listdir(base)) if os.path.isdir(os.path.join(base, d))]
# print directories

In [66]:
ricep = pd.read_csv("C:\Users\deepak\Desktop\Repo\BTP\Satellite\Ricep_large.csv")
ricep = ricep.drop(["Unnamed: 0"],axis=1)
ricep.head()


Out[66]:
State_Name ind_district Crop_Year Season Crop Area Production Value X1 X2
0 Chandigarh chandigarh 2004 kharif Rice 80 400.0 5.0 500.0 700.0
1 Chandigarh chandigarh 2005 kharif Rice 50 250.0 5.0 400.0 500.0
2 Chandigarh chandigarh 2006 kharif Rice 50 250.0 5.0 250.0 400.0
3 Chandigarh chandigarh 2007 kharif Rice 50 250.0 5.0 250.0 250.0
4 Chandigarh chandigarh 2008 kharif Rice 20 100.0 5.0 250.0 250.0

In [67]:
a = np.empty((ricep.shape[0],1))*np.NAN

In [68]:
""" 'features' contain collumn indexes for the new features """
""" 'dictn' is the dictionary mapping name of collumn index to the index number """
features = []
dictn = {}
k = 10
for i in range(1,13):
    for j in range(1,11):
        s = str(i) + "_B" + str(j) + "_"
        features.append(s+"M")
        features.append(s+"V")
        dictn[s+"M"] = k
        dictn[s+"V"] = k+1
        k = k+2

In [71]:
for i in range(1,13):
    for j in range(1,11):
        s = str(i) + "_B" + str(j) + "_"
        features.append(s+"Mn")
        features.append(s+"Vn")

In [85]:
tmp = pd.DataFrame(index=range(ricep.shape[0]),columns=features)
ricex = pd.concat([ricep,tmp], axis=1)

In [86]:
k = 50

In [87]:
stx = timeit.default_timer()

for directory in directories:
    
#     if bx: continue
#     else: bx = True
    
    """ Identifying Month, Year, Spacecraft ID """
    date = directory.split('\\')[-1].split('_')[3] # Change for Win7
    satx = directory.split('\\')[-1][3]
    month = date[4:6]
    year = date[0:4]
    
    scene = directory.split('\\')[-1].split('_')[2]
    index = dicti[scene]
    
    """ Visiting every GeoTIFF file """ 
    for _,_,files in os.walk(directory):
        for filename in files:
            
            
            if filename.endswith(".TIF"):
                
                if filename[-5] == '8': continue
                        
                ds = gdal.Open(os.path.join(directory,filename))
                if ds == None: continue
                col, row, _ = ds.RasterXSize, ds.RasterYSize, ds.RasterCount
                xoff, a, b, yoff, d, e = ds.GetGeoTransform()
                
                ind = -1
                for i in range(0,col,col/k):
                    for j in range(0,row,row/k):
                        
                        ind += 1
                        if ind>2600: break
                            
                        ########### fetching the lat and lon coordinates 
                        
                        ########### fetching the name of district
                        district = dicts[index][ind]
                        
                        if district == "NRI": continue
                        
                        ########### Locating the row in DataFrame which we want to update
                        district = district.lower()
                        district = district.strip()
                        r = ricex.index[(ricex['ind_district'] == district) & (ricex['Crop_Year'] == int(year))].tolist()
                        
                        
                        if len(r) == 1:
                            
                            ########### The pixel value for that location
                            px,py = i,j
                            pix = ds.ReadAsArray(px,py,1,1)
                            pix = int(pix[0][0])
                            
                            """ Found the row, so now .."""
                            """ Find Collumn index corresponding to Month, Band """
                            ####### Band Number ########
                            band = filename.split("\\")[-1].split("_")[7:][0].split(".")[0][1]
                            bnd = band
                            if band == '6':
                                if filename.split("\\")[-1].split("_")[7:][2][0] == '1':
                                    bnd = band
                                else:
                                    bnd = '9'
                            elif band == 'Q':
                                bnd = '10'
                                
                                
                            if month[0] == '0': 
                                month = month[1]
                                
                            sm = month + "_B" + bnd +"_M"
                            
                            cm = dictn[sm]
                            
                            r = r[0]
                            # cm is the collumn indexe for mean
                            # r[0] is the row index
                            
                            
                            ##### Checking if values are null ...
                            valm = ricex.iloc[r,cm]
                            if pd.isnull(valm): 
                                
                                ricex.iloc[r,cm] = int(pix)
                                ricex.iloc[r,cm+1] = int(pix*pix)
                                ricex.iloc[r,cm+240] = 1
                                
                                continue
                                
                                
                            ##### if the values are not null ...
                            valv = int(ricex.iloc[r,cm+1])
                            n = int(ricex.iloc[r,cm+240])
                            n = n+1
                            
                            
                            # Mean & Variance update
                            ricex.iloc[r,cm] = valm + (pix-valm)/n
                            ricex.iloc[r,cm+1] = ((n-2)/(n-1))*valv + (pix-valm)*(pix-valm)/n
                            ricex.iloc[r,cm+240] = n
                            
                            
                            
elapsed = timeit.default_timer() - stx
print (elapsed)
print "Seconds"


39356.627305
Seconds

In [89]:
ricex.to_csv("Ricex_prepared.csv")