For the speedup

Extracting District Names into 9 Dictionaries

Because there are only nine different scene locations in the downloaded dataset.

Or if it was way more than 9, then could have sorted the file names first, then only the change of scene location, construct a new dictionary ...



In [1]:

    
from osgeo import ogr, osr, gdal

import fiona
from shapely.geometry import Point, shape

import numpy as np
import pandas as pd

import os
import sys
import tarfile
import timeit



In [2]:

    
# Change this for Win7,macOS
bases = "C:\Users\deepak\Desktop\Repo\Maps\Districts\Census\Dist.shp"
# base_ = "/Users/macbook/Documents/BTP/Satellite/Data/Maps/Districts/Census_2011"
fc = fiona.open(bases)



In [3]:

    
def reverse_geocode(pt):
    for feature in fc:
        if shape(feature['geometry']).contains(pt):
            return feature['properties']['DISTRICT']
    return "NRI"



In [4]:

    
base2 = "G:\BTP\Satellite\Data\Test"  # Win7
base = "G:\BTP\Satellite\Data\Test2"  # Win7



In [5]:

    
def extract(filename, force=False):
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    if os.path.isdir(os.path.join(base,root)) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s' % (root, filename))
    else:
        print('Extracting data for %s' % root)
        tar = tarfile.open(os.path.join(base,filename))
        sys.stdout.flush()
        tar.extractall(os.path.join(base,root))
        tar.close()



In [6]:

    
# extracting for Test2
for directory, subdirList, fileList in os.walk(base):
    for filename in fileList:
        if filename.endswith(".tar.gz"): 
            d = extract(filename)









    



LE07_L1GT_146039_20050702_20170115_01_T2 already present - Skipping extraction of LE07_L1GT_146039_20050702_20170115_01_T2.tar.gz
LE07_L1GT_146040_20040512_20170120_01_T2 already present - Skipping extraction of LE07_L1GT_146040_20040512_20170120_01_T2.tar.gz
LE07_L1GT_146041_20041222_20170117_01_T2 already present - Skipping extraction of LE07_L1GT_146041_20041222_20170117_01_T2.tar.gz
LE07_L1GT_147039_20050725_20170114_01_T2 already present - Skipping extraction of LE07_L1GT_147039_20050725_20170114_01_T2.tar.gz
LE07_L1GT_147040_20050506_20170116_01_T2 already present - Skipping extraction of LE07_L1GT_147040_20050506_20170116_01_T2.tar.gz
LE07_L1GT_147041_20040807_20170119_01_T2 already present - Skipping extraction of LE07_L1GT_147041_20040807_20170119_01_T2.tar.gz
LE07_L1GT_148039_20050918_20170113_01_T2 already present - Skipping extraction of LE07_L1GT_148039_20050918_20170113_01_T2.tar.gz
LE07_L1GT_148040_20040627_20170120_01_T2 already present - Skipping extraction of LE07_L1GT_148040_20040627_20170120_01_T2.tar.gz
LE07_L1GT_149039_20050128_20170117_01_T2 already present - Skipping extraction of LE07_L1GT_149039_20050128_20170117_01_T2.tar.gz



In [7]:

    
directories = [os.path.join(base, d) for d in sorted(os.listdir(base)) if os.path.isdir(os.path.join(base, d))]
# print directories



In [8]:

    
ds = gdal.Open(base2 + "\LE07_L1TP_146039_20101223_20161211_01_T1\LE07_L1TP_146039_20101223_20161211_01_T1_B1.TIF")



In [9]:

    
type(ds)









    Out[9]:





osgeo.gdal.Dataset

Prepare one ds variable here itself, for the transformation of the coordinate system below.



In [10]:

    
# create the new coordinate system
wgs84_wkt = """
GEOGCS["WGS 84",
    DATUM["WGS_1984",
        SPHEROID["WGS 84",6378137,298.257223563,
            AUTHORITY["EPSG","7030"]],
        AUTHORITY["EPSG","6326"]],
    PRIMEM["Greenwich",0,
        AUTHORITY["EPSG","8901"]],
    UNIT["degree",0.01745329251994328,
        AUTHORITY["EPSG","9122"]],
    AUTHORITY["EPSG","4326"]]"""
new_cs = osr.SpatialReference()
new_cs.ImportFromWkt(wgs84_wkt)









    Out[10]:





0



In [11]:

    
def func(dsx):
    old_cs= osr.SpatialReference()
    old_cs.ImportFromWkt(dsx.GetProjectionRef())
    trs = osr.CoordinateTransformation(old_cs,new_cs) 
    return trs



In [12]:

    
def pixel2coord(x, y, xoff, a, b, yoff, d, e):
    """Returns global coordinates from coordinates x,y of the pixel"""
    xp = a * x + b * y + xoff
    yp = d * x + e * y + yoff
    return(xp, yp)



In [ ]:



In [28]:

    
dicts = [[],[],[],[],[],[],[],[],[]]
dictr = [(0,0),(0,0),(0,0),(0,0),(0,0),(0,0),(0,0),(0,0),(0,0)]
dicti = {"146039":0, "146040":1, "146041":2, "147039":3, "147040":4, "147041":5, "148039":6, "148040":7, "149039":8}



In [29]:

    
k = 50



In [30]:

    
stx = timeit.default_timer()

for directory in directories:
    
    """ Identifying Month, Year, Spacecraft ID """
    date = directory.split('\\')[-1].split('_')[3] # Change for Win7
    satx = directory.split('\\')[-1][3]
    month = date[4:6]
    year = date[0:4]
    
    scene = directory.split('\\')[-1].split('_')[2]
    index = dicti[scene]
    
    #if index != 1: continue
    
    """ Visiting every GeoTIFF file """ 
    for _,_,files in os.walk(directory):
        for filename in files:
            
            if filename.endswith(".TIF"):
                
                if filename[-5] == '2': break
                
                print os.path.join(directory,filename)
                
                ds = gdal.Open(os.path.join(directory,filename))
                if ds == None: continue
                col, row, _ = ds.RasterXSize, ds.RasterYSize, ds.RasterCount
                xoff, a, b, yoff, d, e = ds.GetGeoTransform()
                #--------------------
                transform = func(ds)
                #--------------------
                
                dictr[index] = (col,row)
                
                """ Now go to each pixel, find its lat,lon. Hence its district, and the pixel value """
                for i in range(0,col,col/k):
                    for j in range(0,row,row/k):
                        
                        ########### fetching the lat and lon coordinates 
                        x,y = pixel2coord(i, j, xoff, a, b, yoff, d, e)
                        lonx, latx, z = transform.TransformPoint(x,y)
                        
                        
                        ########### fetching the name of district
                        
                        point = Point(lonx,latx)
                        district = reverse_geocode(point)
                        
                        dicts[index].append(district)
                        
#                         #----------------------------------------------------------
#                         if filename[-5] == '1':
#                             point = Point(lonx,latx)
#                             district = reverse_geocode(point)
#                             dicts[i][str(lonx)+str(latx)] = district
#                         else:
#                             print "-------------------- !!!!!!! -------------------"
#                             district = dictx[str(lonx)+str(latx)]
#                         #----------------------------------------------------------
                        
            
                            
elapsed = timeit.default_timer() - stx
print (elapsed)
print "Seconds"









    



G:\BTP\Satellite\Data\Test2\LE07_L1GT_146039_20050702_20170115_01_T2\LE07_L1GT_146039_20050702_20170115_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_146040_20040512_20170120_01_T2\LE07_L1GT_146040_20040512_20170120_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_146041_20041222_20170117_01_T2\LE07_L1GT_146041_20041222_20170117_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_147039_20050725_20170114_01_T2\LE07_L1GT_147039_20050725_20170114_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_147040_20050506_20170116_01_T2\LE07_L1GT_147040_20050506_20170116_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_147041_20040807_20170119_01_T2\LE07_L1GT_147041_20040807_20170119_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_148039_20050918_20170113_01_T2\LE07_L1GT_148039_20050918_20170113_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_148040_20040627_20170120_01_T2\LE07_L1GT_148040_20040627_20170120_01_T2_B1.TIF
G:\BTP\Satellite\Data\Test2\LE07_L1GT_149039_20050128_20170117_01_T2\LE07_L1GT_149039_20050128_20170117_01_T2_B1.TIF
22027.7967823
Seconds



In [31]:

    
print dictr









    



[(7981, 7271), (7741, 7001), (7761, 7021), (7811, 7051), (7801, 7071), (7821, 7051), (7871, 7111), (7861, 7131), (7941, 7181)]



In [32]:

    
print latx
print lonx
print district









    



29.3540173718
74.8772487691
Hanumangarh



In [34]:

    
for i in range(9):
    print len(dicts[i])



In [47]:

    
# print dicts[0]



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [80]:

    
base = "G:\BTP\Satellite\Data\Bulk\Landsat"



In [69]:

    
# base = "G:\BTP\Satellite\Data\Test"



In [81]:

    
directories = [os.path.join(base, d) for d in sorted(os.listdir(base)) if os.path.isdir(os.path.join(base, d))]
# print directories



In [66]:

    
ricep = pd.read_csv("C:\Users\deepak\Desktop\Repo\BTP\Satellite\Ricep_large.csv")
ricep = ricep.drop(["Unnamed: 0"],axis=1)
ricep.head()









    Out[66]:







  
    
      
      State_Name
      ind_district
      Crop_Year
      Season
      Crop
      Area
      Production
      Value
      X1
      X2
    
  
  
    
      0
      Chandigarh
      chandigarh
      2004
      kharif
      Rice
      80
      400.0
      5.0
      500.0
      700.0
    
    
      1
      Chandigarh
      chandigarh
      2005
      kharif
      Rice
      50
      250.0
      5.0
      400.0
      500.0
    
    
      2
      Chandigarh
      chandigarh
      2006
      kharif
      Rice
      50
      250.0
      5.0
      250.0
      400.0
    
    
      3
      Chandigarh
      chandigarh
      2007
      kharif
      Rice
      50
      250.0
      5.0
      250.0
      250.0
    
    
      4
      Chandigarh
      chandigarh
      2008
      kharif
      Rice
      20
      100.0
      5.0
      250.0
      250.0



In [67]:

    
a = np.empty((ricep.shape[0],1))*np.NAN



In [68]:

    
""" 'features' contain collumn indexes for the new features """
""" 'dictn' is the dictionary mapping name of collumn index to the index number """
features = []
dictn = {}
k = 10
for i in range(1,13):
    for j in range(1,11):
        s = str(i) + "_B" + str(j) + "_"
        features.append(s+"M")
        features.append(s+"V")
        dictn[s+"M"] = k
        dictn[s+"V"] = k+1
        k = k+2



In [71]:

    
for i in range(1,13):
    for j in range(1,11):
        s = str(i) + "_B" + str(j) + "_"
        features.append(s+"Mn")
        features.append(s+"Vn")



In [85]:

    
tmp = pd.DataFrame(index=range(ricep.shape[0]),columns=features)
ricex = pd.concat([ricep,tmp], axis=1)



In [86]:

    
k = 50



In [87]:

    
stx = timeit.default_timer()

for directory in directories:
    
#     if bx: continue
#     else: bx = True
    
    """ Identifying Month, Year, Spacecraft ID """
    date = directory.split('\\')[-1].split('_')[3] # Change for Win7
    satx = directory.split('\\')[-1][3]
    month = date[4:6]
    year = date[0:4]
    
    scene = directory.split('\\')[-1].split('_')[2]
    index = dicti[scene]
    
    """ Visiting every GeoTIFF file """ 
    for _,_,files in os.walk(directory):
        for filename in files:
            
            
            if filename.endswith(".TIF"):
                
                if filename[-5] == '8': continue
                        
                ds = gdal.Open(os.path.join(directory,filename))
                if ds == None: continue
                col, row, _ = ds.RasterXSize, ds.RasterYSize, ds.RasterCount
                xoff, a, b, yoff, d, e = ds.GetGeoTransform()
                
                ind = -1
                for i in range(0,col,col/k):
                    for j in range(0,row,row/k):
                        
                        ind += 1
                        if ind>2600: break
                            
                        ########### fetching the lat and lon coordinates 
                        
                        ########### fetching the name of district
                        district = dicts[index][ind]
                        
                        if district == "NRI": continue
                        
                        ########### Locating the row in DataFrame which we want to update
                        district = district.lower()
                        district = district.strip()
                        r = ricex.index[(ricex['ind_district'] == district) & (ricex['Crop_Year'] == int(year))].tolist()
                        
                        
                        if len(r) == 1:
                            
                            ########### The pixel value for that location
                            px,py = i,j
                            pix = ds.ReadAsArray(px,py,1,1)
                            pix = int(pix[0][0])
                            
                            """ Found the row, so now .."""
                            """ Find Collumn index corresponding to Month, Band """
                            ####### Band Number ########
                            band = filename.split("\\")[-1].split("_")[7:][0].split(".")[0][1]
                            bnd = band
                            if band == '6':
                                if filename.split("\\")[-1].split("_")[7:][2][0] == '1':
                                    bnd = band
                                else:
                                    bnd = '9'
                            elif band == 'Q':
                                bnd = '10'
                                
                                
                            if month[0] == '0': 
                                month = month[1]
                                
                            sm = month + "_B" + bnd +"_M"
                            
                            cm = dictn[sm]
                            
                            r = r[0]
                            # cm is the collumn indexe for mean
                            # r[0] is the row index
                            
                            
                            ##### Checking if values are null ...
                            valm = ricex.iloc[r,cm]
                            if pd.isnull(valm): 
                                
                                ricex.iloc[r,cm] = int(pix)
                                ricex.iloc[r,cm+1] = int(pix*pix)
                                ricex.iloc[r,cm+240] = 1
                                
                                continue
                                
                                
                            ##### if the values are not null ...
                            valv = int(ricex.iloc[r,cm+1])
                            n = int(ricex.iloc[r,cm+240])
                            n = n+1
                            
                            
                            # Mean & Variance update
                            ricex.iloc[r,cm] = valm + (pix-valm)/n
                            ricex.iloc[r,cm+1] = ((n-2)/(n-1))*valv + (pix-valm)*(pix-valm)/n
                            ricex.iloc[r,cm+240] = n
                            
                            
                            
elapsed = timeit.default_timer() - stx
print (elapsed)
print "Seconds"









    



39356.627305
Seconds



In [89]:

    
ricex.to_csv("Ricex_prepared.csv")

	State_Name	ind_district	Crop_Year	Season	Crop	Area	Production	Value	X1	X2
0	Chandigarh	chandigarh	2004	kharif	Rice	80	400.0	5.0	500.0	700.0
1	Chandigarh	chandigarh	2005	kharif	Rice	50	250.0	5.0	400.0	500.0
2	Chandigarh	chandigarh	2006	kharif	Rice	50	250.0	5.0	250.0	400.0
3	Chandigarh	chandigarh	2007	kharif	Rice	50	250.0	5.0	250.0	250.0
4	Chandigarh	chandigarh	2008	kharif	Rice	20	100.0	5.0	250.0	250.0