Because of the gdal package, a separate conda environment had to be created, so install the packages for this notebook into that environment itself. From the Anaconda prompt, the names of all available envs can be checked:
$ conda info --envs
$ activate env_name
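For example, the environment can be created and activated like this (the env name and package list here are an assumption based on the imports below, not the original setup):
$ conda create -n gdal_env gdal fiona shapely numpy pandas
$ activate gdal_env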
In [1]:
from osgeo import ogr, osr, gdal
import fiona
from shapely.geometry import Point, shape
import numpy as np
import pandas as pd
import os
import sys
import tarfile
import timeit
base_ = "C:\Users\deepak\Desktop\Repo\Maps\Districts\Census_2011"
base_ = "/Users/macbook/Documents/BTP/Satellite/Data/Maps/Districts/Census_2011"
In [2]:
# Change this for Win7/macOS
bases = r"C:\Users\deepak\Desktop\Repo\Maps\Districts\Census\Dist.shp"
# base_ = "/Users/macbook/Documents/BTP/Satellite/Data/Maps/Districts/Census_2011"
fc = fiona.open(bases)
In [3]:
def reverse_geocode(pt):
    for feature in fc:
        if shape(feature['geometry']).contains(pt):
            return feature['properties']['DISTRICT']
    return "NRI"
In [4]:
# base = "/Users/macbook/Documents/BTP/Satellite/Data/Sat" # macOS
base = "G:\BTP\Satellite\Data\Test" # Win7
In [5]:
# b = True
# for directory, subdirList, fileList in os.walk(base):
#     # if b:
#     #     b = False
#     #     continue
#     # print("Directory: " + directory)
#     for filename in fileList:
#         if filename[0] != '.': print("\t" + filename)
In [6]:
def extract(filename, force=False):
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    if os.path.isdir(os.path.join(base, root)) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s' % (root, filename))
    else:
        print('Extracting data for %s' % root)
        tar = tarfile.open(os.path.join(base, filename))
        sys.stdout.flush()
        tar.extractall(os.path.join(base, root))
        tar.close()
In [7]:
# extract all the tar files (if not already extracted)
for directory, subdirList, fileList in os.walk(base):
    for filename in fileList:
        if filename.endswith(".tar.gz"):
            extract(filename)
In [8]:
directories = [os.path.join(base, d) for d in sorted(os.listdir(base)) if os.path.isdir(os.path.join(base, d))]
# print directories
In [9]:
ds = gdal.Open(os.path.join(base, "LE07_L1TP_146039_20101223_20161211_01_T1", "LE07_L1TP_146039_20101223_20161211_01_T1_B1.TIF"))
Prepare one ds variable right here, to set up the transformation of the coordinate system below.
In [10]:
# get the existing coordinate system
old_cs = osr.SpatialReference()
old_cs.ImportFromWkt(ds.GetProjectionRef())
# create the new coordinate system
wgs84_wkt = """
GEOGCS["WGS 84",
DATUM["WGS_1984",
SPHEROID["WGS 84",6378137,298.257223563,
AUTHORITY["EPSG","7030"]],
AUTHORITY["EPSG","6326"]],
PRIMEM["Greenwich",0,
AUTHORITY["EPSG","8901"]],
UNIT["degree",0.01745329251994328,
AUTHORITY["EPSG","9122"]],
AUTHORITY["EPSG","4326"]]"""
new_cs = osr.SpatialReference()
new_cs.ImportFromWkt(wgs84_wkt)
# create a transform object to convert between coordinate systems
transform = osr.CoordinateTransformation(old_cs,new_cs)
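As a quick check (a sketch only), the scene's top-left corner can be converted to WGS 84. Note that GDAL 2.x returns (lon, lat, z) for a geographic target CRS, while GDAL 3.x uses the EPSG axis order (lat first) unless OAMS_TRADITIONAL_GIS_ORDER is set on the spatial references:
gt = ds.GetGeoTransform()
lon, lat, _ = transform.TransformPoint(gt[0], gt[3])  # top-left corner of the raster
print(lon, lat)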
In [11]:
def pixel2coord(x, y, xoff, a, b, yoff, d, e):
    """Returns global coordinates from pixel coordinates (x, y)."""
    xp = a * x + b * y + xoff
    yp = d * x + e * y + yoff
    return (xp, yp)
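pixel2coord simply applies the affine geotransform that GDAL's GetGeoTransform() returns as (xoff, a, b, yoff, d, e). A worked example with made-up values for a north-up 30 m UTM scene:
demo_gt = (205185.0, 30.0, 0.0, 2950215.0, 0.0, -30.0)  # hypothetical geotransform
print(pixel2coord(0, 0, *demo_gt))      # (205185.0, 2950215.0), the top-left corner
print(pixel2coord(100, 100, *demo_gt))  # (208185.0, 2947215.0), 100 px right and down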
In [12]:
ricep = pd.read_csv(r"C:\Users\deepak\Desktop\Repo\BTP\Ricep.csv")
ricep = ricep.drop(["Unnamed: 0"],axis=1)
ricep["value"] = ricep["Production"]/ricep["Area"]
ricep.head()
Out[12]:
In [13]:
a = np.empty((ricep.shape[0],1))*np.NAN
In [14]:
""" 'features' contain collumn indexes for the new features """
""" 'dictn' is the dictionary mapping name of collumn index to the index number """
features = []
dictn = {}
k = 13
for i in range(1,13):
for j in range(1,11):
s = str(i) + "_B" + str(j) + "_"
features.append(s+"M")
features.append(s+"V")
dictn[s+"M"] = k
dictn[s+"V"] = k+1
k = k+2
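A quick look at what this generates (the column indexes follow from k starting at 13):
print(features[:4])                      # ['1_B1_M', '1_B1_V', '1_B2_M', '1_B2_V']
print(dictn['1_B1_M'], dictn['1_B1_V'])  # 13 14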
In [15]:
for i in range(1, 13):
    for j in range(1, 11):
        s = str(i) + "_B" + str(j) + "_"
        features.append(s + "Mn")
        features.append(s + "Vn")
In [16]:
len(features)
Out[16]:
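480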
In [17]:
tmp = pd.DataFrame(index=range(ricep.shape[0]),columns=features)
ricex = pd.concat([ricep,tmp], axis=1)
In [18]:
ricex.head()
Out[18]:
In [21]:
k = 10
hits = 0
In [22]:
start_time = timeit.default_timer()
for directory in directories:
    # Identify month, year, and spacecraft ID from the scene folder name
    date = directory.split('\\')[-1].split('_')[3]  # Change for Win7
    satx = directory.split('\\')[-1][3]
    month = date[4:6]
    year = date[0:4]
    # Visit every GeoTIFF file
    for _, _, files in os.walk(directory):
        for filename in files:
            # make sure we do not go into the extra folders
            if filename.endswith(".TIF"):
                # Profiling notes: check a single iteration to see which step takes the
                # longest, and improve that step. Ideally do the district search for
                # B1.TIF only, and for the other bands reuse the row indexes found in
                # B1's iteration, so the DataFrame is not searched again and again; but
                # keep track of the pixels that were not found, so they are skipped too
                # and each pixel value goes to the correct row. The TIFF file still has
                # to be traversed to read the pixel values. Steps common to all 10 TIFF
                # files of a scene: the district search from the pixel's lat/lon, the
                # row-index search using district and year, and the same n being read
                # and written for all of them. If nothing works, the number of features
                # can be reduced by visiting only the first 5 TIFF files of each scene.
                print(os.path.join(directory, filename))
                ds = gdal.Open(os.path.join(directory, filename))
                if ds is None: continue
                col, row, _ = ds.RasterXSize, ds.RasterYSize, ds.RasterCount
                xoff, a, b, yoff, d, e = ds.GetGeoTransform()
                # Go to each sampled pixel and find its lat/lon, hence its district and
                # pixel value. Find the row with the same (Year, District) in the crop
                # dataset, find the feature column using Month, Band, SATx, and update
                # that column's running mean & variance.
                for i in range(0, col, col // k):
                    for j in range(0, row, row // k):
                        # fetch the lat/lon coordinates
                        x, y = pixel2coord(i, j, xoff, a, b, yoff, d, e)
                        lonx, latx, z = transform.TransformPoint(x, y)
                        # fetch the name of the district
                        point = Point(lonx, latx)
                        district = reverse_geocode(point)
                        if district == "NRI": continue
                        # the pixel value at that location
                        px, py = i, j
                        pix = ds.ReadAsArray(px, py, 1, 1)
                        pix = pix[0][0]
                        # locate the DataFrame row we want to update
                        district = district.lower().strip()
                        r = ricex.index[(ricex['ind_district'] == district) & (ricex['Crop_Year'] == int(year))].tolist()
                        if len(r) == 1:
                            # Found the row; now find the column index for (Month, Band)
                            hits = hits + 1
                            # print("Hits: ", hits)
                            ####### Band number ########
                            band = filename.split("\\")[-1].split("_")[7:][0].split(".")[0][1]
                            bnd = band
                            if band == '6':
                                # Landsat 7's thermal band comes as B6_VCID_1 / B6_VCID_2
                                if filename.split("\\")[-1].split("_")[7:][2][0] == '1':
                                    bnd = band
                                else:
                                    bnd = '9'
                            elif band == 'Q':
                                bnd = '10'
                            sm = month + "_B" + bnd + "_M"
                            cm = dictn[sm]
                            r = r[0]
                            # cm is the column index for the mean, r is the row index
                            # If the mean cell is still null, this is the first sample
                            valm = ricex.iloc[r, cm]
                            if pd.isnull(valm):
                                ricex.iloc[r, cm] = pix
                                ricex.iloc[r, cm + 1] = pix * pix
                                ricex.iloc[r, cm + 240] = 1
                                continue
                            # otherwise update the running mean & variance
                            valv = ricex.iloc[r, cm + 1]
                            n = ricex.iloc[r, cm + 240]
                            n = n + 1
                            ricex.iloc[r, cm] = valm + (pix - valm) / n
                            ricex.iloc[r, cm + 1] = ((n - 2) / (n - 1)) * valv + (pix - valm) * (pix - valm) / n
                            ricex.iloc[r, cm + 240] = n
                        # else: print("No match for the district " + district + " for the year " + year)
elapsed = timeit.default_timer() - start_time
print(elapsed)
print("Seconds")
In [23]:
ricex.describe()
Out[23]:
In [22]:
ricex.to_csv("ricex_test1.csv")
In [23]:
fc
Out[23]:
In [6]:
fc.schema
Out[6]:
In [7]:
fc.crs
Out[7]:
In [8]:
len(fc)
Out[8]:
In [ ]:
In [24]:
import timeit
In [27]:
a = timeit.default_timer()
for i in range(1000000):
    j = 1
b = timeit.default_timer() - a
In [28]:
b
Out[28]:
In [ ]:
In [ ]:
In [10]:
for directory in directories:
    # Identify month, year, and spacecraft ID
    date = directory.split('\\')[-1].split('_')[3]  # Change for Win7
    satx = directory.split('\\')[-1][3]
    month = date[4:6]
    year = date[0:4]
    print("LANDSAT {}, MONTH: {}, YEAR: {}".format(satx, month, year))
    # Visit every GeoTIFF file
    for _, _, files in os.walk(directory):
        for filename in files:
            if filename.endswith(".TIF"):
                print(filename.split("\\")[-1].split("_")[7:])
                ds = gdal.Open(os.path.join(directory, filename))
                if ds is None: continue
                col, row, _ = ds.RasterXSize, ds.RasterYSize, ds.RasterCount
                xoff, a, b, yoff, d, e = ds.GetGeoTransform()
                print("Col: {0:6}, Row: {1:6}".format(col, row))
                # Next: go to each pixel and find its lat/lon, hence its district and
                # pixel value; find the row with the same (Year, District) in the crop
                # dataset; find the feature column using Month, Band, SATx; and for
                # that column compute the running mean & variance.
So for Landsat 7, col x row is roughly 8000 x 7000, with the exception of Band 8, which is roughly 16K x 14K.
Go to the base folder and extract every archive that is not yet extracted.
For each folder present there:
    For each TIFF file (i.e. for each band):
        Identify the following:
        - Month, Year
        - District Name
        - Cloud Cover Percentage
        - Sat 7 or 8 (maybe from the number of files in the folder!)
According to the SAT, the meaning of the bands changes (put them in the corresponding features).
Traverse every 100th pixel (for Sat 7, every Kth).
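As a sketch of the identification step (the field layout follows the Landsat Collection 1 scene IDs used above; cloud cover is not encoded in the ID itself and would have to come from the scene's MTL metadata file):
scene = "LE07_L1TP_146039_20101223_20161211_01_T1"
parts = scene.split("_")
sat = parts[0][3]                          # '7' -> Landsat 7
path, row = parts[2][:3], parts[2][3:]     # WRS path '146', row '039'
year, month = parts[3][:4], parts[3][4:6]  # acquisition year '2010', month '12'
print(sat, path, row, year, month)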
In [ ]: