In [1]:

    
from sys import path
path.append("../../")

import pandas as pd
import seaborn as sns

from matplotlib import pyplot
from utils.data_cube_utilities import dc_display_map


%matplotlib inline

Load Truth Data

Our Uruguay data comes in CSV format. Besides the `X` and `Y` columns, it contains three attributes of interest:

latitude
longitude
landcover class



In [2]:

    
# Load the ground-truth samples; the path is relative to this notebook's folder.
df = pd.read_csv(filepath_or_buffer='../data.csv')
# Preview the first five rows to confirm the expected columns are present.
df.head(n=5)









    Out[2]:







  
    
      
      X
      Y
      LandUse
      Latitude
      Longitude
    
  
  
    
      0
      618522.8899
      6218452.029
      Forestry
      -34.168155
      -55.714061
    
    
      1
      649042.3861
      6199719.402
      Forestry
      -34.333113
      -55.379803
    
    
      2
      655993.6095
      6200009.045
      Forestry
      -34.329479
      -55.304319
    
    
      3
      656440.0958
      6200236.666
      Forestry
      -34.327360
      -55.299509
    
    
      4
      656825.7859
      6199755.161
      Forestry
      -34.331642
      -55.295231

Label distribution

In this section, the data is binned by landcover class and counted. Landcover classes with few or no labels are unreliable candidates for classification, since there may not be enough variance in the training labels to guarantee that the model learns to generalize.



In [3]:

    
# Number of labelled samples per original land-use class.
df.groupby(by="LandUse").size()









    Out[3]:





LandUse
Forestry              6
Fruittrees            3
Nativeforest         13
Naturalgrassland    175
Other                 8
Prairie              59
Stubble               9
Summercrops         150
Water                 1
dtype: int64



In [4]:

    
# Bar chart of sample counts per original land-use class.
# The axes created here are handed to seaborn explicitly instead of relying on
# matplotlib's implicit "current axes".
fig, ax = pyplot.subplots(figsize=(15, 3))
sns.countplot(data=df, x="LandUse", palette="Greens_d", ax=ax);

Re-Labeling

Related classes are combined to boost the number of samples in the new classes.



In [5]:

    
# Merge sparsely populated classes into broader groups.
#
# The relabeled column is built from `df` and assigned to `df_new` in a single
# step. Calling `.update()` on `df_new['LandUse']` is a chained in-place
# mutation: with pandas Copy-on-Write enabled, the Series returned by
# `df_new['LandUse']` is a separate object, so the update would never reach
# `df_new` and the relabeling would be lost.
forest_classes = ["Forestry", "Fruittrees", "Nativeforest"]
kept_classes = ["Forest", "Prairie", "Summercrops", "Naturalgrassland"]

# Step 1: every tree-related class becomes "Forest".
landuse_relabeled = df["LandUse"].mask(df["LandUse"].isin(forest_classes), "Forest")
# Step 2: anything that is not one of the kept classes becomes "Misc".
landuse_relabeled = landuse_relabeled.where(landuse_relabeled.isin(kept_classes), "Misc")

# Work on a copy so the original frame `df` keeps its raw labels.
df_new = df.copy()
df_new["LandUse"] = landuse_relabeled



In [6]:

    
# Number of labelled samples per class after relabeling.
df_new.groupby(by="LandUse").size()









    Out[6]:





LandUse
Forest               22
Misc                 18
Naturalgrassland    175
Prairie              59
Summercrops         150
dtype: int64



In [7]:

    
# Bar chart of sample counts per class after relabeling.
# The axes created here are handed to seaborn explicitly instead of relying on
# matplotlib's implicit "current axes".
fig, ax = pyplot.subplots(figsize=(15, 5))
sns.countplot(data=df_new, x="LandUse", palette="Greens_d", ax=ax);

Visualize Label Distribution



In [8]:

    
# Show the relabeled samples on a map via the project helper.
# NOTE(review): presumably draws one pin per row, grouped by the "LandUse"
# value - confirm against utils.data_cube_utilities.dc_display_map.
dc_display_map.display_grouped_pandas_rows_as_pins(
    df_new,
    group_name="LandUse",
)









    Out[8]:

Export re-labeled data



In [9]:

    
# Path of the CSV the relabeled frame is written to (relative to this notebook's folder).
output_destination_name = "./relabeled_data.csv"



In [10]:

    
# Recap: show the first five rows of the relabeled frame before it is exported.
df_new.head(n=5)









    Out[10]:







  
    
      
      X
      Y
      LandUse
      Latitude
      Longitude
    
  
  
    
      0
      618522.8899
      6218452.029
      Forest
      -34.168155
      -55.714061
    
    
      1
      649042.3861
      6199719.402
      Forest
      -34.333113
      -55.379803
    
    
      2
      655993.6095
      6200009.045
      Forest
      -34.329479
      -55.304319
    
    
      3
      656440.0958
      6200236.666
      Forest
      -34.327360
      -55.299509
    
    
      4
      656825.7859
      6199755.161
      Forest
      -34.331642
      -55.295231



In [11]:

    
# Write the relabeled samples to disk.
# The row index is written too (pandas default), as an unnamed first column.
df_new.to_csv(path_or_buf=output_destination_name)



In [12]:

    
# List the working directory (presumably to confirm relabeled_data.csv was written).
!ls









    



'1. Data Exploration.ipynb'		    __init__.py
'2. Feature Engineering.ipynb'		    classifiers
'3. Model Building and Evaluation.ipynb'    engineered_features.csv
'4. Display and Package Classifier.ipynb'   engineered_features_original.csv
'Old Notebooks'				    feature_engineering.PNG
 README.md				    relabeled_data.csv

	X	Y	LandUse	Latitude	Longitude
0	618522.8899	6218452.029	Forestry	-34.168155	-55.714061
1	649042.3861	6199719.402	Forestry	-34.333113	-55.379803
2	655993.6095	6200009.045	Forestry	-34.329479	-55.304319
3	656440.0958	6200236.666	Forestry	-34.327360	-55.299509
4	656825.7859	6199755.161	Forestry	-34.331642	-55.295231

	X	Y	LandUse	Latitude	Longitude
0	618522.8899	6218452.029	Forest	-34.168155	-55.714061
1	649042.3861	6199719.402	Forest	-34.333113	-55.379803
2	655993.6095	6200009.045	Forest	-34.329479	-55.304319
3	656440.0958	6200236.666	Forest	-34.327360	-55.299509
4	656825.7859	6199755.161	Forest	-34.331642	-55.295231