In [1]:
# A script to impute missing values for stand age in forest inventory plots
In [2]:
# load our libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import mean_squared_error
from math import sqrt
In [3]:
# Load the PNW FIA Database: plot-("condition")-level summary records.
# NOTE(review): absolute local path — consider a configurable DATA_DIR.
IDB_SUMMARY_CSV = "G:/projects/ForestPlanner_2015/Data/Work/IDB_SUMMARY_2015-10-23.csv"
PNW_FIADB = pd.read_csv(IDB_SUMMARY_CSV)
PNW_FIADB.head()
Out[3]:
In [4]:
# Rename FIA columns to the shorter names used throughout this notebook.
# Reassignment instead of inplace=True: identical effect here (no other
# references to the frame exist yet), but avoids the pandas inplace
# anti-pattern of hidden in-place mutation.
PNW_FIADB = PNW_FIADB.rename(columns={'AGE_Calc': 'COND_AGE', 'LATITUDE_FUZZ': 'LAT', 'LONGITUDE_FUZZ': 'LON',
                                      'Calc_SLOPE': 'SLOPE', 'Calc_ASPECT': 'ASPECT', 'ELEV_FT': 'ELEV'})
In [5]:
# set the COND_ID field (condition unique ID) as the dataframe index
PNW_FIADB.set_index("COND_ID", inplace = True)
In [6]:
# summary statistics for the numeric columns (the count row also hints at missingness)
PNW_FIADB.describe()
Out[6]:
In [7]:
# take a look at the histogram of measured stand ages (COND_AGE)
PNW_FIADB.COND_AGE.hist(bins=50)
Out[7]:
In [8]:
# Cast class/type code columns to pandas categoricals.
# NOTE(review): the FIA field names mentioned in the original notes
# (FORTYPCD, SITECLCD, STND_COND_CD_PNWRS, STND_STRUC_CD_PNWRS, PHYSCLCD)
# predate the renames above; the columns actually converted are in cat_cols.
cat_cols = ["STAND_SIZE_CLASS", "SITE_CLASS_FIA", "FOR_TYPE", "FOR_TYPE_SECDRY"]
unordered_cols = {"FOR_TYPE", "FOR_TYPE_SECDRY"}
for col in cat_cols:
    as_category = PNW_FIADB[col].astype('category')
    # forest-type codes are nominal; the size/site classes get an ordering
    PNW_FIADB[col] = as_category if col in unordered_cols else as_category.cat.as_ordered()
In [9]:
# Tally missing values per column, and collect the names of the
# non-age columns that contain any nulls.
print("Out of " + str(len(PNW_FIADB)) + " plots:")
hasNaNs = []
for col in PNW_FIADB.columns.values:
    null_count = PNW_FIADB[col].isnull().sum()
    print(col + ": " + str(null_count) + " nulls")
    if col != "COND_AGE" and null_count > 0:
        hasNaNs.append(col)
print(hasNaNs)
In [10]:
# For each column with nulls, count how many of those plots ALSO lack an age
for col in hasNaNs:
    n_both_missing = PNW_FIADB.COND_AGE.loc[PNW_FIADB[col].isnull()].isnull().sum()
    print(col, str(n_both_missing))
In [11]:
# key predictor columns (from the column names: stand size class, quadratic
# mean diameter, basal area, above-ground biomass, gross volume) — plots
# missing any of these are excluded below rather than imputed
dropNaNcols = ["STAND_SIZE_CLASS", "QMD_TOT_CM", "SumOfBA_FT2_AC", "SumOfBIOM_TR_ABV_GRND_TON", "SumOfVOL_AC_GRS_FT3"]
In [12]:
# We could try to fill in missing values for some predictor variables, but
# instead we drop the plots where any key predictor IS missing and let
# random forest tell us whether those variables matter.
# (Non-predictor columns are dropped later, just before fitting.)
noNaNs = PNW_FIADB.dropna(subset=dropNaNcols)
In [13]:
# Number of plots with complete predictor variables
n_complete = len(noNaNs)
print(str(n_complete) + " plots with complete predictor variables")
In [14]:
# Training candidates: every plot with complete predictors AND a measured age
train = noNaNs.dropna(subset=["COND_AGE"])
In [15]:
# set parameters for random forest regression:
# 100 trees, out-of-bag scoring (used as a pseudo R-squared below),
# and a fixed random_state so the run is reproducible
randomforest = RandomForestRegressor(n_estimators = 100, oob_score=True, random_state = 54321)
In [16]:
# Train random forest on the plots inside the age range we care about,
# then (later) use that model to predict ages for every plot with a known
# age — including those above the training cutoff.
# "train" holds all conditions with measured ages and all predictor variables.
age_thresh = 150
training_set = train.loc[train.COND_AGE < age_thresh]
# Columns that are the response, identifiers, or weak predictors
droplabels = ["SLOPE", "ASPECT", "ELEV", "FOR_TYPE_SECDRY", "MAI", "SITE_CLASS_FIA",
              "COND_AGE", "PLOT_ID", "FIA_FOREST_TYPE_NAME", "GLC_GROUP"]
X = training_set.drop(labels=droplabels, axis=1)
y = training_set["COND_AGE"]
# fit the forest on the subset with known ages
randomforest.fit(X, y)
# Rank predictors by random-forest feature importance.
importances = randomforest.feature_importances_
indices = np.argsort(importances)[::-1]
# Hoisted out of the loop: the original rebuilt training_set.drop(...)
# on every iteration (twice per feature); X is that exact frame, so its
# column labels are identical.
feature_names = X.columns.values
# Print the feature ranking
print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, feature_names[idx], importances[idx]))
# Measures of fit
print("-----")
print("R-squared on training set (vs OOB sample): " + str(randomforest.oob_score_))
# Random-forest age predictions for every plot with a measured age
# (train includes ages >= age_thresh, which the model was not fit on)
RF_ages = pd.Series(randomforest.predict(train.drop(labels=droplabels, axis=1)), name="RF_AGE")
# side-by-side measured vs. RF-predicted ages
RF_preds = pd.concat([train.COND_AGE.reset_index(), RF_ages], axis=1)
# Calculate RMSE for various age ranges and print it
# this is, on average, how close we are to the actual age (in years)
def RMSE(measured, predicted):
    """Return the root-mean-square error between two equal-length sequences.

    Computed directly with numpy (already imported) instead of calling
    sklearn.metrics.mean_squared_error — identical value, no extra dependency
    inside the function.
    """
    errors = np.asarray(measured, dtype=float) - np.asarray(predicted, dtype=float)
    return sqrt(np.mean(errors ** 2))
# RMSE overall and by age range: average distance from the true age (years)
print("-----")
in_range = RF_preds.COND_AGE < age_thresh
print("RMSE for training set: " + str(RMSE(RF_preds.COND_AGE.loc[in_range], RF_preds.RF_AGE.loc[in_range])))
print("Overall RMSE " + str(RMSE(RF_preds.COND_AGE, RF_preds.RF_AGE)))
AgeRanges = [[0,10],[10,25],[25,50],[50,75],[75,100],[100,150],[150,200],[200,300]]
for lo, hi in AgeRanges:
    # compute the selection mask once per range instead of four times
    mask = (RF_preds.COND_AGE >= lo) & (RF_preds.COND_AGE < hi)
    print("Age Range -> " + str([lo, hi]))  # dropped stray Python-2-style trailing comma
    print("RMSE " + str(RMSE(RF_preds.COND_AGE.loc[mask], RF_preds.RF_AGE.loc[mask])))
In [17]:
# Measured vs. RF-predicted ages for plots inside the training age range
x = RF_preds.COND_AGE.loc[RF_preds.COND_AGE < age_thresh]
y = RF_preds.RF_AGE.loc[RF_preds.COND_AGE < age_thresh]
# least-squares best-fit line through the scatter
m, b = np.polyfit(x, y, 1)
plt.figure()
plt.grid(True)
plt.scatter(x, y, alpha=0.03, facecolors='none', edgecolors='r')
plt.plot(x, m * x + b, 'r--', alpha=0.65)  # best-fit line
plt.plot([0, 350], [0, 350], 'k--')        # 1:1 reference line
plt.title("How well does Random Forest predict stand ages?")
plt.xlabel("Measured Stand Age [yrs]")
plt.ylabel("Predicted Stand Age [yrs]")
plt.xlim(0, age_thresh)
plt.ylim(0, age_thresh)
plt.show()
In [18]:
# Predict ages for conditions without measured ages
# (rows with a null COND_AGE but all predictor variables present)
noAges = noNaNs.loc[noNaNs.COND_AGE.isnull()]  # was: COND_AGE != COND_AGE (obscure NaN self-inequality trick)
# RF predictions for those conditions, truncated to whole years
RFpred_noAges = pd.Series(randomforest.predict(noAges.drop(labels=droplabels, axis=1))).astype('int')
RFpred_noAges.name = "RF_Age"
# attach the predictions and restore the COND_ID index
RFages_imputed = pd.concat([noAges.reset_index(), RFpred_noAges], axis = 1).set_index("COND_ID")
In [19]:
# Fill missing COND_AGE values with the random-forest predictions
print(str(len(PNW_FIADB)) + " conditions total")
print(str(PNW_FIADB.COND_AGE.isnull().sum()) + " ages missing before impute")
# fillna aligns RF_Age on the shared COND_ID index and touches only null rows.
# The original chained assignment (PNW_FIADB.COND_AGE.loc[mask] = ...) raises
# SettingWithCopyWarning and is not guaranteed to write back to the frame
# under modern pandas (Copy-on-Write).
PNW_FIADB["COND_AGE"] = PNW_FIADB["COND_AGE"].fillna(RFages_imputed["RF_Age"])
print(str(PNW_FIADB.COND_AGE.isnull().sum()) + " ages missing after impute")
print(str(len(PNW_FIADB)-PNW_FIADB.COND_AGE.isnull().sum()) + " conditions with ages available")
In [20]:
# Export condition unique ID, LAT, LON, slope, aspect, elevation, and
# (measured or imputed) stand age to CSV, in the idb_summary format used
# by Forest Planner.
summary_cols = ["LAT", "LON", "ELEV", "ASPECT", "SLOPE", "COND_AGE"]
PNW_FIADB2011_cond_summary = PNW_FIADB[summary_cols].dropna(subset=["COND_AGE"])
# ages were floats while NaNs were present; cast to int now that rows with nulls are gone
PNW_FIADB2011_cond_summary[["COND_AGE"]] = PNW_FIADB2011_cond_summary[["COND_AGE"]].astype(int)
PNW_FIADB2011_cond_summary.to_csv("G:/projects/ForestPlanner_2015/Data/Processed/IDB2pt0_COND_SUMMARY_ages-imputed_2015-11-02.csv", header=True, index=True)