In [1]:
# A script to impute missing values for for stand age in forest inventory plots
# Reads in plot-level summary data of PNWFIADB 2011 Database
# These plot-level summaries were exported as a .csv from the Access Database

In [2]:
# load our libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import mean_squared_error
from math import sqrt

In [3]:
# read in the PNW FIA Database, with plot("condition")-level summary data
PNW_FIADB = pd.read_csv("G:/projects/ForestPlanner_2015/Data/Work/PNWFIADB_COND_SUMMARY_2015-10-27.csv")
PNW_FIADB.head()


Out[3]:
COND_CN PLT_CN LAT LON COND_AGE FORTYPCD FORTYPE BALIVE ALSTK GSSTK SITECLCD SLOPE ASPECT ELEV SOIL_ROOTING_DEPTH_PNW STND_COND_CD_PNWRS STND_STRUC_CD_PNWRS PHYSCLCD SumOfVOLCFGRS
0 1150348010901 5931296010901 48.37 -122.11 60 301 Western hemlock 242.81 74.53 71.03 3 35 101 2300 2 4 1 23 5168.83
1 1150352010901 5931423010901 48.40 -121.19 310 301 Western hemlock 324.08 83.82 83.82 4 95 265 3700 1 7 3 23 23132.74
2 1150542010901 5936567010901 45.56 -116.65 85 201 Douglas-fir 108.47 38.55 38.55 5 49 288 5900 1 5 3 12 2589.52
3 1155367010901 5930505010901 38.49 -122.36 80 924 Blue oak 86.79 69.07 69.07 7 40 135 500 2 5 1 22 1535.75
4 1157874010901 5952962010901 40.71 -121.42 60 371 California mixed conifer 179.40 53.57 53.57 4 40 325 4100 1 4 3 19 565.55

In [4]:
# set thte COND_CN field as the dataframe index
PNW_FIADB.set_index("COND_CN", inplace = True)

In [5]:
PNW_FIADB.describe()


Out[5]:
PLT_CN LAT LON COND_AGE FORTYPCD BALIVE ALSTK GSSTK SITECLCD SLOPE ASPECT ELEV SOIL_ROOTING_DEPTH_PNW STND_COND_CD_PNWRS STND_STRUC_CD_PNWRS PHYSCLCD SumOfVOLCFGRS
count 1.884400e+04 18844.000000 18844.000000 18006.000000 18844.000000 18844.000000 18844.000000 18844.000000 18844.000000 18844.000000 18844.000000 18844.000000 16894.000000 15715.000000 16894.000000 18844.000000 18558.000000
mean 2.642437e+13 44.512426 -123.143173 101.070921 414.833793 140.879385 55.437669 53.445928 4.763744 31.479516 158.255678 3227.579070 1.704451 4.075469 2.114893 20.392167 6720.808023
std 1.076066e+13 5.762961 6.191382 86.511111 286.535314 269.981017 28.179725 28.786750 1.678542 24.893576 114.819885 2225.768016 0.456303 1.427482 0.977156 4.600012 12049.774469
min 5.930347e+12 32.670000 -154.000000 0.000000 122.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.000000 11.000000 0.810000
25% 2.293178e+13 40.710000 -123.380000 45.000000 221.000000 51.505000 33.660000 31.557500 3.000000 10.000000 50.000000 1300.000000 1.000000 3.000000 1.000000 19.000000 654.190000
50% 2.413435e+13 43.910000 -121.860000 80.000000 281.000000 113.380000 53.670000 52.040000 5.000000 26.000000 156.000000 3100.000000 2.000000 4.000000 2.000000 22.000000 2368.650000
75% 2.988225e+13 47.100000 -120.340000 125.000000 371.000000 198.272500 75.110000 73.610000 6.000000 50.000000 260.000000 4900.000000 2.000000 5.000000 3.000000 23.000000 7056.775000
max 6.349610e+13 61.450000 -114.150000 997.000000 999.000000 31072.230000 126.660000 126.660000 7.000000 155.000000 360.000000 11700.000000 2.000000 7.000000 4.000000 39.000000 350734.050000

In [6]:
# take a look at the histogram of ages
PNW_FIADB.COND_AGE.hist(bins=50)


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f0c2860>

In [7]:
# some of our columns need to be formatted as factors/categorical variables
# FORTYPCD (Forest Type), SITECLCD (Site Class), SOIL_ROOTING_DEPTH_PNW, 
# STND_COND_CD_PNWRS (Stand condition code, e.g., grass-forb, open sawtimber, closed sapling-pole-sawtimber, old-growth)
# STND_STRUC_CD_PNWRS (Stand Structure, e.g., even-aged single story, two-story, uneven-aged, mosaic), 
# PHYSCLCD (Soil-Climate Type)
cat_cols = ["FORTYPCD", "FORTYPE", "SOIL_ROOTING_DEPTH_PNW", "SITECLCD", "STND_COND_CD_PNWRS", "STND_STRUC_CD_PNWRS", "PHYSCLCD"]
for col in cat_cols:
    if col in ["PHYSCLCD", "FORTYPCD", "FORTYPE"]:
        PNW_FIADB[col] = PNW_FIADB[col].astype('category')
    else:
        PNW_FIADB[col] = PNW_FIADB[col].astype('category').cat.as_ordered()

In [8]:
# How many missing values do we have
print("Out of " + str(len(PNW_FIADB)) + " plots:")

# create a list of fields (other than age) with nulls
hasNaNs = []

for col in PNW_FIADB.columns.values:
    print(col + ": " + str(PNW_FIADB[col].isnull().sum()) + " nulls")
    if col != "COND_AGE" and PNW_FIADB[col].isnull().sum() >0:
        hasNaNs.append(col)
print(hasNaNs)


Out of 18844 plots:
PLT_CN: 0 nulls
LAT: 0 nulls
LON: 0 nulls
COND_AGE: 838 nulls
FORTYPCD: 0 nulls
FORTYPE: 0 nulls
BALIVE: 0 nulls
ALSTK: 0 nulls
GSSTK: 0 nulls
SITECLCD: 0 nulls
SLOPE: 0 nulls
ASPECT: 0 nulls
ELEV: 0 nulls
SOIL_ROOTING_DEPTH_PNW: 1950 nulls
STND_COND_CD_PNWRS: 3129 nulls
STND_STRUC_CD_PNWRS: 1950 nulls
PHYSCLCD: 0 nulls
SumOfVOLCFGRS: 286 nulls
['SOIL_ROOTING_DEPTH_PNW', 'STND_COND_CD_PNWRS', 'STND_STRUC_CD_PNWRS', 'SumOfVOLCFGRS']

In [9]:
# See how many plots with missing ages also have missing nulls in other columns
for col in hasNaNs:
    print(col, str(PNW_FIADB.COND_AGE.loc[PNW_FIADB[col].isnull()].isnull().sum()))


('SOIL_ROOTING_DEPTH_PNW', '3')
('STND_COND_CD_PNWRS', '117')
('STND_STRUC_CD_PNWRS', '3')
('SumOfVOLCFGRS', '9')

In [10]:
# columns of important predictor variables
dropNaNcols = ["SumOfVOLCFGRS", "STND_STRUC_CD_PNWRS"]

In [11]:
# We could try to fill in missing values for some predictor variables, or just ignore them
# Let's see whether random forest thinks these values are helpful when we train the model 
# on a dataset that drops the plots where values are not missing
# and also drops the columns that aren't predictors (PLT_CN, FORTYPE)
noNaNs = PNW_FIADB.dropna(axis=0, how='any', thresh=None, subset=dropNaNcols, inplace=False)

In [12]:
# Number of plots with complete predictor variables
print(str(len(noNaNs))+ " plots with complete predictor variables")


16655plots with complete predictor variables

In [13]:
# Create a random forest training set of data from all plots with ages
train = noNaNs.dropna(axis=0, how='any', thresh=None, subset=["COND_AGE"], inplace=False)

In [14]:
# set parameters for random forest regression
randomforest = RandomForestRegressor(n_estimators = 100, oob_score=True, random_state = 54321)

In [15]:
# train randomforest on plots within the age range we care about
# return the ranked feature importances for that random forest model
# use that model to predict the ages for all plots where ages are known 
# (including those outside the age range used to train the model)

# set which data we're using to train the model 
# "train" includes all conditions with measured ages and all predictor variables
age_thresh = 150
training_set = train.loc[train.COND_AGE < age_thresh]

# Drop columns of variables that don't seem important as predictors
droplabels = ["SOIL_ROOTING_DEPTH_PNW", "PHYSCLCD", "SLOPE", "ALSTK", "GSSTK", "ASPECT", "ELEV", "LAT", "LON", "STND_COND_CD_PNWRS"]
droplabels.append("COND_AGE")
droplabels.append("PLT_CN")
droplabels.append("FORTYPE")
X, y = training_set.drop(labels=droplabels, axis=1), training_set["COND_AGE"]

# train a random forest on the data subset with known ages
randomforest.fit(X,y)

# Gather the feature importances
importances = randomforest.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for feature in range(len(training_set.drop(labels=droplabels, axis=1).columns.values)):
    print("%d. %s (%f)" % (feature + 1, training_set.drop(labels=droplabels, axis=1).columns.values[indices[feature]], importances[indices[feature]]))

# Measures of fit
print("-----")
print("R-squared on training set (vs OOB sample): " + str(randomforest.oob_score_))

# Turn random forest age predictions into a series
RF_ages = pd.Series(randomforest.predict(train.drop(labels=droplabels, axis=1)))
RF_ages.name = "RF_AGE"

# Make a dataframe with measured and RF-predicted ages
RF_preds = pd.concat([train.COND_AGE.reset_index(), RF_ages], axis = 1)

# Calculate RMSE for various age ranges and print it
# this is, on average, how close we are to the actual age (in years)
def RMSE(measured, predicted):
    return sqrt(mean_squared_error(measured,predicted))

print("-----")
print("RMSE for training set: " + str(RMSE(RF_preds.COND_AGE.loc[RF_preds.COND_AGE < age_thresh], RF_preds.RF_AGE.loc[RF_preds.COND_AGE < age_thresh])))
print("Overall RMSE " + str(RMSE(RF_preds.COND_AGE, RF_preds.RF_AGE)))

AgeRanges = [[0,10],[10,25],[25,50],[50,75],[75,100],[100,150],[150,200],[200,300]]
for agerange in AgeRanges:
    print("Age Range -> " +str(agerange)),
    print("RMSE " + str(RMSE(
            RF_preds.COND_AGE.loc[(RF_preds.COND_AGE >= agerange[0]) & (RF_preds.COND_AGE < agerange[1])],
            RF_preds.RF_AGE.loc[(RF_preds.COND_AGE >= agerange[0]) & (RF_preds.COND_AGE < agerange[1])])))


Feature ranking:
1. SumOfVOLCFGRS (0.273224)
2. BALIVE (0.248918)
3. STND_STRUC_CD_PNWRS (0.203196)
4. FORTYPCD (0.145253)
5. SITECLCD (0.129408)
-----
R-squared on training set (vs OOB sample): 0.52518601723
-----
RMSE for training set: 9.05253093511
Overall RMSE 68.3175891307
Age Range -> [0, 10] RMSE 7.86838994686
Age Range -> [10, 25] RMSE 9.40642116162
Age Range -> [25, 50] RMSE 8.74974514473
Age Range -> [50, 75] RMSE 7.63447382697
Age Range -> [75, 100] RMSE 6.76045689028
Age Range -> [100, 150] RMSE 12.4997360134
Age Range -> [150, 200] RMSE 71.965085254
Age Range -> [200, 300] RMSE 135.457754442

In [16]:
# Data to plot
x = RF_preds.COND_AGE.loc[RF_preds.COND_AGE < age_thresh]
y = RF_preds.RF_AGE.loc[RF_preds.COND_AGE < age_thresh]

plt.figure()
plt.grid(True)
plt.scatter(x, y, alpha = 0.03, facecolors='none', edgecolors='r')
plt.title("How well does Random Forest predict stand ages?")
plt.xlabel("Measured Stand Age [yrs]")
plt.ylabel("Predicted Stand Age [yrs]")
plt.xlim(0,age_thresh)
plt.ylim(0,age_thresh)

# calculate and plot bestfit line with np.polyfit
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b, 'r--', alpha = 0.65)

# add a 1:1 line
plt.plot([0, 350], [0, 350], 'k--')

plt.show()



In [17]:
# Predict ages for conditions without ages

# dataframe of conditions without ages, but have all predictor variables
noAges = noNaNs.loc[noNaNs.COND_AGE != noNaNs.COND_AGE]

# RF predictions of ages for conditions without measured ages
RFpred_noAges = pd.Series(randomforest.predict(noAges.drop(labels=droplabels, axis=1))).astype('int')
RFpred_noAges.name = "RF_Age"

# bring predicted ages into the dataframe
RFages_imputed = pd.concat([noAges.reset_index(), RFpred_noAges], axis = 1).set_index("COND_CN")

In [18]:
# Overwrite COND_AGE with RF_Age on PNW FIA DB 2011 dataframe
print(str(len(PNW_FIADB)) + " conditions total")
print(str(PNW_FIADB.COND_AGE.isnull().sum()) + " ages missing before impute")
PNW_FIADB.COND_AGE.loc[PNW_FIADB.COND_AGE != PNW_FIADB.COND_AGE] = RFages_imputed["RF_Age"]
print(str(PNW_FIADB.COND_AGE.isnull().sum()) + " ages missing after impute")
print(str(len(PNW_FIADB)-PNW_FIADB.COND_AGE.isnull().sum()) + " conditions with ages available")


18844 conditions total
838 ages missing before impute
12 ages missing after impute
18832 conditions with ages available
C:\Users\ddiaz\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\indexing.py:115: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [19]:
PNW_FIADB.head()


Out[19]:
PLT_CN LAT LON COND_AGE FORTYPCD FORTYPE BALIVE ALSTK GSSTK SITECLCD SLOPE ASPECT ELEV SOIL_ROOTING_DEPTH_PNW STND_COND_CD_PNWRS STND_STRUC_CD_PNWRS PHYSCLCD SumOfVOLCFGRS
COND_CN
1150348010901 5931296010901 48.37 -122.11 60 301 Western hemlock 242.81 74.53 71.03 3 35 101 2300 2 4 1 23 5168.83
1150352010901 5931423010901 48.40 -121.19 310 301 Western hemlock 324.08 83.82 83.82 4 95 265 3700 1 7 3 23 23132.74
1150542010901 5936567010901 45.56 -116.65 85 201 Douglas-fir 108.47 38.55 38.55 5 49 288 5900 1 5 3 12 2589.52
1155367010901 5930505010901 38.49 -122.36 80 924 Blue oak 86.79 69.07 69.07 7 40 135 500 2 5 1 22 1535.75
1157874010901 5952962010901 40.71 -121.42 60 371 California mixed conifer 179.40 53.57 53.57 4 40 325 4100 1 4 3 19 565.55

In [21]:
# write condition unique ID, LAT, LON, slope, aspect, elevation, and stand age to a csv
# based on idb_summary format used in Forest Planner

PNW_FIADB2011_cond_summary = PNW_FIADB[["LAT", "LON", "ELEV", "ASPECT", "SLOPE", "COND_AGE"]].dropna(axis=0, how='any', thresh=None, subset=["COND_AGE"], inplace=False)
PNW_FIADB2011_cond_summary[["COND_AGE"]] = PNW_FIADB2011_cond_summary[["COND_AGE"]].astype(int)
PNW_FIADB2011_cond_summary.to_csv("G:/projects/ForestPlanner_2015/Data/Processed/PNWFIADB_COND_SUMMARY_ages-imputed_2015-11-02.csv", header = True, index = True, float_format='%.3f', encoding='utf-8')