In [1]:
# A script to impute missing values for stand age in forest inventory plots
In [2]:
# load our libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import mean_squared_error
from math import sqrt
In [3]:
# Load the PNW FIA Database: plot-("condition")-level summary records.
# NOTE(review): absolute local path — consider a configurable DATA_DIR.
IDB_SUMMARY_CSV = "G:/projects/ForestPlanner_2015/Data/Work/IDB_SUMMARY_2015-10-23.csv"
PNW_FIADB = pd.read_csv(IDB_SUMMARY_CSV)
PNW_FIADB.head()
Out[3]:
In [4]:
# Rename FIA columns to the shorter names used throughout this notebook.
# Reassignment instead of inplace=True: identical effect here (no other
# references to the frame exist yet), but avoids the pandas inplace
# anti-pattern of hidden in-place mutation.
PNW_FIADB = PNW_FIADB.rename(columns={'AGE_Calc': 'COND_AGE', 'LATITUDE_FUZZ': 'LAT', 'LONGITUDE_FUZZ': 'LON',
                                      'Calc_SLOPE': 'SLOPE', 'Calc_ASPECT': 'ASPECT', 'ELEV_FT': 'ELEV'})
In [5]:
# set the COND_ID field (condition unique ID) as the dataframe index
PNW_FIADB.set_index("COND_ID", inplace = True)
In [6]:
# summary statistics for the numeric columns (the count row also hints at missingness)
PNW_FIADB.describe()
Out[6]:
In [7]:
# take a look at the histogram of measured stand ages (COND_AGE)
PNW_FIADB.COND_AGE.hist(bins=50)
Out[7]:
In [8]:
# Cast class/type code columns to pandas categoricals.
# NOTE(review): the FIA field names mentioned in the original notes
# (FORTYPCD, SITECLCD, STND_COND_CD_PNWRS, STND_STRUC_CD_PNWRS, PHYSCLCD)
# predate the renames above; the columns actually converted are in cat_cols.
cat_cols = ["STAND_SIZE_CLASS", "SITE_CLASS_FIA", "FOR_TYPE", "FOR_TYPE_SECDRY"]
unordered_cols = {"FOR_TYPE", "FOR_TYPE_SECDRY"}
for col in cat_cols:
    as_category = PNW_FIADB[col].astype('category')
    # forest-type codes are nominal; the size/site classes get an ordering
    PNW_FIADB[col] = as_category if col in unordered_cols else as_category.cat.as_ordered()
In [9]:
# Tally missing values per column, and collect the names of the
# non-age columns that contain any nulls.
print("Out of " + str(len(PNW_FIADB)) + " plots:")
hasNaNs = []
for col in PNW_FIADB.columns.values:
    null_count = PNW_FIADB[col].isnull().sum()
    print(col + ": " + str(null_count) + " nulls")
    if col != "COND_AGE" and null_count > 0:
        hasNaNs.append(col)
print(hasNaNs)
In [10]:
# For each column with nulls, count how many of those plots ALSO lack an age
for col in hasNaNs:
    n_both_missing = PNW_FIADB.COND_AGE.loc[PNW_FIADB[col].isnull()].isnull().sum()
    print(col, str(n_both_missing))
In [11]:
# key predictor columns (from the column names: stand size class, quadratic
# mean diameter, basal area, above-ground biomass, gross volume) — plots
# missing any of these are excluded below rather than imputed
dropNaNcols = ["STAND_SIZE_CLASS", "QMD_TOT_CM", "SumOfBA_FT2_AC", "SumOfBIOM_TR_ABV_GRND_TON", "SumOfVOL_AC_GRS_FT3"]
In [12]:
# We could try to fill in missing values for some predictor variables, but
# instead we drop the plots where any key predictor IS missing and let
# random forest tell us whether those variables matter.
# (Non-predictor columns are dropped later, just before fitting.)
noNaNs = PNW_FIADB.dropna(subset=dropNaNcols)
In [13]:
# Number of plots with complete predictor variables
n_complete = len(noNaNs)
print(str(n_complete) + " plots with complete predictor variables")
In [14]:
# Training candidates: every plot with complete predictors AND a measured age
train = noNaNs.dropna(subset=["COND_AGE"])
In [15]:
# set parameters for random forest regression:
# 100 trees, out-of-bag scoring (used as a pseudo R-squared below),
# and a fixed random_state so the run is reproducible
randomforest = RandomForestRegressor(n_estimators = 100, oob_score=True, random_state = 54321)
In [16]:
# Train random forest on the plots inside the age range we care about,
# then (later) use that model to predict ages for every plot with a known
# age — including those above the training cutoff.
# "train" holds all conditions with measured ages and all predictor variables.
age_thresh = 150
training_set = train.loc[train.COND_AGE < age_thresh]
# Columns that are the response, identifiers, or weak predictors
droplabels = ["SLOPE", "ASPECT", "ELEV", "FOR_TYPE_SECDRY", "MAI", "SITE_CLASS_FIA",
              "COND_AGE", "PLOT_ID", "FIA_FOREST_TYPE_NAME", "GLC_GROUP"]
X = training_set.drop(labels=droplabels, axis=1)
y = training_set["COND_AGE"]
# fit the forest on the subset with known ages
randomforest.fit(X, y)
# Rank predictors by random-forest feature importance.
importances = randomforest.feature_importances_
indices = np.argsort(importances)[::-1]
# Hoisted out of the loop: the original rebuilt training_set.drop(...)
# on every iteration (twice per feature); X is that exact frame, so its
# column labels are identical.
feature_names = X.columns.values
# Print the feature ranking
print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, feature_names[idx], importances[idx]))
# Measures of fit
print("-----")
print("R-squared on training set (vs OOB sample): " + str(randomforest.oob_score_))
# Random-forest age predictions for every plot with a measured age
# (train includes ages >= age_thresh, which the model was not fit on)
RF_ages = pd.Series(randomforest.predict(train.drop(labels=droplabels, axis=1)), name="RF_AGE")
# side-by-side measured vs. RF-predicted ages
RF_preds = pd.concat([train.COND_AGE.reset_index(), RF_ages], axis=1)
# Calculate RMSE for various age ranges and print it
# this is, on average, how close we are to the actual age (in years)
def RMSE(measured, predicted):
    """Return the root-mean-square error between two equal-length sequences.

    Computed directly with numpy (already imported) instead of calling
    sklearn.metrics.mean_squared_error — identical value, no extra dependency
    inside the function.
    """
    errors = np.asarray(measured, dtype=float) - np.asarray(predicted, dtype=float)
    return sqrt(np.mean(errors ** 2))
# RMSE overall and by age range: average distance from the true age (years)
print("-----")
in_range = RF_preds.COND_AGE < age_thresh
print("RMSE for training set: " + str(RMSE(RF_preds.COND_AGE.loc[in_range], RF_preds.RF_AGE.loc[in_range])))
print("Overall RMSE " + str(RMSE(RF_preds.COND_AGE, RF_preds.RF_AGE)))
AgeRanges = [[0,10],[10,25],[25,50],[50,75],[75,100],[100,150],[150,200],[200,300]]
for lo, hi in AgeRanges:
    # compute the selection mask once per range instead of four times
    mask = (RF_preds.COND_AGE >= lo) & (RF_preds.COND_AGE < hi)
    print("Age Range -> " + str([lo, hi]))  # dropped stray Python-2-style trailing comma
    print("RMSE " + str(RMSE(RF_preds.COND_AGE.loc[mask], RF_preds.RF_AGE.loc[mask])))
In [17]:
# Measured vs. RF-predicted ages for plots inside the training age range
x = RF_preds.COND_AGE.loc[RF_preds.COND_AGE < age_thresh]
y = RF_preds.RF_AGE.loc[RF_preds.COND_AGE < age_thresh]
# least-squares best-fit line through the scatter
m, b = np.polyfit(x, y, 1)
plt.figure()
plt.grid(True)
plt.scatter(x, y, alpha=0.03, facecolors='none', edgecolors='r')
plt.plot(x, m * x + b, 'r--', alpha=0.65)  # best-fit line
plt.plot([0, 350], [0, 350], 'k--')        # 1:1 reference line
plt.title("How well does Random Forest predict stand ages?")
plt.xlabel("Measured Stand Age [yrs]")
plt.ylabel("Predicted Stand Age [yrs]")
plt.xlim(0, age_thresh)
plt.ylim(0, age_thresh)
plt.show()
In [18]:
# Predict ages for conditions without measured ages
# (rows with a null COND_AGE but all predictor variables present)
noAges = noNaNs.loc[noNaNs.COND_AGE.isnull()]  # was: COND_AGE != COND_AGE (obscure NaN self-inequality trick)
# RF predictions for those conditions, truncated to whole years
RFpred_noAges = pd.Series(randomforest.predict(noAges.drop(labels=droplabels, axis=1))).astype('int')
RFpred_noAges.name = "RF_Age"
# attach the predictions and restore the COND_ID index
RFages_imputed = pd.concat([noAges.reset_index(), RFpred_noAges], axis = 1).set_index("COND_ID")
In [19]:
# Fill missing COND_AGE values with the random-forest predictions
print(str(len(PNW_FIADB)) + " conditions total")
print(str(PNW_FIADB.COND_AGE.isnull().sum()) + " ages missing before impute")
# fillna aligns RF_Age on the shared COND_ID index and touches only null rows.
# The original chained assignment (PNW_FIADB.COND_AGE.loc[mask] = ...) raises
# SettingWithCopyWarning and is not guaranteed to write back to the frame
# under modern pandas (Copy-on-Write).
PNW_FIADB["COND_AGE"] = PNW_FIADB["COND_AGE"].fillna(RFages_imputed["RF_Age"])
print(str(PNW_FIADB.COND_AGE.isnull().sum()) + " ages missing after impute")
print(str(len(PNW_FIADB)-PNW_FIADB.COND_AGE.isnull().sum()) + " conditions with ages available")
In [20]:
# Export condition unique ID, LAT, LON, slope, aspect, elevation, and
# (measured or imputed) stand age to CSV, in the idb_summary format used
# by Forest Planner.
summary_cols = ["LAT", "LON", "ELEV", "ASPECT", "SLOPE", "COND_AGE"]
PNW_FIADB2011_cond_summary = PNW_FIADB[summary_cols].dropna(subset=["COND_AGE"])
# ages were floats while NaNs were present; cast to int now that rows with nulls are gone
PNW_FIADB2011_cond_summary[["COND_AGE"]] = PNW_FIADB2011_cond_summary[["COND_AGE"]].astype(int)
PNW_FIADB2011_cond_summary.to_csv("G:/projects/ForestPlanner_2015/Data/Processed/IDB2pt0_COND_SUMMARY_ages-imputed_2015-11-02.csv", header=True, index=True)