notebook.community

Edit and run



In [1]:

    
# A script to impute missing values for for stand age in forest inventory plots
# Reads in plot-level summary data of PNWFIADB 2011 Database
# These plot-level summaries were exported as a .csv from the Access Database



In [2]:

    
# load our libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import mean_squared_error
from math import sqrt



In [3]:

    
# read in the PNW FIA Database, with plot("condition")-level summary data
PNW_FIADB = pd.read_csv("G:/projects/ForestPlanner_2015/Data/Work/PNWFIADB_COND_SUMMARY_2015-10-27.csv")
PNW_FIADB.head()









    Out[3]:






  
    
      
      COND_CN
      PLT_CN
      LAT
      LON
      COND_AGE
      FORTYPCD
      FORTYPE
      BALIVE
      ALSTK
      GSSTK
      SITECLCD
      SLOPE
      ASPECT
      ELEV
      SOIL_ROOTING_DEPTH_PNW
      STND_COND_CD_PNWRS
      STND_STRUC_CD_PNWRS
      PHYSCLCD
      SumOfVOLCFGRS
    
  
  
    
      0
      1150348010901
      5931296010901
      48.37
      -122.11
      60
      301
      Western hemlock
      242.81
      74.53
      71.03
      3
      35
      101
      2300
      2
      4
      1
      23
      5168.83
    
    
      1
      1150352010901
      5931423010901
      48.40
      -121.19
      310
      301
      Western hemlock
      324.08
      83.82
      83.82
      4
      95
      265
      3700
      1
      7
      3
      23
      23132.74
    
    
      2
      1150542010901
      5936567010901
      45.56
      -116.65
      85
      201
      Douglas-fir
      108.47
      38.55
      38.55
      5
      49
      288
      5900
      1
      5
      3
      12
      2589.52
    
    
      3
      1155367010901
      5930505010901
      38.49
      -122.36
      80
      924
      Blue oak
      86.79
      69.07
      69.07
      7
      40
      135
      500
      2
      5
      1
      22
      1535.75
    
    
      4
      1157874010901
      5952962010901
      40.71
      -121.42
      60
      371
      California mixed conifer
      179.40
      53.57
      53.57
      4
      40
      325
      4100
      1
      4
      3
      19
      565.55



In [4]:

    
# set thte COND_CN field as the dataframe index
PNW_FIADB.set_index("COND_CN", inplace = True)



In [5]:

    
PNW_FIADB.describe()









    Out[5]:






  
    
      
      PLT_CN
      LAT
      LON
      COND_AGE
      FORTYPCD
      BALIVE
      ALSTK
      GSSTK
      SITECLCD
      SLOPE
      ASPECT
      ELEV
      SOIL_ROOTING_DEPTH_PNW
      STND_COND_CD_PNWRS
      STND_STRUC_CD_PNWRS
      PHYSCLCD
      SumOfVOLCFGRS
    
  
  
    
      count
      1.884400e+04
      18844.000000
      18844.000000
      18006.000000
      18844.000000
      18844.000000
      18844.000000
      18844.000000
      18844.000000
      18844.000000
      18844.000000
      18844.000000
      16894.000000
      15715.000000
      16894.000000
      18844.000000
      18558.000000
    
    
      mean
      2.642437e+13
      44.512426
      -123.143173
      101.070921
      414.833793
      140.879385
      55.437669
      53.445928
      4.763744
      31.479516
      158.255678
      3227.579070
      1.704451
      4.075469
      2.114893
      20.392167
      6720.808023
    
    
      std
      1.076066e+13
      5.762961
      6.191382
      86.511111
      286.535314
      269.981017
      28.179725
      28.786750
      1.678542
      24.893576
      114.819885
      2225.768016
      0.456303
      1.427482
      0.977156
      4.600012
      12049.774469
    
    
      min
      5.930347e+12
      32.670000
      -154.000000
      0.000000
      122.000000
      0.000000
      0.000000
      0.000000
      1.000000
      0.000000
      0.000000
      0.000000
      1.000000
      0.000000
      1.000000
      11.000000
      0.810000
    
    
      25%
      2.293178e+13
      40.710000
      -123.380000
      45.000000
      221.000000
      51.505000
      33.660000
      31.557500
      3.000000
      10.000000
      50.000000
      1300.000000
      1.000000
      3.000000
      1.000000
      19.000000
      654.190000
    
    
      50%
      2.413435e+13
      43.910000
      -121.860000
      80.000000
      281.000000
      113.380000
      53.670000
      52.040000
      5.000000
      26.000000
      156.000000
      3100.000000
      2.000000
      4.000000
      2.000000
      22.000000
      2368.650000
    
    
      75%
      2.988225e+13
      47.100000
      -120.340000
      125.000000
      371.000000
      198.272500
      75.110000
      73.610000
      6.000000
      50.000000
      260.000000
      4900.000000
      2.000000
      5.000000
      3.000000
      23.000000
      7056.775000
    
    
      max
      6.349610e+13
      61.450000
      -114.150000
      997.000000
      999.000000
      31072.230000
      126.660000
      126.660000
      7.000000
      155.000000
      360.000000
      11700.000000
      2.000000
      7.000000
      4.000000
      39.000000
      350734.050000



In [6]:

    
# take a look at the histogram of ages
PNW_FIADB.COND_AGE.hist(bins=50)









    Out[6]:





<matplotlib.axes._subplots.AxesSubplot at 0x1f0c2860>



In [7]:

    
# some of our columns need to be formatted as factors/categorical variables
# FORTYPCD (Forest Type), SITECLCD (Site Class), SOIL_ROOTING_DEPTH_PNW, 
# STND_COND_CD_PNWRS (Stand condition code, e.g., grass-forb, open sawtimber, closed sapling-pole-sawtimber, old-growth)
# STND_STRUC_CD_PNWRS (Stand Structure, e.g., even-aged single story, two-story, uneven-aged, mosaic), 
# PHYSCLCD (Soil-Climate Type)
cat_cols = ["FORTYPCD", "FORTYPE", "SOIL_ROOTING_DEPTH_PNW", "SITECLCD", "STND_COND_CD_PNWRS", "STND_STRUC_CD_PNWRS", "PHYSCLCD"]
for col in cat_cols:
    if col in ["PHYSCLCD", "FORTYPCD", "FORTYPE"]:
        PNW_FIADB[col] = PNW_FIADB[col].astype('category')
    else:
        PNW_FIADB[col] = PNW_FIADB[col].astype('category').cat.as_ordered()



In [8]:

    
# How many missing values do we have
print("Out of " + str(len(PNW_FIADB)) + " plots:")

# create a list of fields (other than age) with nulls
hasNaNs = []

for col in PNW_FIADB.columns.values:
    print(col + ": " + str(PNW_FIADB[col].isnull().sum()) + " nulls")
    if col != "COND_AGE" and PNW_FIADB[col].isnull().sum() >0:
        hasNaNs.append(col)
print(hasNaNs)









    



Out of 18844 plots:
PLT_CN: 0 nulls
LAT: 0 nulls
LON: 0 nulls
COND_AGE: 838 nulls
FORTYPCD: 0 nulls
FORTYPE: 0 nulls
BALIVE: 0 nulls
ALSTK: 0 nulls
GSSTK: 0 nulls
SITECLCD: 0 nulls
SLOPE: 0 nulls
ASPECT: 0 nulls
ELEV: 0 nulls
SOIL_ROOTING_DEPTH_PNW: 1950 nulls
STND_COND_CD_PNWRS: 3129 nulls
STND_STRUC_CD_PNWRS: 1950 nulls
PHYSCLCD: 0 nulls
SumOfVOLCFGRS: 286 nulls
['SOIL_ROOTING_DEPTH_PNW', 'STND_COND_CD_PNWRS', 'STND_STRUC_CD_PNWRS', 'SumOfVOLCFGRS']



In [9]:

    
# See how many plots with missing ages also have missing nulls in other columns
for col in hasNaNs:
    print(col, str(PNW_FIADB.COND_AGE.loc[PNW_FIADB[col].isnull()].isnull().sum()))









    



('SOIL_ROOTING_DEPTH_PNW', '3')
('STND_COND_CD_PNWRS', '117')
('STND_STRUC_CD_PNWRS', '3')
('SumOfVOLCFGRS', '9')



In [10]:

    
# columns of important predictor variables
dropNaNcols = ["SumOfVOLCFGRS", "STND_STRUC_CD_PNWRS"]



In [11]:

    
# We could try to fill in missing values for some predictor variables, or just ignore them
# Let's see whether random forest thinks these values are helpful when we train the model 
# on a dataset that drops the plots where values are not missing
# and also drops the columns that aren't predictors (PLT_CN, FORTYPE)
noNaNs = PNW_FIADB.dropna(axis=0, how='any', thresh=None, subset=dropNaNcols, inplace=False)



In [12]:

    
# Number of plots with complete predictor variables
print(str(len(noNaNs))+ " plots with complete predictor variables")









    



16655plots with complete predictor variables



In [13]:

    
# Create a random forest training set of data from all plots with ages
train = noNaNs.dropna(axis=0, how='any', thresh=None, subset=["COND_AGE"], inplace=False)



In [14]:

    
# set parameters for random forest regression
randomforest = RandomForestRegressor(n_estimators = 100, oob_score=True, random_state = 54321)



In [15]:

    
# train randomforest on plots within the age range we care about
# return the ranked feature importances for that random forest model
# use that model to predict the ages for all plots where ages are known 
# (including those outside the age range used to train the model)

# set which data we're using to train the model 
# "train" includes all conditions with measured ages and all predictor variables
age_thresh = 150
training_set = train.loc[train.COND_AGE < age_thresh]

# Drop columns of variables that don't seem important as predictors
droplabels = ["SOIL_ROOTING_DEPTH_PNW", "PHYSCLCD", "SLOPE", "ALSTK", "GSSTK", "ASPECT", "ELEV", "LAT", "LON", "STND_COND_CD_PNWRS"]
droplabels.append("COND_AGE")
droplabels.append("PLT_CN")
droplabels.append("FORTYPE")
X, y = training_set.drop(labels=droplabels, axis=1), training_set["COND_AGE"]

# train a random forest on the data subset with known ages
randomforest.fit(X,y)

# Gather the feature importances
importances = randomforest.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for feature in range(len(training_set.drop(labels=droplabels, axis=1).columns.values)):
    print("%d. %s (%f)" % (feature + 1, training_set.drop(labels=droplabels, axis=1).columns.values[indices[feature]], importances[indices[feature]]))

# Measures of fit
print("-----")
print("R-squared on training set (vs OOB sample): " + str(randomforest.oob_score_))

# Turn random forest age predictions into a series
RF_ages = pd.Series(randomforest.predict(train.drop(labels=droplabels, axis=1)))
RF_ages.name = "RF_AGE"

# Make a dataframe with measured and RF-predicted ages
RF_preds = pd.concat([train.COND_AGE.reset_index(), RF_ages], axis = 1)

# Calculate RMSE for various age ranges and print it
# this is, on average, how close we are to the actual age (in years)
def RMSE(measured, predicted):
    return sqrt(mean_squared_error(measured,predicted))

print("-----")
print("RMSE for training set: " + str(RMSE(RF_preds.COND_AGE.loc[RF_preds.COND_AGE < age_thresh], RF_preds.RF_AGE.loc[RF_preds.COND_AGE < age_thresh])))
print("Overall RMSE " + str(RMSE(RF_preds.COND_AGE, RF_preds.RF_AGE)))

AgeRanges = [[0,10],[10,25],[25,50],[50,75],[75,100],[100,150],[150,200],[200,300]]
for agerange in AgeRanges:
    print("Age Range -> " +str(agerange)),
    print("RMSE " + str(RMSE(
            RF_preds.COND_AGE.loc[(RF_preds.COND_AGE >= agerange[0]) & (RF_preds.COND_AGE < agerange[1])],
            RF_preds.RF_AGE.loc[(RF_preds.COND_AGE >= agerange[0]) & (RF_preds.COND_AGE < agerange[1])])))









    



Feature ranking:
1. SumOfVOLCFGRS (0.273224)
2. BALIVE (0.248918)
3. STND_STRUC_CD_PNWRS (0.203196)
4. FORTYPCD (0.145253)
5. SITECLCD (0.129408)
-----
R-squared on training set (vs OOB sample): 0.52518601723
-----
RMSE for training set: 9.05253093511
Overall RMSE 68.3175891307
Age Range -> [0, 10] RMSE 7.86838994686
Age Range -> [10, 25] RMSE 9.40642116162
Age Range -> [25, 50] RMSE 8.74974514473
Age Range -> [50, 75] RMSE 7.63447382697
Age Range -> [75, 100] RMSE 6.76045689028
Age Range -> [100, 150] RMSE 12.4997360134
Age Range -> [150, 200] RMSE 71.965085254
Age Range -> [200, 300] RMSE 135.457754442



In [16]:

    
# Data to plot
x = RF_preds.COND_AGE.loc[RF_preds.COND_AGE < age_thresh]
y = RF_preds.RF_AGE.loc[RF_preds.COND_AGE < age_thresh]

plt.figure()
plt.grid(True)
plt.scatter(x, y, alpha = 0.03, facecolors='none', edgecolors='r')
plt.title("How well does Random Forest predict stand ages?")
plt.xlabel("Measured Stand Age [yrs]")
plt.ylabel("Predicted Stand Age [yrs]")
plt.xlim(0,age_thresh)
plt.ylim(0,age_thresh)

# calculate and plot bestfit line with np.polyfit
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b, 'r--', alpha = 0.65)

# add a 1:1 line
plt.plot([0, 350], [0, 350], 'k--')

plt.show()



In [17]:

    
# Predict ages for conditions without ages

# dataframe of conditions without ages, but have all predictor variables
noAges = noNaNs.loc[noNaNs.COND_AGE != noNaNs.COND_AGE]

# RF predictions of ages for conditions without measured ages
RFpred_noAges = pd.Series(randomforest.predict(noAges.drop(labels=droplabels, axis=1))).astype('int')
RFpred_noAges.name = "RF_Age"

# bring predicted ages into the dataframe
RFages_imputed = pd.concat([noAges.reset_index(), RFpred_noAges], axis = 1).set_index("COND_CN")



In [18]:

    
# Overwrite COND_AGE with RF_Age on PNW FIA DB 2011 dataframe
print(str(len(PNW_FIADB)) + " conditions total")
print(str(PNW_FIADB.COND_AGE.isnull().sum()) + " ages missing before impute")
PNW_FIADB.COND_AGE.loc[PNW_FIADB.COND_AGE != PNW_FIADB.COND_AGE] = RFages_imputed["RF_Age"]
print(str(PNW_FIADB.COND_AGE.isnull().sum()) + " ages missing after impute")
print(str(len(PNW_FIADB)-PNW_FIADB.COND_AGE.isnull().sum()) + " conditions with ages available")









    



18844 conditions total
838 ages missing before impute
12 ages missing after impute
18832 conditions with ages available






    



C:\Users\ddiaz\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\indexing.py:115: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [19]:

    
PNW_FIADB.head()









    Out[19]:






  
    
      
      PLT_CN
      LAT
      LON
      COND_AGE
      FORTYPCD
      FORTYPE
      BALIVE
      ALSTK
      GSSTK
      SITECLCD
      SLOPE
      ASPECT
      ELEV
      SOIL_ROOTING_DEPTH_PNW
      STND_COND_CD_PNWRS
      STND_STRUC_CD_PNWRS
      PHYSCLCD
      SumOfVOLCFGRS
    
    
      COND_CN
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1150348010901
      5931296010901
      48.37
      -122.11
      60
      301
      Western hemlock
      242.81
      74.53
      71.03
      3
      35
      101
      2300
      2
      4
      1
      23
      5168.83
    
    
      1150352010901
      5931423010901
      48.40
      -121.19
      310
      301
      Western hemlock
      324.08
      83.82
      83.82
      4
      95
      265
      3700
      1
      7
      3
      23
      23132.74
    
    
      1150542010901
      5936567010901
      45.56
      -116.65
      85
      201
      Douglas-fir
      108.47
      38.55
      38.55
      5
      49
      288
      5900
      1
      5
      3
      12
      2589.52
    
    
      1155367010901
      5930505010901
      38.49
      -122.36
      80
      924
      Blue oak
      86.79
      69.07
      69.07
      7
      40
      135
      500
      2
      5
      1
      22
      1535.75
    
    
      1157874010901
      5952962010901
      40.71
      -121.42
      60
      371
      California mixed conifer
      179.40
      53.57
      53.57
      4
      40
      325
      4100
      1
      4
      3
      19
      565.55



In [21]:

    
# write condition unique ID, LAT, LON, slope, aspect, elevation, and stand age to a csv
# based on idb_summary format used in Forest Planner

PNW_FIADB2011_cond_summary = PNW_FIADB[["LAT", "LON", "ELEV", "ASPECT", "SLOPE", "COND_AGE"]].dropna(axis=0, how='any', thresh=None, subset=["COND_AGE"], inplace=False)
PNW_FIADB2011_cond_summary[["COND_AGE"]] = PNW_FIADB2011_cond_summary[["COND_AGE"]].astype(int)
PNW_FIADB2011_cond_summary.to_csv("G:/projects/ForestPlanner_2015/Data/Processed/PNWFIADB_COND_SUMMARY_ages-imputed_2015-11-02.csv", header = True, index = True, float_format='%.3f', encoding='utf-8')

	COND_CN	PLT_CN	LAT	LON	COND_AGE	FORTYPCD	FORTYPE	BALIVE	ALSTK	GSSTK	SITECLCD	SLOPE	ASPECT	ELEV	SOIL_ROOTING_DEPTH_PNW	STND_COND_CD_PNWRS	STND_STRUC_CD_PNWRS	PHYSCLCD	SumOfVOLCFGRS
0	1150348010901	5931296010901	48.37	-122.11	60	301	Western hemlock	242.81	74.53	71.03	3	35	101	2300	2	4	1	23	5168.83
1	1150352010901	5931423010901	48.40	-121.19	310	301	Western hemlock	324.08	83.82	83.82	4	95	265	3700	1	7	3	23	23132.74
2	1150542010901	5936567010901	45.56	-116.65	85	201	Douglas-fir	108.47	38.55	38.55	5	49	288	5900	1	5	3	12	2589.52
3	1155367010901	5930505010901	38.49	-122.36	80	924	Blue oak	86.79	69.07	69.07	7	40	135	500	2	5	1	22	1535.75
4	1157874010901	5952962010901	40.71	-121.42	60	371	California mixed conifer	179.40	53.57	53.57	4	40	325	4100	1	4	3	19	565.55

	PLT_CN	LAT	LON	COND_AGE	FORTYPCD	BALIVE	ALSTK	GSSTK	SITECLCD	SLOPE	ASPECT	ELEV	SOIL_ROOTING_DEPTH_PNW	STND_COND_CD_PNWRS	STND_STRUC_CD_PNWRS	PHYSCLCD	SumOfVOLCFGRS
count	1.884400e+04	18844.000000	18844.000000	18006.000000	18844.000000	18844.000000	18844.000000	18844.000000	18844.000000	18844.000000	18844.000000	18844.000000	16894.000000	15715.000000	16894.000000	18844.000000	18558.000000
mean	2.642437e+13	44.512426	-123.143173	101.070921	414.833793	140.879385	55.437669	53.445928	4.763744	31.479516	158.255678	3227.579070	1.704451	4.075469	2.114893	20.392167	6720.808023
std	1.076066e+13	5.762961	6.191382	86.511111	286.535314	269.981017	28.179725	28.786750	1.678542	24.893576	114.819885	2225.768016	0.456303	1.427482	0.977156	4.600012	12049.774469
min	5.930347e+12	32.670000	-154.000000	0.000000	122.000000	0.000000	0.000000	0.000000	1.000000	0.000000	0.000000	0.000000	1.000000	0.000000	1.000000	11.000000	0.810000
25%	2.293178e+13	40.710000	-123.380000	45.000000	221.000000	51.505000	33.660000	31.557500	3.000000	10.000000	50.000000	1300.000000	1.000000	3.000000	1.000000	19.000000	654.190000
50%	2.413435e+13	43.910000	-121.860000	80.000000	281.000000	113.380000	53.670000	52.040000	5.000000	26.000000	156.000000	3100.000000	2.000000	4.000000	2.000000	22.000000	2368.650000
75%	2.988225e+13	47.100000	-120.340000	125.000000	371.000000	198.272500	75.110000	73.610000	6.000000	50.000000	260.000000	4900.000000	2.000000	5.000000	3.000000	23.000000	7056.775000
max	6.349610e+13	61.450000	-114.150000	997.000000	999.000000	31072.230000	126.660000	126.660000	7.000000	155.000000	360.000000	11700.000000	2.000000	7.000000	4.000000	39.000000	350734.050000

	PLT_CN	LAT	LON	COND_AGE	FORTYPCD	FORTYPE	BALIVE	ALSTK	GSSTK	SITECLCD	SLOPE	ASPECT	ELEV	SOIL_ROOTING_DEPTH_PNW	STND_COND_CD_PNWRS	STND_STRUC_CD_PNWRS	PHYSCLCD	SumOfVOLCFGRS
COND_CN
1150348010901	5931296010901	48.37	-122.11	60	301	Western hemlock	242.81	74.53	71.03	3	35	101	2300	2	4	1	23	5168.83
1150352010901	5931423010901	48.40	-121.19	310	301	Western hemlock	324.08	83.82	83.82	4	95	265	3700	1	7	3	23	23132.74
1150542010901	5936567010901	45.56	-116.65	85	201	Douglas-fir	108.47	38.55	38.55	5	49	288	5900	1	5	3	12	2589.52
1155367010901	5930505010901	38.49	-122.36	80	924	Blue oak	86.79	69.07	69.07	7	40	135	500	2	5	1	22	1535.75
1157874010901	5952962010901	40.71	-121.42	60	371	California mixed conifer	179.40	53.57	53.57	4	40	325	4100	1	4	3	19	565.55