In [1]:
# Import tree-level data from PNW_FIADB2011 forest inventory plots
# Summarize the data into 2" diameter classes
# Export a CSV of total trees per acre by species and 2" diameter class
# for each "condition" (~inventory plot)

In [2]:
# import the packages
import pandas as pd
import numpy as np
import csv
import time
import json

In [3]:
# read in the treelists for the PNW_FIADB2011 dataset and IDBv2.0 dataset
PNWFIADB2011 = pd.read_csv(r"G:\projects\ForestPlanner_2015\Data\Processed\PNW_FIADB2011_TREELIST_2015-11-06.csv")
IDB2pt0 = pd.read_csv(r"G:\projects\ForestPlanner_2015\Data\Processed\IDB2pt0_TREELIST_2015-11-04.csv")

In [5]:
print(str(PNWFIADB2011.COND_ID.nunique()) + " unique PNWFIADB 2011 conditions")
print(str(IDB2pt0.COND_ID.nunique()) + " unique IDB 2.0 conditions")


18845 unique PNWFIADB 2011 conditions
24347 unique IDB 2.0 conditions

In [6]:
# combine these datasets into one master treelist
treelist = PNWFIADB2011.append(IDB2pt0)
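
In newer pandas releases DataFrame.append is deprecated; the same stacking can be done with pd.concat. A minimal equivalent sketch:

# equivalent to the append above (original row labels are kept, matching append's default)
treelist = pd.concat([PNWFIADB2011, IDB2pt0])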

In [7]:
treelist.head()


Out[7]:
ACTUALHT BHAGE COMMON_NAME COND_ID CR DIA GENUS HT PLOT SPCD SPECIES_SYMBOL SPGRPCD SPP_GRP STATUSCD SUBP TOTAGE TPA_UNADJ TREE_ID
0 44 NaN western redcedar 1150352010901 15 9.5 Thuja 44 71915 242 THPL 22 Western redcedar 1 2 NaN 6.01 111
1 73 318 western hemlock 1150352010901 40 32.8 Tsuga 73 71915 263 TSHE 13 Western hemlock 1 2 NaN 0.99 112
2 66 95 western hemlock 1150352010901 10 10.5 Tsuga 66 71915 263 TSHE 13 Western hemlock 1 2 NaN 6.01 113
3 45 NaN western redcedar 1150352010901 20 10.2 Thuja 45 71915 242 THPL 22 Western redcedar 1 2 NaN 6.01 114
4 47 NaN western hemlock 1150352010901 20 9.2 Tsuga 47 71915 263 TSHE 13 Western hemlock 1 2 NaN 6.01 115

In [8]:
# read in the FVS species name crosswalk table
FVS_crosswalk = pd.read_csv(r"G:\projects\ForestPlanner_2015\Data\work\FVS_SpeciesCrosswalk_2015-11-06.csv")

In [9]:
# drop entries in the crosswalk without species_symbols
FVS_crosswalk.dropna(subset=["SPECIES_SYMBOL"], inplace = True)

# set species_symbol as index for FVS_crosswalk
FVS_crosswalk.set_index("SPECIES_SYMBOL", inplace = True)
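
With SPECIES_SYMBOL as the index, a single symbol's crosswalk row can be pulled directly. A small sketch, assuming the Douglas-fir symbol 'PSME' is present in the crosswalk:

# full crosswalk row for one symbol (example symbol assumed to be present)
print(FVS_crosswalk.loc['PSME'])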

In [10]:
FVS_crosswalk.columns.values


Out[10]:
array(['FIA_SPCD', 'FVS_ALPHA_CODE', 'COMMON_NAME',
       'SCIENTIFIC NAME_FSVeg', 'AK', 'BM', 'CA', 'CI', 'CR', 'EC', 'EM',
       'IE', 'KT', 'NC', 'NI', 'PN', 'SO', 'TT', 'UT', 'WC', 'WS'], dtype=object)

In [11]:
# Drop columns from FVS_crosswalk we don't want or need
dropcols = ['FIA_SPCD', 'FVS_ALPHA_CODE', 'SCIENTIFIC NAME_FSVeg', 'AK', 'CI', 'CR', 'EM', 'IE', 'KT', 'NI', 'NC', 'TT', 'UT', 'WS']
FVS_crosswalk.drop(labels = dropcols, axis = 1, inplace = True)
FVS_crosswalk.head()


Out[11]:
COMMON_NAME BM CA EC PN SO WC
SPECIES_SYMBOL
ABIES Fir spp. GF WF GF SF WF SF
ABSH Shasta red fir OS SH NF RF SH RF
ABAM Pacific silver fir OS SH SF SF SF SF
ABBR Bristlecone fir OS OS WB OT OS OT
ABCO White fir GF WF WF WF WF WF

In [12]:
fvs_symbols_missing = 0
for symbol in pd.unique(treelist.SPECIES_SYMBOL):
    if symbol in FVS_crosswalk.index.tolist():
        pass
    else: 
        fvs_symbols_missing += 1
        print(symbol + "not in FVS_crosswalk")
if fvs_symbols_missing == 0:
    print("All species symbols from treelist found in FVS_crosswalk")
else:
    print('\n' "Symbols missing from FVS_crosswalk: " + str(fvs_symbols_missing))


All species symbols from treelist found in FVS_crosswalk
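
The same membership check can be written as a set difference. A minimal sketch equivalent to the loop above:

# symbols present in the treelist but absent from the crosswalk index (empty set expected)
missing = set(treelist.SPECIES_SYMBOL.unique()) - set(FVS_crosswalk.index)
print(missing if missing else "All species symbols from treelist found in FVS_crosswalk")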

In [13]:
# Count the number of unique values each FVS_crosswalk column takes for the species symbols found in the treelist
# this shows how much mapping SPECIES_SYMBOL to each of these columns would consolidate the number of options

# first do it on the index of fvs_crosswalk (species symbols)
print("Species symbols" + ": " + str(len(pd.unique(FVS_crosswalk.loc[FVS_crosswalk.index.isin(pd.unique(treelist.SPECIES_SYMBOL))].index))))

# then on all the other columns
for col in FVS_crosswalk.columns.values:
    print(col + ": " + str(len(pd.unique(FVS_crosswalk[col].loc[FVS_crosswalk.index.isin(pd.unique(treelist.SPECIES_SYMBOL))]))))


Species symbols: 118
COMMON_NAME: 117
BM: 18
CA: 49
EC: 32
PN: 38
SO: 32
WC: 37

In [14]:
# Over-write the common names in the treelist using the values from FVS_crosswalk
print(str(len(pd.unique(treelist.COMMON_NAME))) + " common_names before")
for symbol in pd.unique(treelist.SPECIES_SYMBOL):
    treelist.loc[treelist.SPECIES_SYMBOL == symbol, "COMMON_NAME"] = FVS_crosswalk.COMMON_NAME.loc[symbol]
print(str(len(pd.unique(treelist.COMMON_NAME))) + " common_names after")


170 common_names before
117 common_names after

In [15]:
# create nested dictionary for all entries, top-level key is common_name
species_crosswalk = {}

# all common names set up as keys with empty nested dictionary for value
species_crosswalk = {common_name:{} for common_name in pd.unique(FVS_crosswalk['COMMON_NAME'])}

# for each common name (key) in the dictionary, fill in the values...
for key in species_crosswalk:
    # create a nested dictionary where the name is the FVS_crosswalk column name, and the value is empty
    species_crosswalk[key] = {column:'' for column in FVS_crosswalk.columns.values}

# for each common name (key) in the species_crosswalk dictionary, fill in the values...
for key in species_crosswalk:
    # for each nested dictionary (whose keys are the FVS_crosswalk column names)...
    for subkey in species_crosswalk[key]:
        # set the value for each nested dictionary based on the FVS_crosswalk table
        species_crosswalk[key][subkey] = FVS_crosswalk[subkey].loc[FVS_crosswalk.COMMON_NAME == key].values[0]
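
The same nested dictionary can be built in one step with pandas' to_dict. A sketch that keeps the first row per common name, matching the .values[0] behavior above:

# {common_name: {column: value, ...}}, keeping the first matching row per common name
species_crosswalk_alt = (FVS_crosswalk.drop_duplicates(subset='COMMON_NAME')
                                      .set_index('COMMON_NAME', drop=False)
                                      .to_dict(orient='index'))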

In [16]:
# dump the species_crosswalk to a json file
with open(r'G:\projects\ForestPlanner_2015\Data\Processed\FVS_species_crosswalk_' + time.strftime("%Y-%m-%d") + '.json', 'w') as dumpsite:
    json.dump(species_crosswalk, dumpsite)
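
A quick round-trip check of the dump, reopening the same path pattern (the date in the filename is whatever was written above):

# reload the crosswalk and confirm the top-level keys survived the round trip
with open(r'G:\projects\ForestPlanner_2015\Data\Processed\FVS_species_crosswalk_' + time.strftime("%Y-%m-%d") + '.json') as source:
    assert set(json.load(source)) == set(species_crosswalk)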

In [17]:
print(str(treelist.COND_ID.nunique()) + " unique conditions (~inventory plots)")
print(str(len(treelist)) + " trees")


43192 unique conditions (~inventory plots)
2002912 trees

In [18]:
# Set COND_ID as index
treelist.set_index("COND_ID", inplace = True)

In [19]:
print(str(len(pd.unique(treelist.COMMON_NAME))) + " COMMON_NAMEs")
print(str(len(pd.unique(treelist.SPECIES_SYMBOL))) + " SPECIES_SYMBOLs")
print(str(len(pd.unique(treelist.SPCD))) + " SPCDs")
print(str(len(pd.unique(treelist.GENUS))) + " GENUSes")
print(str(len(pd.unique(treelist.SPP_GRP))) + " SPP_GRPs")


117 COMMON_NAMEs
118 SPECIES_SYMBOLs
117 SPCDs
43 GENUSes
21 SPP_GRPs

In [20]:
# How many null values are there for each column?
treelist.isnull().sum()


Out[20]:
ACTUALHT             1788
BHAGE             1509368
COMMON_NAME             0
CR                 241036
DIA                   993
GENUS             1457174
HT                   7702
PLOT                    0
SPCD                    0
SPECIES_SYMBOL          0
SPGRPCD           1457174
SPP_GRP           1457174
STATUSCD                0
SUBP                    0
TOTAGE            1611032
TPA_UNADJ             993
TREE_ID                 0
dtype: int64

In [21]:
sorted(pd.unique(treelist.COMMON_NAME))


Out[21]:
['Alaska yellow-cedar',
 'American holly',
 'Apple spp.',
 'Ash spp.',
 'Balsam willow',
 'Bigcone Douglas-fir',
 'Bigleaf maple',
 'Bishop pine',
 'Bitter cherry',
 'Black cottonwood',
 'Black locust',
 'Black spruce',
 'Black willow',
 'Blue oak',
 'Boxelder',
 'Brewer spruce',
 'Bristlecone fir',
 'Bristlecone pine',
 'Buckeye, horsechestnut  ',
 'California black oak',
 'California buckeye',
 'California foothill pine',
 'California juniper',
 'California laurel',
 'California live oak',
 'California nutmeg',
 'California red fir',
 'California sycamore',
 'Canyon live oak',
 'Chinkapin oak',
 'Chokecherry',
 'Coulter pine',
 'Curl-leaf mountain mahogany',
 'Cypress spp.',
 'Desert ironwood  ',
 'Douglas-fir',
 'Engelmann oak',
 'Engelmann spruce',
 'Foxtail pine',
 'Fremont cottonwood',
 'Giant chinquapin',
 'Giant sequoia',
 'Grand fir',
 'Great Basin bristlecone pine',
 'Gum spp.',
 'Hawthorn spp.',
 'Holly',
 'Honey mesquite',
 'Incense-cedar',
 'Interior live oak',
 'Jeffrey pine',
 'Knobcone pine',
 'Limber pine',
 'Lodgepole pine',
 "MacNab's cypress",
 'Modoc cypress',
 'Monterey cypress',
 'Monterey pine',
 'Mountain hemlock',
 'Noble fir',
 'Northern California walnut',
 'Norway maple',
 'Oak, deciduous spp.',
 'Oregon ash',
 'Oregon crabapple',
 'Oregon white oak',
 'Pacific dogwood',
 'Pacific madrone',
 'Pacific silver fir',
 'Pacific yew',
 'Paper birch',
 'Plum/cherry spp.',
 'Ponderosa pine',
 'Port-Orford-cedar',
 'Quaking aspen',
 'Red alder',
 'Redwood',
 'Rocky Mountain juniper',
 'Rocky Mountain maple',
 'Russian olive',
 "Sargent's cypress",
 'Scotch pine',
 'Screwbean mesquite  ',
 'Shasta red fir',
 'Singleleaf pinyon',
 'Sitka spruce',
 'Southern California walnut',
 'Subalpine fir',
 'Subalpine larch',
 'Sugar pine',
 'Sweet cherry',
 'Sweetgum',
 'Tanoak',
 'Tasmanian bluegum',
 'Tree of heaven',
 'Tree, evergreen spp.',
 'Unknown dead hardwood',
 'Unknown softwood',
 'Unknown spp.',
 'Utah juniper',
 'Valley oak',
 'Velvet ash',
 'Walnut spp.',
 'Washoe pine',
 'Water birch',
 'Western hemlock',
 'Western juniper',
 'Western larch',
 'Western paper birch',
 'Western redcedar',
 'Western white pine',
 'White alder',
 'White fir',
 'White spruce',
 'White willow',
 'Whitebark pine',
 'Willow spp.']

In [22]:
# remove trees without a DIA or TPA_UNADJ value
treelist.dropna(axis = 0, how = 'any', subset = ["TPA_UNADJ", "DIA"], inplace = True)

In [23]:
# How many null values are there for each column now?
treelist.isnull().sum()


Out[23]:
ACTUALHT              795
BHAGE             1508540
COMMON_NAME             0
CR                 240043
DIA                     0
GENUS             1457174
HT                   6709
PLOT                    0
SPCD                    0
SPECIES_SYMBOL          0
SPGRPCD           1457174
SPP_GRP           1457174
STATUSCD                0
SUBP                    0
TOTAGE            1610039
TPA_UNADJ               0
TREE_ID                 0
dtype: int64

In [24]:
# identify the 2" diameter class for each tree:
# each function takes a tree's DBH and returns the lower (minDBH) or upper (maxDBH) bound of its 2" class

def minDBH(dbh):
    return int(np.floor(dbh/2)*2)

def maxDBH(dbh):
    # if a tree has an even DBH (e.g., 2.0), don't let maxDBH = minDBH...
    if np.ceil(dbh/2)*2 == np.floor(dbh/2)*2:
        return int(np.ceil(dbh/2)*2 +2)
    else:
        return int(np.ceil(dbh/2)*2)
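
A quick sanity check of the class boundaries, following directly from the definitions above:

# a 9.5" tree falls in the 8-10" class; an even 2.0" tree is pushed into the 2-4" class
assert (minDBH(9.5), maxDBH(9.5)) == (8, 10)
assert (minDBH(2.0), maxDBH(2.0)) == (2, 4)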

In [25]:
# calculate the min and max DBHs for all the trees
treelist["minDBH"] = treelist.DIA.apply(minDBH)
treelist["maxDBH"] = treelist.DIA.apply(maxDBH)

In [26]:
# Add a basal area field
treelist["BA_ft2_ac"] = (treelist.DIA ** 2) * 0.005454 * treelist.TPA_UNADJ

In [27]:
# create columns for each variant
# for each tree in the treelist, populate them with the species code FVS uses for that variant
variants = ['BM', 'CA', 'EC', 'PN', 'SO', 'WC']
for variant in variants:
    treelist[variant] = treelist['COMMON_NAME'].apply(lambda common_name: species_crosswalk[common_name][variant])
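
An equivalent way to populate the variant columns is to map each column through a plain dict rather than calling apply row by row; a sketch assuming species_crosswalk covers every common name in the treelist (verified earlier):

# build a {common_name: variant_code} lookup once per variant, then map it over the column
for variant in variants:
    lookup = {name: codes[variant] for name, codes in species_crosswalk.items()}
    treelist[variant] = treelist['COMMON_NAME'].map(lookup)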

In [28]:
# create the treelive treelist by keeping only live trees (STATUSCD == 1), dropping dead trees (STATUSCD == 2)
treelive = treelist.loc[treelist.STATUSCD == 1]

In [29]:
treelive.head()


Out[29]:
ACTUALHT BHAGE COMMON_NAME CR DIA GENUS HT PLOT SPCD SPECIES_SYMBOL ... TREE_ID minDBH maxDBH BA_ft2_ac BM CA EC PN SO WC
COND_ID
1150352010901 44 NaN Western redcedar 15 9.5 Thuja 44 71915 242 THPL ... 111 8 10 2.958263 GF RC RC RC RC RC
1150352010901 73 318 Western hemlock 40 32.8 Tsuga 73 71915 263 TSHE ... 112 32 34 5.808955 GF WH WH WH WH WH
1150352010901 66 95 Western hemlock 10 10.5 Tsuga 66 71915 263 TSHE ... 113 10 12 3.613834 GF WH WH WH WH WH
1150352010901 45 NaN Western redcedar 20 10.2 Thuja 45 71915 242 THPL ... 114 10 12 3.410279 GF RC RC RC RC RC
1150352010901 47 NaN Western hemlock 20 9.2 Tsuga 47 71915 263 TSHE ... 115 8 10 2.774376 GF WH WH WH WH WH

5 rows × 26 columns


In [30]:
# Calculate total live TPA of each species within each 2" diameter class

# create a blank dataframe that will hold treelive_summaries for all variants
treelive_summary = pd.DataFrame()

# create a tree_live summary for each variant
for variant in variants:
    # group by COND_ID, PLOT, COMMON_NAME, the variant column (that variant's FVS species code), and DBH class
    # compute the sum, mean, and count of TPA, BA, HT, DIA, and TOTAGE within each group
    grouped = pd.DataFrame(treelive.groupby([treelive.index, "PLOT", "COMMON_NAME", variant, "minDBH"])["TPA_UNADJ", "BA_ft2_ac", "HT", "DIA", "TOTAGE"].agg([np.sum, np.mean, 'count'])).reset_index()
    grouped.rename(columns={'COND_ID': 'cond_id'}, inplace=True)
    grouped.set_index("cond_id", inplace=True)
    
    # flatten the column names (groupby returned multi-level column names)
    grouped.columns = ['_'.join(col).strip() if col[1] != '' else col[0] for col in grouped.columns.values]
    
    # add a column to this dataframe that stores which variant it represents
    grouped["variant"] = variant
    
    # create a 'varname' column that concatenates common_name and diameter class
    grouped["varname"] = grouped.COMMON_NAME + "_" + grouped.minDBH.map(str)
    
    # rename the columns with what the IDB_summary schema in Forest Planner is expecting
    grouped.rename(columns={'PLOT': 'plot_id', variant: 'FVS_Spp_Code', 'COMMON_NAME': 'fia_forest_type_name', 
                            'minDBH': 'calc_dbh_class', 'TPA_UNADJ_sum': 'sumoftpa', 'TPA_UNADJ_mean': 'avgoftpa', 
                            'TPA_UNADJ_count': 'calc_tree_count', 'BA_ft2_ac_sum': 'sumofba_ft2_ac', 
                            'BA_ft2_ac_mean': 'avgofba_ft2_ac',  'HT_mean': 'avgofht_ft', 'DIA_mean': 'avgofdbh_in',
                            'TOTAGE_mean': 'avgofage_bh'}, inplace=True)
    
    # Map columns to the data types the TREELIVE_SUMMARY db schema expects (for those not already appropriately formatted)
    # mapping to floats
    for column in ['calc_dbh_class', 'sumoftpa', 'avgoftpa']:
        grouped[column] = grouped[column].map(float)
    grouped.reset_index(inplace = True)
    
    # append the summary for this variant to the master treelive_summary dataframe
    treelive_summary = treelive_summary.append(grouped, ignore_index = True)
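
Since the loop stacks one block per variant, a quick check that each variant contributed the same number of species-x-size-class rows; a sketch:

# every variant should report the same row count
print(treelive_summary.groupby('variant').size())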

In [31]:
treelive_summary.head()


Out[31]:
cond_id plot_id fia_forest_type_name FVS_Spp_Code calc_dbh_class sumoftpa avgoftpa calc_tree_count sumofba_ft2_ac avgofba_ft2_ac ... avgofht_ft HT_count DIA_sum avgofdbh_in DIA_count TOTAGE_sum avgofage_bh TOTAGE_count variant varname
0 1 1097 Bigleaf maple OH 0 49.10 49.10 1 0.002678 0.002678 ... 1 1 0.1 0.1 1 NaN NaN 0 BM Bigleaf maple_0
1 1 1097 Douglas-fir DF 0 49.10 49.10 1 0.385620 0.385620 ... 11 1 1.2 1.2 1 29 29 1 BM Douglas-fir_0
2 1 1097 Douglas-fir DF 2 98.20 49.10 2 8.146214 4.073107 ... 23 2 7.8 3.9 2 68 34 2 BM Douglas-fir_2
3 1 1097 Douglas-fir DF 4 35.63 35.63 1 6.313652 6.313652 ... 41 1 5.7 5.7 1 32 32 1 BM Douglas-fir_4
4 1 1097 Douglas-fir DF 10 9.28 9.28 1 6.462789 6.462789 ... 71 1 11.3 11.3 1 51 51 1 BM Douglas-fir_10

5 rows × 22 columns


In [32]:
treelive_summary.dtypes


Out[32]:
cond_id                   int64
plot_id                   int64
fia_forest_type_name     object
FVS_Spp_Code             object
calc_dbh_class          float64
sumoftpa                float64
avgoftpa                float64
calc_tree_count           int64
sumofba_ft2_ac          float64
avgofba_ft2_ac          float64
BA_ft2_ac_count           int64
HT_sum                  float64
avgofht_ft              float64
HT_count                  int64
DIA_sum                 float64
avgofdbh_in             float64
DIA_count                 int64
TOTAGE_sum              float64
avgofage_bh             float64
TOTAGE_count              int64
variant                  object
varname                  object
dtype: object

In [33]:
# Add columns to treelive_summary recording the total BA of each COND_ID, the count of species-x-size classes in that COND_ID,
# and the % of that COND_ID's total basal area found in each species-x-size class

# Create a dataframe with the total basal area and count of species-x-size classes for each COND_ID
# (computed from a single variant, BM, so each class is counted only once)
cond_summary = pd.DataFrame(treelive_summary.loc[treelive_summary.variant == 'BM'].groupby(['cond_id'])['sumofba_ft2_ac'].agg(['sum', 'count']))
#cond_summary.set_index("cond_id", inplace=True)

# look up the total BA for each COND_ID in the cond_summary dataframe, apply it to every species-x-size class in the treelive_summary
treelive_summary['total_ba_ft2_ac'] = treelive_summary.reset_index()['cond_id'].apply(lambda id: cond_summary.at[id,'sum'])

# look up the number of species-x-size classes in the cond_summary dataframe, apply it to every species-x-size class in the treelive_summary
treelive_summary['count_speciessizeclasses'] = treelive_summary.reset_index()['cond_id'].apply(lambda id: cond_summary.at[id,'count']).map(int)

# Calculate the % of total basal area found in each species-x-size class
treelive_summary['pct_of_totalba'] = treelive_summary['sumofba_ft2_ac']/treelive_summary['total_ba_ft2_ac']
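
Because total_ba_ft2_ac is derived from a single variant's rows (BM) and every variant repeats the same species-x-size classes, the shares within any one variant should sum to ~1.0 per condition. A sketch of that check:

# within a single variant, the basal-area shares for each condition should sum to ~1.0
bm = treelive_summary.loc[treelive_summary.variant == 'BM']
print(bm.groupby('cond_id')['pct_of_totalba'].sum().describe())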

In [34]:
treelive_summary.dtypes


Out[34]:
cond_id                       int64
plot_id                       int64
fia_forest_type_name         object
FVS_Spp_Code                 object
calc_dbh_class              float64
sumoftpa                    float64
avgoftpa                    float64
calc_tree_count               int64
sumofba_ft2_ac              float64
avgofba_ft2_ac              float64
BA_ft2_ac_count               int64
HT_sum                      float64
avgofht_ft                  float64
HT_count                      int64
DIA_sum                     float64
avgofdbh_in                 float64
DIA_count                     int64
TOTAGE_sum                  float64
avgofage_bh                 float64
TOTAGE_count                  int64
variant                      object
varname                      object
total_ba_ft2_ac             float64
count_speciessizeclasses      int64
pct_of_totalba              float64
dtype: object

In [40]:
treelive_summary[['cond_id', 'calc_dbh_class', 'fia_forest_type_name', 'avgofdbh_in', 'avgofht_ft']].loc[(treelive_summary.cond_id == 521) & (treelive_summary.fia_forest_type_name == "Pacific silver fir")]


Out[40]:
cond_id calc_dbh_class fia_forest_type_name avgofdbh_in avgofht_ft
2293 521 0 Pacific silver fir 1.6 7
2294 521 6 Pacific silver fir 7.7 42
644509 521 0 Pacific silver fir 1.6 7
644510 521 6 Pacific silver fir 7.7 42
1286725 521 0 Pacific silver fir 1.6 7
1286726 521 6 Pacific silver fir 7.7 42
1928941 521 0 Pacific silver fir 1.6 7
1928942 521 6 Pacific silver fir 7.7 42
2571157 521 0 Pacific silver fir 1.6 7
2571158 521 6 Pacific silver fir 7.7 42
3213373 521 0 Pacific silver fir 1.6 7
3213374 521 6 Pacific silver fir 7.7 42
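
The repeated rows above are the six per-variant copies of each species-x-size class, which is also why the final cell divides the row count by len(variants). A sketch for pulling a single variant's copy when only one set of rows is needed:

# one row per species-x-size class: restrict to a single variant (PN shown as an example)
single_variant = treelive_summary.loc[treelive_summary.variant == 'PN']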

In [35]:
# write the treelive_summary to a CSV
cols_to_write = ['cond_id', 'plot_id', 'variant', 'varname','fia_forest_type_name', 'FVS_Spp_Code', 'calc_dbh_class', 'calc_tree_count', 'sumoftpa', 'avgoftpa', 'sumofba_ft2_ac', 'avgofba_ft2_ac', 'avgofht_ft', 'avgofdbh_in', 'avgofage_bh', 'total_ba_ft2_ac', 'count_speciessizeclasses', 'pct_of_totalba']
treelive_summary.to_csv(r"G:\projects\ForestPlanner_2015\Data\Processed\TREELIVE_SUMMARY_" + time.strftime("%Y-%m-%d") + ".csv", columns = cols_to_write, header = True, index = True, index_label = "class_id", quoting=csv.QUOTE_NONNUMERIC)

print("printed " + str(len(treelive_summary)/len(variants)) + " species-x-dbh classes")
print("from "+ str(len(pd.unique(treelive_summary.cond_id))) + " conditions to:")
print("G:\projects\ForestPlanner_2015\Data\Processed\TREELIVE_SUMMARY_" + time.strftime("%Y-%m-%d") + ".csv")


printed 642216 species-x-dbh classes
from 42815 conditions to:
G:\projects\ForestPlanner_2015\Data\Processed\TREELIVE_SUMMARY_2015-12-15.csv