Here we download the raw county-level water-use data for the years 2000, 2005, and 2010 from the USGS water-use site and synthesize them into a single tidy table listing year, FIPS code, and annual withdrawals in MGal/day, tagged by use class, source class (surface or groundwater), and source type (fresh or saline). This table is formatted as follows:
| STATE | FIPS | YEAR | UseClass | SrcClass | SrcType | Amount |
|---|---|---|---|---|---|---|
| AL | 01001 | 2000 | PublicSupply | Surface | Fresh | 0.00 |
| AL | 01001 | 2000 | Industrial | Groundwater | Saline | 0.00 |
This table can then easily be joined, by YEAR and FIPS code, to other accounting data tables and summarized by state.
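For example, once the tidy table is built (as dfUse below), such a join and state-level rollup might look like this. This is a minimal sketch: dfOther stands in for a hypothetical companion accounting table keyed by the same YEAR and FIPS fields.
#Sketch: join the tidy usage table to a hypothetical companion table, then summarize by state
dfJoined = pd.merge(dfUse, dfOther, on=['YEAR','FIPS'], how='inner')
stateTotals = dfJoined.groupby(['STATE','YEAR'])['Amount'].sum()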
In [1]:
#Import modules required for analysis
import os
import pandas as pd
import numpy as np
In [2]:
#Create a remapping dictionary to standardize 2000/2005 column names to the 2010 scheme
remapDict = {'IT-':'IR-',     #Irrigation (total)
             'LA-':'AQ-',     #Aquaculture
             'LS-':'LI-',     #Livestock
             'PE-':'PC-',     #Closed-loop (recirculating) thermoelectric
             'Wtotl':'WTotl'  #Capitalization mismatch
            }
In [3]:
#Create a function for importing a table and standardizing all field names
def importTable(year, remapDict):
    #Set the download URL using the year
    theURL = 'http://water.usgs.gov/watuse/data/{0}/usco{0}.txt'.format(year)
    #Download the dataset to a data frame (keeping the FIPS attribute as a string)
    df = pd.read_table(theURL, dtype={'FIPS':str})
    #Ensure that the FIPS column maintains 5 characters
    df['FIPS'] = df['FIPS'].apply(lambda x: str(x).zfill(5))
    #Remove the STATEFIPS and COUNTYFIPS columns (as they are not needed)
    df.drop(['STATEFIPS','COUNTYFIPS'], axis=1, inplace=True)
    #Use the remap dictionary to rename columns
    #Get the current column names as a list
    colNames = df.columns.values.tolist()
    for inFld, outFld in remapDict.items():
        #Replace each occurrence of the old prefix with the standardized one
        colNames = [x.replace(inFld, outFld) for x in colNames]
    #Update the column names in the data frame
    df.columns = colNames
    #Add a YEAR field, if not present
    if 'YEAR' not in df.columns.values:
        df.insert(1, 'YEAR', year)
    #Remove unnamed columns, which oddly appear in the 2005 and 2010 datasets
    if 'Unnamed' in df.columns.values[-1]:
        df.drop(df.columns.values[-1], axis=1, inplace=True)
    #Status
    print("{} records and {} attributes returned for year {}".format(df.shape[0], df.shape[1], year))
    #Return the data frame
    return df
In [4]:
#Get the tables
df2000 = importTable(2000,remapDict)
df2005 = importTable(2005,remapDict)
df2010 = importTable(2010,remapDict)
In [5]:
#CHECK: example output, with a column for each use/source/type grouping
df2000.head()
Out[5]:
In [6]:
#"Melt" data so each use/class/type becomes a new row
df2000m = pd.melt(df2000,id_vars=['STATE','FIPS','YEAR'],var_name='Class',value_name='Amount')
df2005m = pd.melt(df2005,id_vars=['STATE','FIPS','YEAR'],var_name='Class',value_name='Amount')
df2010m = pd.melt(df2010,id_vars=['STATE','FIPS','YEAR'],var_name='Class',value_name='Amount')
In [7]:
#CHECK: Observe how the data has been 'melted'
df2000m.head()
Out[7]:
In [8]:
#Merge all records into a single table
dfUse = pd.concat([df2000m, df2005m, df2010m],ignore_index=True)
In [10]:
#CHECK: the dfUse frame should have as many rows as the 2000, 2005, and 2010 data combined
dfUse.shape[0]
Out[10]:
In [11]:
#Create remap dictionaries. These enable extraction of specific sub-attributes
# contained in the Class field. For example, PS-WGWFr refers to fresh groundwater
# withdrawn for public supply.
useClassMap = {'TP':'Population',
'PS':'PublicSupply',
'DO':'Domestic',
'IN':'Industrial',
'IR':'Irrigation',
'IC':'Irrigation_Crop',
'IG':'Irrigation_Golf',
'LI':'Livestock',
'AQ':'Aquaculture',
'MI':'Mining',
'PT':'ThermoElec',
'PO':'ThermoElec_OnceThru',
'PC':'ThermoElec_Recirc',
'TO':'Total'
}
srcClassMap = {'-WGW':'Groundwater',
'-WSW':'Surface'
}
srcTypeMap = {'Fr':'Fresh',
'Sa':'Saline'
}
In [12]:
#Set the use classes (PS, DO, LI, etc.)
for code,value in useClassMap.items():
dfUse.loc[dfUse.Class.str.startswith(code), 'UseClass'] = value
In [13]:
#Set the source classes (surface or groundwater)
for code,value in srcClassMap.items():
dfUse.loc[dfUse.Class.str.contains(code), 'SrcClass'] = value
In [14]:
#Set the source types (fresh or saline)
for code,value in srcTypeMap.items():
dfUse.loc[dfUse.Class.str.endswith(code), 'SrcType'] = value
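The three loops above decode the Class codes piecewise. An equivalent one-pass alternative (a sketch, assuming pandas 0.18+ and relying on the 'XX-W<GW|SW><Fr|Sa>' pattern of the withdrawal codes) would extract all three components with a single regular expression:
#Sketch: parse use, source, and type codes in one pass (non-matching classes yield NaN, as above)
parts = dfUse['Class'].str.extract(r'^([A-Z]{2})-W(GW|SW)(Fr|Sa)', expand=True)
dfUse['UseClass'] = parts[0].map(useClassMap)
dfUse['SrcClass'] = parts[1].map({'GW':'Groundwater','SW':'Surface'})
dfUse['SrcType'] = parts[2].map(srcTypeMap)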
In [22]:
#CHECK: List classes with null UseClass remaps
dfUse[pd.isnull(dfUse['UseClass'])].Class.unique()
Out[22]:
In [24]:
#CHECK: List classes with null srcClass remaps
dfUse[pd.isnull(dfUse['SrcClass'])].Class.unique()
Out[24]:
In [25]:
#CHECK: List classes with null SrcType remaps
dfUse[pd.isnull(dfUse['SrcType'])].Class.unique()
Out[25]:
In [26]:
#Remove rows (axis=0) that have a null value in any column (how='any')
dfOutput = dfUse.dropna(axis=0,how='any')
In [27]:
#CHECK: see that new columns were added to decode the Class
dfOutput.head()
Out[27]:
In [28]:
#Set the output location and filename, then save as CSV
dataDir = '../../Data'
outFN = 'UsageDataTidy.csv'
dfOutput.to_csv(os.path.join(dataDir, outFN), index=False)
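As a final sanity check (a sketch; it simply reloads the file just written), the CSV can be read back to confirm it round-trips with the FIPS codes intact:
#Sketch: reload the output and inspect the first few rows
dfCheck = pd.read_csv(os.path.join(dataDir, outFN), dtype={'FIPS':str})
dfCheck.head()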