Import agate, a Python-based data analysis library.
In [1]:
import agate
The well database is a big one, so I'm going to force agate to read the NrdPermit field as text and PostalCD (the zip code) as text as well. Limiting the number of rows agate checks for type inference also helps this code run faster.
In [2]:
tester = agate.TypeTester(force={
    'NrdPermit': agate.Text(),
    'PostalCD': agate.Text()
}, limit=100)

wells = agate.Table.from_csv('wells.csv', column_types=tester)
In [3]:
print(wells)
Since I'm going to determine whether a well is active during a given year, I want to find the minimum and maximum completion dates for these wells.
In [4]:
oldest = wells.aggregate(agate.Min('Compdate'))
In [5]:
print(oldest)
In [6]:
newest = wells.aggregate(agate.Max('Compdate'))
In [7]:
print(newest)
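Before writing the active-well logic, it's also worth knowing how many wells are missing these dates, since those rows need special handling. Here's a quick sketch (not part of the original run) that counts the nulls in Compdate and in DecommDate, the decommission-date column used later on.
In [ ]:
# Hedged sketch: count wells with no completion or decommission date.
# These nulls are what the sentinel years in the ActiveWell class below account for.
no_comp = wells.where(lambda row: row['Compdate'] is None)
no_decomm = wells.where(lambda row: row['DecommDate'] is None)

print('{0} wells have no completion date'.format(len(no_comp.rows)))
print('{0} wells have no decommission date'.format(len(no_decomm.rows)))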
So this piece finds whether a well is active or not in a given year. The code is a custom Python class that inherits from agate's Computation class; using a custom Computation class allows me to provide the function with multiple inputs, since the typical agate.Formula() is limited to one.
In [8]:
class ActiveWell(agate.Computation):
    """
    Computes whether a well is active in a given year.
    """
    def __init__(self, column_name_start, column_name_end, year):
        self._column_start = column_name_start
        self._column_end = column_name_end
        self._year = year

    def get_computed_data_type(self, table):
        """
        The return value is a boolean.
        """
        return agate.Boolean()

    def validate(self, table):
        """
        Verify that both columns are dates.
        """
        column_start = table.columns[self._column_start]
        if not isinstance(column_start.data_type, agate.Date):
            raise agate.DataTypeError('Can only be applied to date data.')

        column_end = table.columns[self._column_end]
        if not isinstance(column_end.data_type, agate.Date):
            raise agate.DataTypeError('Can only be applied to date data.')

    def run(self, table):
        new_column = []

        for row in table.rows:
            # Get the start value from the start column, i.e. the year the well was constructed.
            start_val = row[self._column_start]

            # Some wells don't have completion dates, so nulls are treated as the year 1700.
            if start_val is not None:
                start_year = start_val.year
            else:
                start_year = 1700

            # Same process with the decommission date; nulls are treated as the year 5000.
            end_val = row[self._column_end]

            if end_val is not None:
                end_year = end_val.year
            else:
                end_year = 5000

            # If the well was constructed before this year and hasn't been decommissioned yet,
            # mark it as an active well.
            if start_year < self._year < end_year:
                new_column.append(True)
            else:
                new_column.append(False)

        return new_column
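Before running the class against the full table, here's a quick sanity check I'd suggest (a sketch, not part of the original notebook): apply ActiveWell to a tiny hand-built table. It also makes the boundary behavior visible, since the strict inequality means a well completed during 2010 does not count as active in 2010.
In [ ]:
import datetime

# Hedged sketch: a three-row toy table to confirm ActiveWell's behavior.
toy = agate.Table(
    [
        (datetime.date(1995, 6, 1), None),                       # still active in 2010
        (datetime.date(2010, 3, 15), None),                      # completed during 2010, so not counted
        (datetime.date(1980, 1, 1), datetime.date(2001, 7, 4)),  # decommissioned before 2010
    ],
    column_names=['Compdate', 'DecommDate'],
    column_types=[agate.Date(), agate.Date()]
)

toy = toy.compute([
    ('2010active', ActiveWell('Compdate', 'DecommDate', 2010))
])
toy.print_table()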
To check my ActiveWell class against the real data, I computed which wells were active in 2010. After filtering so only the active wells remain, I grouped by county and counted the active wells in each one. The total number of active wells also helped me debug.
In [9]:
check_wells = wells.compute([
    ('2010active', ActiveWell('Compdate', 'DecommDate', 2010))
])

active_wells = check_wells.where(lambda row: row['2010active'])
county_groups = active_wells.group_by('Countyname')

well_counts = county_groups.aggregate([
    ('count', agate.Count())
])

total = well_counts.aggregate(agate.Sum('count'))
print(total)

print_table = well_counts.select(['Countyname', 'count'])
print_table.print_table(10)
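The printout above lists counties in whatever order the grouping produced them. A small follow-up sketch (again, not in the original run) sorts the same well_counts table so the counties with the most active wells come first.
In [ ]:
# Hedged sketch: rank the 2010 counts so the counties with the most
# active wells appear at the top of the printout.
ranked = well_counts.order_by('count', reverse=True)
ranked.select(['Countyname', 'count']).print_table(10)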
The years in the list were chosen because the USDA ag census was conducted in those years. Like the 2010 test analysis above, this program loops through each year in the list, finds the active wells in that year, groups by county, and counts the number of active wells. Finally, each year's county well counts are written to a CSV file.
In [13]:
years = [1997, 2002, 2007, 2012]

for year in years:
    print("Begin {0} well analysis".format(year))
    year_category = '{0}active'.format(year)

    check_wells = wells.compute([
        (year_category, ActiveWell('Compdate', 'DecommDate', year))
    ])

    active_wells = check_wells.where(lambda row: row[year_category])
    county_groups = active_wells.group_by('Countyname')

    well_counts = county_groups.aggregate([
        ('Wells', agate.Count())
    ])

    print_table = well_counts.select(['Countyname', 'Wells'])
    print_table.print_table(10)

    total = well_counts.aggregate(agate.Sum('Wells'))
    print("Total active wells in {0} is {1}".format(year, total))

    well_counts.to_csv('wells{0}.csv'.format(year))
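Since each year now lives in its own CSV, a natural follow-up is a side-by-side comparison of census years. Here's a sketch of one way to do it (not part of the original analysis), assuming the wells{year}.csv files written above: rename the count columns so they don't collide, then join on county name.
In [ ]:
# Hedged sketch: join two of the per-year outputs for a county-level comparison.
wells_1997 = agate.Table.from_csv('wells1997.csv').rename(column_names={'Wells': 'Wells1997'})
wells_2012 = agate.Table.from_csv('wells2012.csv').rename(column_names={'Wells': 'Wells2012'})

# Default left join keyed on county name; counties absent from the 2012 file get null counts.
comparison = wells_1997.join(wells_2012, 'Countyname', 'Countyname')
comparison.print_table(10)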