In [35]:
import pandas as pd
import numpy as np
columns = ["State","State_ANSI", "Ag_District", "Ag_District_Code", "County",
"County_ANSI","Commodity","Data_Item", "Domain", "Domain_Category", "Value", "CV", "FIPS"]
path = 'berries_by_county.csv'
berry_df = pd.read_csv(path, names=columns, skiprows=1)
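A sketch of an alternative load (not how this notebook was run): passing dtype to read_csv brings the ANSI code columns in as strings from the start, which avoids the astype(str) conversions in the next cell and any float artifacts (e.g. '6.0') if a code column contains missing values.

# hypothetical alternative: keep the ANSI codes as strings when reading
berry_df_alt = pd.read_csv(path, names=columns, skiprows=1,
                           dtype={'State_ANSI': str, 'County_ANSI': str})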
In [36]:
# make strings to concatenate into FIPS codes
berry_df.State_ANSI = berry_df.State_ANSI.astype(str)
berry_df.County_ANSI = berry_df.County_ANSI.astype(str)
# title case is more readable than all caps
berry_df.State = berry_df.State.str.title()
berry_df.County = berry_df.County.str.title()
berry_df.Commodity = berry_df.Commodity.str.title()
In [37]:
def prepend_zeros(row):
    """Left-pad County_ANSI to three digits so every county code has the same width."""
    if len(row.County_ANSI) == 1:
        return ''.join(['00', row.County_ANSI])
    elif len(row.County_ANSI) == 2:
        return ''.join(['0', row.County_ANSI])
    else:
        return row.County_ANSI

berry_df.County_ANSI = berry_df.apply(prepend_zeros, axis=1)
In [38]:
# build the combined FIPS id by concatenating the state and county ANSI codes
berry_df.FIPS = berry_df.apply(lambda row: ''.join([row.State_ANSI, row.County_ANSI]), axis = 1)
berry_df.head()
Out[38]:
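For reference, the zero-padding and concatenation in the two cells above can also be done with vectorized string methods; a sketch of the equivalent (not run here):

# hypothetical vectorized equivalent of prepend_zeros plus the FIPS join
county_padded = berry_df.County_ANSI.str.zfill(3)        # '7' -> '007', '23' -> '023'
fips_codes = berry_df.State_ANSI.str.cat(county_padded)  # state code + padded county code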
In [62]:
berry_df2 = berry_df.Data_Item.apply(lambda x: pd.Series(x.split(' - ')))
berry_df2.columns = ['Spec_Commodity', 'Measure']
berry_df2.head()
Out[62]:
In [63]:
berry_df3 = pd.merge(berry_df, berry_df2, left_index= True, right_index= True)
In [64]:
berry_df3.head()
Out[64]:
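For reference, the split and merge in the two cells above can be collapsed with a recent pandas; a sketch of the equivalent (not run here, berry_df3_alt is a hypothetical name):

# hypothetical equivalent: expand the split directly into two columns, then join on the index
parts = berry_df.Data_Item.str.split(' - ', expand=True)
parts.columns = ['Spec_Commodity', 'Measure']
berry_df3_alt = berry_df.join(parts)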
In [65]:
# boolean mask over the Value column for rows reporting acres grown: True where the value contains the suppression code "(D)"
test_series = berry_df3[berry_df3.Measure == "ACRES GROWN"].Value.str.contains("(D)", regex=False)
# proportion of suppressed rows (True / total)
float(np.sum(test_series))/len(test_series)
Out[65]:
From USDA Census Report, Appendix A:
DISCLOSURE REVIEW
After tabulation and review of the aggregates, a comprehensive disclosure review was conducted. NASS is obligated to withhold, under Title 7, U.S. Code, any total that would reveal an individual’s information or allow it to be closely estimated by the public. Cell suppression was used to protect the cells that were determined to be sensitive to a disclosure of information. Farm counts are not considered sensitive and are not subject to disclosure controls.
Based on agency standards, data cells were determined to be sensitive to a disclosure of information if they violated either of two criteria rules. The threshold rule was violated if the data cell contained less than three operations. For example, if only one farmer produced turkeys in a county, NASS could not publish the county total for turkey inventory without disclosing that individual’s information. The dominance rule was violated if the distribution of the data within the cell allowed a data user to estimate any respondent’s data too closely. For example, if there are many farmers producing turkeys in a county and some of them were large enough to dominate the cell total, NASS could not publish the county total for turkey inventory without risking disclosing an individual respondent’s data. In both of these situations, the data were suppressed and a “(D)” was placed in the cell in the census publication table. These data cells were referred to as primary suppressions.
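If the suppressed acreage values were needed numerically, one option (a sketch, not used below) is to strip the thousands separators and let pd.to_numeric coerce the '(D)'/'(Z)' codes to NaN, which can then be filled or dropped explicitly:

# hypothetical cleanup of a numeric column containing USDA suppression codes
cleaned = (berry_df3.Value
           .str.replace(',', '')                  # drop thousands separators
           .pipe(pd.to_numeric, errors='coerce')  # '(D)', '(Z)', etc. become NaN
           .fillna(0))                            # or keep NaN to leave suppression visible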
In [43]:
# IF USING ACRES HARVESTED INSTEAD: replace suppression-code Values such as '(D)' and '(Z)' with 0
#berry_df3.Value = berry_df3.Value.replace(to_replace = r'\([A-Z]+\)', value = '0', regex=True)
In [66]:
# keep only the operation counts and drop the irrigated berry totals; .copy() so the assignments below modify berry_df4 rather than a view of berry_df3
berry_df4 = berry_df3[(berry_df3.Measure == "OPERATIONS WITH AREA GROWN") &
                      (berry_df3.Spec_Commodity != "BERRY TOTALS, IRRIGATED")].copy()
berry_df4.head()
Out[66]:
In [67]:
# replace missing values with 0 and strip thousands separators so Value can be cast to int
berry_df4.Value = berry_df4.Value.fillna(0)
berry_df4.Value = berry_df4.Value.replace(to_replace=',', value='', regex=True)
In [68]:
berry_df4.Value = berry_df4.Value.astype(int)
berry_df4.FIPS = berry_df4.FIPS.astype(int)
In [69]:
berry_df4.head()
Out[69]:
In [70]:
# check out California berries
berry_df4[berry_df4.State == 'California'].head(50)
Out[70]:
In [71]:
# pivot to one row per county (FIPS) and one column per berry commodity, summing operation counts
berry_pt = pd.pivot_table(berry_df4, values='Value', index='FIPS', columns='Commodity', aggfunc=np.sum)
The pivot table has lots of NaN values (counties with no operations for a given commodity); convert them to zeros.
In [72]:
berry_pt.head()
Out[72]:
In [73]:
berry_pt = berry_pt.fillna(0)
berry_pt.head()
Out[73]:
In [75]:
berry_pt.to_csv('berries.csv')