In [80]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
import os
import os.path

In [81]:
working_dir = os.getcwd()
sub_dir = 'pm25'

# Directory containing the individual PM2.5 data files.
file_dir = os.path.join(working_dir, sub_dir)
# Full path for each PM file. os.path.join is portable across OSes, and
# sorting makes the list order (and thus pm_dir[0] below) reproducible —
# os.listdir returns entries in arbitrary order.
pm_dir = [os.path.join(file_dir, filename) for filename in sorted(os.listdir(file_dir))]

In [88]:
# Load the first PM2.5 file and inspect its columns.
pm = pd.read_csv(pm_dir[0])
print (pm.columns)

# Drop metadata columns not needed for the analysis: Monitor, StartHour,
# AppliesToNatl, AppliesToSt, CollectionMethod, QuantificationMethod.
# Dropping by name (instead of positional index) is robust to column
# reordering, and avoiding inplace=True keeps the cell idempotent.
drop_cols = ['Monitor', 'StartHour', 'AppliesToNatl', 'AppliesToSt',
             'CollectionMethod', 'QuantificationMethod']
pm = pm.drop(columns=drop_cols)


Index(['Site', 'Monitor', 'Date', 'StartHour', 'Value', 'AppliesToNatl',
       'AppliesToSt', 'CollectionMethod', 'QuantificationMethod'],
      dtype='object')

In [83]:
# Site metadata workbook; expected to contain at least the
# 'County Name' and 'Site' columns used by get_county_site below.
locations = pd.read_excel('Location.xlsx')

In [84]:
def get_county_site(locations, county = 'Colusa'):
    """
    Return the 'Site' IDs of all monitoring sites in the given county.

    Parameters
    ----------
    locations : pd.DataFrame
        Must contain 'County Name' and 'Site' columns.
    county : str, default 'Colusa'
        County to select sites for.

    Returns
    -------
    pd.Series
        The 'Site' values for that county, with a fresh RangeIndex.
        Empty if the county is not present (the previous index-based
        lookup raised KeyError in that case).
    """
    # A boolean mask is simpler than the set_index/.loc/reset_index
    # round-trip and degrades gracefully when the county is absent.
    in_county = locations['County Name'] == county
    county_sites = locations.loc[in_county, 'Site'].reset_index(drop=True)
    return county_sites

In [89]:
# Restrict the PM2.5 records to monitoring sites located in Colusa county.
colusa_sites = get_county_site(locations, county='Colusa')
site_in_colusa = pm['Site'].isin(colusa_sites)
colusa_pm = pm.loc[site_in_colusa]

In [90]:
#Save the filtered Colusa data to disk. Kept commented out so that
#re-running the notebook does not overwrite the existing export.
#colusa_pm.to_csv('PM25_weekly_98_14', sep = ",")

Although these two sites are listed in the Location file, they are not found in the 'pm' data set. Check the 'MSA Name' column in Location.xlsx — these sites are probably not used for monitoring air-quality parameters.

The number of unique sites in the Location file is around 2,100. However, as shown below, only 485 of them appear in the 'pm' data set.


In [ ]: