In [1]:

    
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
import os
import os.path



In [2]:

    
working_dir = os.getcwd()
sub_dir = 'ozone'

#path for each ozone file
file_dir = os.path.join(working_dir, sub_dir)
ozone_dir = [file_dir + "/" + filename for filename in os.listdir(file_dir)]



In [4]:

    
def create_df(path):
    """
    Given the file path, it creates dataframe
    Seperation in file must be "|"
    
    input: a file path
    output: a data frame
    
    """
    #read in the file
    ozone_hourly = pd.read_table(path, sep = '|')
    
    #pandas automatically skips the NaN values while taking the mean.
    ozone_daily = pd.DataFrame(ozone_hourly.groupby(['site', 'date'])['obs'].mean())
    ozone_daily.reset_index(inplace = True)
    
    return ozone_daily
                   

def convert_to_daily(data_directory):
    """
    Given the data directory with hourly data,
    it converts observations into daily data
    
    input: data directory for all files
    output: one data frame for all files
    """
    
    for index, path in enumerate(data_directory):
        
        if index == 0:
            ozone_daily = create_df(path)
        else:
            merge_to = create_df(path)
            ozone_daily = ozone_daily.append(merge_to)
    return ozone_daily



In [5]:

    
ozone_daily = convert_to_daily(ozone_dir)

Although these two sites are listed in the Location file they are not found in the 'ozone' data set. Check out, "MSA name" column in the Location.xlxs. I think they are not used for monitoring air quality parameters.



In [24]:

    
ozone_daily['site'][ozone_daily['site'].isin([2778, 2783])]









    Out[24]:





Series([], Name: site, dtype: int64)

The number of unique sites in Location file is around 2100. However, as you can see below, this number is 485 in the "ozone" data set.



In [25]:

    
len(ozone_daily['site'].unique())









    Out[25]:





485



In [39]:

    
#this was done once in the beginning to
#save data set as ".csv" file.
#ozone_daily.to_csv('daily_ozone_obs_1980_2014.csv', sep = ',')



In [26]:

    
locations = pd.read_excel('Location.xlsx')



In [11]:

    
def get_county_site(locations, county = 'Colusa'):
    
    county_of_interest = (locations.set_index(['County Name', 'Site']).loc[county])
    county_of_interest = county_of_interest.reset_index()
    county_sites = county_of_interest['Site']
    return county_sites



In [35]:

    
colusa_sites = get_county_site(locations).dropna()
colusa_daily_ozone = ozone_daily[ozone_daily['site'].isin(colusa_sites)]
colusa_daily_ozone = (colusa_daily_ozone.reset_index().
                      drop('index', axis = 1))



In [38]:

    
#this also was done only once to save the output as csv file.
colusa_daily_ozone.to_csv('colusa_daily_ozone_1980_2014.csv', sep = ',')



In [36]:

    
colusa_daily_ozone.head()



In [37]:

    
colusa_daily_ozone['site'].unique()









    Out[37]:





array([2109, 2395, 2744])

So, three "Colusa" sites have been constantly collecting "ozone" observations since 1980.

	site	date	obs
0	2109	1980-09-12	0.047273
1	2109	1980-09-13	0.025833
2	2109	1980-09-14	0.030833
3	2109	1980-09-15	0.030417
4	2109	1980-09-16	0.042500