In [1]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
import os
import os.path

In [2]:
working_dir = os.getcwd()
sub_dir = 'ozone'

# build the path to each hourly ozone file
file_dir = os.path.join(working_dir, sub_dir)
ozone_dir = [os.path.join(file_dir, filename) for filename in os.listdir(file_dir)]
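
An equivalent, slightly more compact way to collect these paths (a sketch, not part of the original session) uses the glob module:

import glob

# same list of paths, gathered in one call; sorted() makes the
# order deterministic across platforms
ozone_dir = sorted(glob.glob(os.path.join(file_dir, '*')))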

In [4]:
def create_df(path):
    """
    Given the file path, it creates dataframe
    Seperation in file must be "|"
    
    input: a file path
    output: a data frame
    
    """
    #read in the file
    ozone_hourly = pd.read_table(path, sep = '|')
    
    #pandas automatically skips the NaN values while taking the mean.
    ozone_daily = pd.DataFrame(ozone_hourly.groupby(['site', 'date'])['obs'].mean())
    ozone_daily.reset_index(inplace = True)
    
    return ozone_daily
                   

def convert_to_daily(data_directory):
    """
    Given the data directory with hourly data,
    it converts observations into daily data
    
    input: data directory for all files
    output: one data frame for all files
    """
    
    # build one daily frame per file, then stack them;
    # DataFrame.append was removed in pandas 2.0, so pd.concat is used instead
    daily_frames = [create_df(path) for path in data_directory]
    ozone_daily = pd.concat(daily_frames, ignore_index=True)
    return ozone_daily

In [5]:
ozone_daily = convert_to_daily(ozone_dir)
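
As a quick sanity check (a sketch; these lines were not run in the original session, and they assume the 'date' column holds ISO-formatted date strings, so string min/max gives the true range), the combined frame can be inspected:

# expected columns from create_df(): 'site', 'date', 'obs'
print(ozone_daily.shape)
print(ozone_daily['date'].min(), ozone_daily['date'].max())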

Although these two sites (2778 and 2783) are listed in the Location file, they are not found in the 'ozone' data set. Check the "MSA name" column in Location.xlsx; I think these sites are not used for monitoring air quality parameters.


In [24]:
ozone_daily['site'][ozone_daily['site'].isin([2778, 2783])]


Out[24]:
Series([], Name: site, dtype: int64)

The number of unique sites in the Location file is around 2100; however, as you can see below, only 485 of them appear in the "ozone" data set.


In [25]:
len(ozone_daily['site'].unique())


Out[25]:
485

In [39]:
#this was done once at the beginning to
#save the data set as a ".csv" file.
#ozone_daily.to_csv('daily_ozone_obs_1980_2014.csv', sep = ',')

In [26]:
locations = pd.read_excel('Location.xlsx')

In [11]:
def get_county_site(locations, county = 'Colusa'):
    """
    Return the site IDs listed for the given county in the locations frame.
    """
    # index by county and site, then select every row for this county
    county_of_interest = locations.set_index(['County Name', 'Site']).loc[county]
    county_of_interest = county_of_interest.reset_index()
    county_sites = county_of_interest['Site']
    return county_sites
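
The set_index / .loc pattern above pulls every row for one county; an equivalent boolean-mask lookup (a sketch, not in the original notebook; the column names 'County Name' and 'Site' come from Location.xlsx) is a bit more direct:

# same result via a boolean mask on the 'County Name' column
colusa_sites_alt = locations.loc[locations['County Name'] == 'Colusa', 'Site']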

In [35]:
colusa_sites = get_county_site(locations).dropna()
colusa_daily_ozone = ozone_daily[ozone_daily['site'].isin(colusa_sites)]
colusa_daily_ozone = colusa_daily_ozone.reset_index(drop=True)

In [38]:
#this was also done only once, to save the output as a ".csv" file.
colusa_daily_ozone.to_csv('colusa_daily_ozone_1980_2014.csv', sep = ',')
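
Note that to_csv also writes the pandas row index as an extra leading column by default; a variant that keeps only the three data columns (a sketch, not the call used above) would be:

# index=False drops the pandas row index from the output file
colusa_daily_ozone.to_csv('colusa_daily_ozone_1980_2014.csv', index=False)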

In [36]:
colusa_daily_ozone.head()


Out[36]:
site date obs
0 2109 1980-09-12 0.047273
1 2109 1980-09-13 0.025833
2 2109 1980-09-14 0.030833
3 2109 1980-09-15 0.030417
4 2109 1980-09-16 0.042500

In [37]:
colusa_daily_ozone['site'].unique()


Out[37]:
array([2109, 2395, 2744])

So, three "Colusa" sites have been collecting "ozone" observations consistently since 1980.
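
That claim can be checked directly (a sketch; again assuming the 'date' column holds ISO-formatted date strings, so string min/max is chronological):

# first and last observation date plus day count for each Colusa site
span = colusa_daily_ozone.groupby('site')['date'].agg(['min', 'max', 'count'])
print(span)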