In [1]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
import os
import os.path
In [2]:
working_dir = os.getcwd()
sub_dir = 'ozone'
# Directory that holds the hourly ozone files.
file_dir = os.path.join(working_dir, sub_dir)
# Full path of each ozone file.
# - sorted(): os.listdir returns entries in arbitrary order, so sorting keeps
#   the row order of the combined data the same on every run.
# - os.path.isfile(): skips sub-directories (e.g. a checkpoint folder), which
#   cannot be parsed as data files.
ozone_dir = [
    os.path.join(file_dir, filename)
    for filename in sorted(os.listdir(file_dir))
    if os.path.isfile(os.path.join(file_dir, filename))
]
In [4]:
def create_df(path):
    """
    Build a daily-mean data frame from one hourly observation file.

    The file must be "|"-separated and provide 'site', 'date' and 'obs'
    columns.

    input: a file path
    output: a data frame with columns 'site', 'date', 'obs', one row per
            (site, date) pair, where 'obs' is the mean of that group's values
    """
    hourly = pd.read_table(path, sep='|')
    # mean() skips NaN values, so missing hourly readings do not turn the
    # daily average into NaN.
    daily_mean = hourly.groupby(['site', 'date'])['obs'].mean()
    # Move the (site, date) group keys from the index back into columns.
    return daily_mean.reset_index()
def convert_to_daily(data_directory):
    """
    Convert every hourly file into daily data and stack the results.

    input: data directory for all files -- an iterable of file paths, each
           of which is passed to create_df
    output: one data frame for all files; the per-file daily frames are
            stacked in iteration order and each keeps its own row labels,
            so index values repeat across files
    raises: ValueError if data_directory yields no paths
    """
    # Build every per-file frame first and concatenate once.
    # DataFrame.append was removed in pandas 2.0, and appending inside the
    # loop re-copied all accumulated rows on each iteration.
    daily_frames = [create_df(path) for path in data_directory]
    if not daily_frames:
        # Previously this surfaced as an UnboundLocalError on the return line.
        raise ValueError("convert_to_daily: no input files were given")
    return pd.concat(daily_frames)
In [5]:
# Aggregate every hourly file listed in ozone_dir into one daily-mean frame.
ozone_daily = convert_to_daily(ozone_dir)
Although sites 2778 and 2783 are listed in the Location file, they are not found in the 'ozone' data set. See the "MSA name" column in Location.xlsx. I think they are not used for monitoring air quality parameters.
In [24]:
# Look for sites 2778 and 2783 in the ozone data. The boolean mask is applied
# through a single .loc call rather than chained [] indexing.
ozone_daily.loc[ozone_daily['site'].isin([2778, 2783]), 'site']
Out[24]:
The number of unique sites in the Location file is around 2100. However, as you can see below, only 485 unique sites appear in the "ozone" data set.
In [25]:
# Number of distinct sites in the ozone data. dropna=False counts NaN as a
# value too, which matches taking the length of .unique().
ozone_daily['site'].nunique(dropna=False)
Out[25]:
In [39]:
#this was done once in the beginning to
#save data set as ".csv" file.
#ozone_daily.to_csv('daily_ozone_obs_1980_2014.csv', sep = ',')
In [26]:
# Site metadata workbook, read via a path relative to the working directory.
# Its 'County Name' and 'Site' columns are used below to pick sites by county.
locations = pd.read_excel('Location.xlsx')
In [11]:
def get_county_site(locations, county='Colusa'):
    """
    Return the 'Site' values listed for one county.

    input: locations -- data frame with 'County Name' and 'Site' columns
           county -- county name to look up (default 'Colusa')
    output: the 'Site' column of the matching rows, with a fresh 0..n-1 index
            (missing 'Site' values are not removed here)

    A county that does not occur in 'County Name' is not handled here; the
    lookup error raised by .loc propagates to the caller.
    """
    # Index by (county, site) so that .loc[county] selects that county's rows
    # and leaves 'Site' as the remaining index level.
    by_county = locations.set_index(['County Name', 'Site'])
    return by_county.loc[county].reset_index()['Site']
In [35]:
# Sites listed for the default county (Colusa), without missing site IDs.
colusa_sites = get_county_site(locations).dropna()
# Keep only the daily rows recorded at those sites and renumber them from 0;
# drop=True discards the old row labels instead of keeping them as a column.
colusa_daily_ozone = (
    ozone_daily
    .loc[ozone_daily['site'].isin(colusa_sites)]
    .reset_index(drop=True)
)
In [38]:
# Export the Colusa subset to a comma-separated file. This was only needed
# once, but the call is live (not commented out), so re-running this cell
# writes the file again.
colusa_daily_ozone.to_csv('colusa_daily_ozone_1980_2014.csv', sep = ',')
In [36]:
# Preview the first rows of the Colusa subset.
colusa_daily_ozone.head()
Out[36]:
In [37]:
# Distinct site IDs that actually occur in the Colusa subset.
colusa_daily_ozone['site'].unique()
Out[37]:
So, three "Colusa" sites have been constantly collecting "ozone" observations since 1980.