In [ ]:
# Import the libraries we need
from os import getcwd, listdir
from os.path import abspath, dirname, isfile, join, splitext
import pandas as pd
import time
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
# Resolve the folder that holds the csv output: <parent of cwd>/data/<output_folder_name>
output_folder_name = 'q4_pollutants_over_time' # TODO: Update this if needed
_output_path_parts = ('..', 'data', output_folder_name)
output_dir = abspath(join(getcwd(), *_output_path_parts))
In [ ]:
# Collect the full path of every regular file in the output directory
# whose extension is exactly '.csv'
csv_files = []
for entry in listdir(output_dir):
    candidate = join(output_dir, entry)
    if not isfile(candidate):
        continue
    if splitext(candidate)[1] == '.csv':
        csv_files.append(candidate)
print("{} csv files found".format(len(csv_files)))
In [ ]:
# Create a single csv file from the output csv files.
# The combined file is written into the same directory that was scanned for
# inputs, so when the notebook is re-run it can itself be listed in csv_files.
# It is skipped here: opening it with 'w' truncates it, and reading it back as
# an input would lose its rows (and the header, if it happened to come first).
output_file = join(output_dir, 'q4_output.csv')
source_files = [path for path in csv_files
                if abspath(path) != abspath(output_file)]
with open(output_file, 'w') as o_file:
    header_written = False
    for csv_path in source_files:
        with open(csv_path, 'r') as f:
            lines = f.readlines()
        if not lines:
            # An empty file contributes nothing and must not count as the
            # file that supplied the header
            continue
        if header_written:
            # Every file repeats the header row; keep only the first one
            lines = lines[1:]
        else:
            header_written = True
        # A file whose last line has no newline would otherwise run straight
        # into the first row of the next file
        if lines and not lines[-1].endswith('\n'):
            lines[-1] += '\n'
        o_file.writelines(lines)
In [ ]:
# Load the combined csv into a DataFrame; header=0 together with names
# replaces the file's own header row with the column names below
column_names = ['parameter_name', 'state', 'year', 'month', 'arithmetic_mean']
df = pd.read_csv(output_file, header=0, names=column_names, low_memory=False)
df.head()
In [ ]:
# Report the size of the dataframe (df.shape is a (rows, columns) pair)
n_rows, n_cols = rows_cols = df.shape
print("Rows: {}".format(n_rows))
print("Columns: {}".format(n_cols))
In [ ]:
# Describe the dataframe: show pandas' summary statistics for its columns
df.describe()
In [ ]:
# Check the data types pandas inferred for each column when reading the csv
df.dtypes
In [ ]:
# Keep only the rows whose state matches the one we want to analyse
state_for_analysis = 'California'
is_state_for_analysis = df['state'] == state_for_analysis
cali_df = df.loc[is_state_for_analysis]
cali_df.head()
In [ ]:
# Number of distinct parameters that appear in the California rows
cali_measured_params = cali_df['parameter_name'].unique()
print(len(cali_measured_params))
In [ ]:
# Number of rows recorded for each parameter
rows_by_parameter = cali_df.groupby('parameter_name')
rows_by_parameter['state'].count()
In [ ]:
# Verify the occurrence count of 1122-Tetrachloroethane in the Cali dataframe
is_tetrachloroethane = cali_df['parameter_name'] == '1122-Tetrachloroethane'
df2 = cali_df.loc[is_tetrachloroethane]
df2.shape[0]
In [ ]:
# Add a date column to the dataframe by combining the month and year columns.
# cali_df was selected out of df, so take an explicit copy before adding a
# column instead of switching off pandas' chained-assignment warning for the
# whole session.
cali_df = cali_df.copy()


def create_datetime(year, month):
    """Return a datetime for the first day of the given year and month.

    Retained so that any other cell calling it keeps working; the column
    below is built with a single vectorised call instead.
    """
    return datetime.datetime(year=int(year), month=int(month), day=1)


# Assemble the dates from the year/month columns in one call; the day is
# fixed to 1 because the data only carries a year and a month.
cali_df['observation_date'] = pd.to_datetime(
    cali_df[['year', 'month']].assign(day=1))
cali_df.head()
In [ ]:
# Remove the state, year and month columns, one at a time
for unwanted_column in ('state', 'year', 'month'):
    cali_df.drop(unwanted_column, axis='columns', inplace=True)
# Index by observation_date so that we can create some charty goodness
cali_df.set_index('observation_date', inplace=True)
cali_df.head()
In [ ]:
# Line chart of the rows recorded for a single parameter
parameter_to_chart = '1122-Tetrachloroethane'
is_charted_parameter = cali_df['parameter_name'] == parameter_to_chart
cali_df.loc[is_charted_parameter].plot()
In [ ]:
# Histogram of the same parameter's rows, drawn semi-transparent
is_charted_parameter = cali_df['parameter_name'] == parameter_to_chart
cali_df.loc[is_charted_parameter].hist(alpha=0.5)
In [ ]:
# Same line chart, this time for a different parameter
parameter_to_chart = 'Tetrachloroethylene'
is_charted_parameter = cali_df['parameter_name'] == parameter_to_chart
cali_df.loc[is_charted_parameter].plot()
In [ ]: