Wrangling Delhi Pollution Data

Author: Avikal Somvanshi


In [1]:
from __future__ import print_function, division
import pandas as pd
import pylab as pl
import os
import csv
from pandas.tools.plotting import scatter_matrix
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
%pylab inline

print (os.getenv("PUIDATA"))

The data was downloaded from the website of Central Pollution Control Board of India (Indian equalvalent of EPA).

The website allows to download only excel files one parameter and three months at a time.

The data consist of hourly measurements of three pollutants and four weather elements from the RK Puram Pollution Monitoring Station of Delhi Government located in Southern Delhi from April 2015 to November 2016.

Data is spread across 49 excel files. All the excel files are stored in a zip file and is placed in project folder. I'm wrangling and writing them on to a csv to be used for further analysis.


In [2]:
os.system("unzip " + "Delhi_Pollution_RKPuram.zip")


Out[2]:
0

In [3]:
#Reading and saving downloaded PM2.5 data in a dataframe

rk_pm = pd.DataFrame()

for i in range (1,8):
    data = pd.read_excel('Delhi_Pollution_RKPuram/CPCBAirQualityDataReport_20-11-2016_pm_0' + str(i) + '.xlsx', header=16)
    data = data[:-2]
    data = data[['From (Time)', 'To', 'Date', 'Concentration']]
    data.rename(columns={'Concentration':'PM2.5_Concentration(microgram/m3)'}, inplace=True)
    rk_pm = pd.concat([rk_pm, data])
    
len(rk_pm)


Out[3]:
12424

In [4]:
#Reading and saving downloaded Ozone data in a dataframe

rk_o3 = pd.DataFrame()

for i in range (1,8):
    data = pd.read_excel('Delhi_Pollution_RKPuram/CPCBAirQualityDataReport_20-11-2016_o3_0' + str(i) + '.xlsx', header=16)
    data = data[:-2]
    data = data[['From (Time)', 'To', 'Date', 'Concentration']]
    data.rename(columns={'Concentration':'Ozone_Concentration(microgram/m3)'}, inplace=True)
    rk_o3 = pd.concat([rk_o3, data])
    
len(rk_o3)


Out[4]:
12773

In [5]:
#Reading and saving downloaded NOx data in a dataframe

rk_nox = pd.DataFrame()

for i in range (1,8):
    data = pd.read_excel('Delhi_Pollution_RKPuram/CPCBAirQualityDataReport_20-11-2016_nox_0' + str(i) + '.xlsx', header=16)
    data = data[:-2]
    data = data[['From (Time)', 'To', 'Date', 'Concentration']]
    data.rename(columns={'Concentration':'NOx_Concentration(ppb)'}, inplace=True)
    rk_nox = pd.concat([rk_nox, data])
    
len(rk_nox)


Out[5]:
12781

In [6]:
#Reading and saving downloaded temperature data in a dataframe

rk_at = pd.DataFrame()

for i in range (1,8):
    data = pd.read_excel('Delhi_Pollution_RKPuram/CPCBAirQualityDataReport_20-11-2016_at_0' + str(i) + '.xlsx', header=16)
    data = data[:-2]
    data = data[['From (Time)', 'To', 'Date', 'Concentration']]
    data.rename(columns={'Concentration':'Temperature(Celsius)'}, inplace=True)
    rk_at = pd.concat([rk_at, data])
    
len(rk_at)


Out[6]:
12964

In [7]:
#Reading and saving downloaded Relative Humidity data in a dataframe

rk_rh = pd.DataFrame()

for i in range (1,8):
    data = pd.read_excel('Delhi_Pollution_RKPuram/CPCBAirQualityDataReport_20-11-2016_rh_0' + str(i) + '.xlsx', header=16)
    data = data[:-2]
    data = data[['From (Time)', 'To', 'Date', 'Concentration']]
    data.rename(columns={'Concentration':'Relative_Humidity(%)'}, inplace=True)
    rk_rh = pd.concat([rk_rh, data])
    
len(rk_rh)


Out[7]:
12976

In [8]:
#Reading and saving downloaded wind speed data in a dataframe

rk_ws = pd.DataFrame()

for i in range (1,8):
    data = pd.read_excel('Delhi_Pollution_RKPuram/CPCBAirQualityDataReport_20-11-2016_ws_0' + str(i) + '.xlsx', header=16)
    data = data[:-2]
    data = data[['From (Time)', 'To', 'Date', 'Concentration']]
    data.rename(columns={'Concentration':'Wind_Speed(m/s)'}, inplace=True)
    rk_ws = pd.concat([rk_ws, data])
    
len(rk_ws)


Out[8]:
12976

In [9]:
#Reading and saving downloaded solar radiation data in a dataframe

rk_sr = pd.DataFrame()

for i in range (1,8):
    data = pd.read_excel('Delhi_Pollution_RKPuram/CPCBAirQualityDataReport_20-11-2016_sr_0' + str(i) + '.xlsx', header=16)
    data = data[:-2]
    data = data[['From (Time)', 'To', 'Date', 'Concentration']]
    data.rename(columns={'Concentration':'Solar_Radiation(w/m2)'}, inplace=True)
    rk_sr = pd.concat([rk_sr, data])
    
len(rk_sr)


Out[9]:
12964

In [10]:
#Merging all the dataframes to a mother dataframe

rk = pd.merge(rk_pm, rk_o3, on=['From (Time)', 'To', 'Date'])
rk = pd.merge(rk, rk_nox, on=['From (Time)', 'To', 'Date'])
rk = pd.merge(rk, rk_at, on=['From (Time)', 'To', 'Date'])
rk = pd.merge(rk, rk_ws, on=['From (Time)', 'To', 'Date'])
rk = pd.merge(rk, rk_rh, on=['From (Time)', 'To', 'Date'])
rk = pd.merge(rk, rk_sr, on=['From (Time)', 'To', 'Date'])
rk.head()


Out[10]:
From (Time) To Date PM2.5_Concentration(microgram/m3) Ozone_Concentration(microgram/m3) NOx_Concentration(ppb) Temperature(Celsius) Wind_Speed(m/s) Relative_Humidity(%) Solar_Radiation(w/m2)
0 NaN (Time) NaN NaN NaN NaN NaN NaN NaN NaN
1 NaN (Time) NaN NaN NaN NaN NaN NaN NaN NaN
2 NaN (Time) NaN NaN NaN NaN NaN NaN NaN NaN
3 NaN (Time) NaN NaN NaN NaN NaN NaN NaN NaN
4 NaN (Time) NaN NaN NaN NaN NaN NaN NaN NaN

In [11]:
#counting size of mother dataset
len(rk)


Out[11]:
855465

In [12]:
#The mother dataset is almost 80 times the size of individual sub-dataset. 
#Quick examination of the dataset reveals that there are multiple duplicates
#Removing duplicates via groupby

pollution_Delhi1 = rk.groupby(['From (Time)', 'To', 'Date']).mean()
pollution_Delhi1.reset_index(inplace=True)
pollution_Delhi1.describe()


Out[12]:
PM2.5_Concentration(microgram/m3) Ozone_Concentration(microgram/m3) NOx_Concentration(ppb) Temperature(Celsius) Wind_Speed(m/s) Relative_Humidity(%) Solar_Radiation(w/m2)
count 11693.000000 11782.000000 11855.000000 11856.000000 11856.000000 11856.000000 11856.000000
mean 127.671341 51.893113 168.641692 25.245209 1.214934 54.381272 112.831123
std 115.125828 61.026988 211.457149 8.119243 0.879161 20.365264 144.420861
min 10.170000 1.030000 6.600000 1.660000 0.300000 7.000000 1.180000
25% NaN NaN NaN 20.517500 0.380000 37.920000 3.250000
50% NaN NaN NaN 26.120000 1.030000 56.080000 11.540000
75% NaN NaN NaN 30.850000 1.810000 71.670000 246.190000
max 985.000000 492.520000 1532.820000 44.680000 7.180000 88.130000 420.830000

In [13]:
pollution_Delhi1.head()


Out[13]:
From (Time) To Date PM2.5_Concentration(microgram/m3) Ozone_Concentration(microgram/m3) NOx_Concentration(ppb) Temperature(Celsius) Wind_Speed(m/s) Relative_Humidity(%) Solar_Radiation(w/m2)
0 00:00:00 00:59:00 01/01/2016 445.00 11.22 536.23 7.72 0.32 75.75 3.00
1 00:00:00 00:59:00 01/02/2016 193.00 7.96 139.68 9.95 0.49 77.92 2.92
2 00:00:00 00:59:00 01/03/2016 229.83 12.87 916.78 15.27 0.30 80.00 3.17
3 00:00:00 00:59:00 01/05/2015 76.83 56.25 58.68 27.89 1.92 49.58 3.17
4 00:00:00 00:59:00 01/06/2015 67.00 25.74 68.74 28.31 1.12 30.75 3.25

In [14]:
#Making a datetime index

pollution_Delhi1['datetime'] = pd.to_datetime(pollution_Delhi1['From (Time)'] + " " + pollution_Delhi1['Date'], dayfirst=True)
pollution_Delhi1.index = pd.DatetimeIndex(pollution_Delhi1.datetime)
pollution_Delhi1.head(10)


Out[14]:
From (Time) To Date PM2.5_Concentration(microgram/m3) Ozone_Concentration(microgram/m3) NOx_Concentration(ppb) Temperature(Celsius) Wind_Speed(m/s) Relative_Humidity(%) Solar_Radiation(w/m2) datetime
2016-01-01 00:00:00 00:59:00 01/01/2016 445.00 11.22 536.23 7.72 0.32 75.75 3.00 2016-01-01
2016-02-01 00:00:00 00:59:00 01/02/2016 193.00 7.96 139.68 9.95 0.49 77.92 2.92 2016-02-01
2016-03-01 00:00:00 00:59:00 01/03/2016 229.83 12.87 916.78 15.27 0.30 80.00 3.17 2016-03-01
2015-05-01 00:00:00 00:59:00 01/05/2015 76.83 56.25 58.68 27.89 1.92 49.58 3.17 2015-05-01
2015-06-01 00:00:00 00:59:00 01/06/2015 67.00 25.74 68.74 28.31 1.12 30.75 3.25 2015-06-01
2016-06-01 00:00:00 00:59:00 01/06/2016 43.33 14.95 70.67 26.43 0.89 47.75 3.00 2016-06-01
2015-07-01 00:00:00 00:59:00 01/07/2015 55.00 13.78 142.87 25.52 0.73 69.92 3.92 2015-07-01
2016-07-01 00:00:00 00:59:00 01/07/2016 54.17 20.43 24.77 29.21 1.23 71.33 3.00 2016-07-01
2015-08-01 00:00:00 00:59:00 01/08/2015 23.17 16.47 68.52 24.84 2.07 72.25 4.00 2015-08-01
2015-09-01 00:00:00 00:59:00 01/09/2015 53.50 7.92 80.04 27.42 0.63 64.92 3.42 2015-09-01

In [15]:
#dropping unwanted columns and sorting index

pollution_Delhi1.drop(['From (Time)', 'Date', 'datetime', 'To'], 1, inplace=True)
pollution_Delhi1.sort_index(ascending=True, inplace=True)
pollution_Delhi1.head()


Out[15]:
PM2.5_Concentration(microgram/m3) Ozone_Concentration(microgram/m3) NOx_Concentration(ppb) Temperature(Celsius) Wind_Speed(m/s) Relative_Humidity(%) Solar_Radiation(w/m2)
2015-04-04 10:00:00 58.22 37.23 44.67 17.57 1.88 78.33 332.44
2015-04-04 11:00:00 37.67 53.48 43.51 20.02 1.72 65.00 333.83
2015-04-04 12:00:00 36.00 96.23 32.07 20.99 1.30 58.17 311.33
2015-04-04 14:00:00 32.00 75.14 28.03 23.53 1.86 49.92 374.25
2015-04-04 15:00:00 32.17 69.67 32.95 22.97 1.91 49.67 316.75

In [16]:
#Checking dataset

scatter_matrix(pollution_Delhi1, s=300, figsize=(16, 16))


Out[16]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fba0e984c10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0e6a9990>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0e019750>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0bf5b5d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0bf3f610>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0ab19710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0aa9c590>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fba0aa095d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a98c790>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a8f1650>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a871ad0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a91dd10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a7e2610>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a764710>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a6c6f10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a649d90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a5afb50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a532ad0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a53fe90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a4a4950>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a4277d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a38a910>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a30b790>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a276490>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a1fe590>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a181410>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a14cb50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a0cf810>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fba0a03f6d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba09fcc610>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba09f4e590>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba09eb3750>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba09e36790>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba09e19710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba09d9c810>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fba09dbdd90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba09c8d710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba09c11590>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba09b74b90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba09af6a10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba09adba50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba09a5d9d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fba099ece50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba0994e790>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba098d2610>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba098368d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba097b9750>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba097a1450>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fba096ab550>]], dtype=object)

Caption: All columns are plotable.


In [17]:
#Saving the dataset as csv

pollution_Delhi1.to_csv('Delhi_pollution_data_RK.csv')

In [18]:
#Checking the CSV

data = pd.read_csv('Delhi_pollution_data_RK.csv', parse_dates=[0], infer_datetime_format=True, dayfirst=True)
data.head()


Out[18]:
Unnamed: 0 PM2.5_Concentration(microgram/m3) Ozone_Concentration(microgram/m3) NOx_Concentration(ppb) Temperature(Celsius) Wind_Speed(m/s) Relative_Humidity(%) Solar_Radiation(w/m2)
0 2015-04-04 10:00:00 58.22 37.23 44.67 17.57 1.88 78.33 332.44
1 2015-04-04 11:00:00 37.67 53.48 43.51 20.02 1.72 65.00 333.83
2 2015-04-04 12:00:00 36.00 96.23 32.07 20.99 1.30 58.17 311.33
3 2015-04-04 14:00:00 32.00 75.14 28.03 23.53 1.86 49.92 374.25
4 2015-04-04 15:00:00 32.17 69.67 32.95 22.97 1.91 49.67 316.75

Good to go!


In [ ]: