First line if magic enabling matplotlib inline plots
In [1]:
%matplotlib inline
Then we have round of inports:
In [2]:
import dask.dataframe as dd
import pandas as pd
import glob
import os
In [3]:
filenames = [ os.path.splitext(wholeFilename)[0] for wholeFilename in
[ os.path.basename(wholePath) for wholePath in glob.glob("../input/2*.xlsx") ] ]
In [4]:
dataFiles = pd.DataFrame({"filename": filenames})
dataFiles["year"], dataFiles["pollutant"], dataFiles["resolution"] = dataFiles["filename"].str.split('_', 2).str
In [5]:
dataFiles.head()
Out[5]:
In [6]:
dataFiles["year"].value_counts()
Out[6]:
In [7]:
dataFiles["pollutant"].value_counts()
Out[7]:
In [8]:
dataFiles["resolution"].value_counts()
Out[8]:
In [9]:
dataFiles.ix[dataFiles["resolution"] == "(PM2.5)-24g", 'pollutant'] = "Cl_(PM2.5)"
dataFiles.ix[dataFiles["resolution"] == "(PM2.5)-24g", 'resolution'] = "24g"
In [10]:
dataFiles.ix[dataFiles["resolution"] == "(PM2.5)_24g", 'pollutant'] = "SO42_(PM2.5)"
dataFiles.ix[dataFiles["resolution"] == "(PM2.5)_24g", 'resolution'] = "24g"
In [11]:
dataFiles.ix[dataFiles["resolution"] == "w_PM25_24g", 'pollutant'] = "Jony_w_PM25"
dataFiles.ix[dataFiles["resolution"] == "w_PM25_24g", 'resolution'] = "24g"
Now resolution column should be correct:
In [12]:
dataFiles["resolution"].value_counts()
Out[12]:
Lets also fix worngly labelled pollutant:
In [13]:
dataFiles.ix[dataFiles["pollutant"] == "PM25", 'pollutant'] = "PM2.5"
In [14]:
dataFiles.describe()
Out[14]:
There is still one empty cell in resolution column. Lets identify it:
In [15]:
dataFiles[dataFiles["resolution"].isnull()]
Out[15]:
After manually examinign 2015_depozycja file I found that it cointains new type of data, which will be useless in planned analysis. I decided to remove it from working memory.
In [16]:
dataFiles.drop(dataFiles[dataFiles["filename"] == "2015_depozycja"].index, inplace=True)
In [17]:
dataFiles.describe()
Out[17]:
In [18]:
importantPollutants = ["PM10", "PM2.5", "O3", "NO2", "SO2", "C6H6", "CO"]
pollutants2015 = dataFiles[(dataFiles["year"] == "2015") & (dataFiles["resolution"] == "1g") &
(dataFiles["pollutant"].isin(importantPollutants))]
In [19]:
pollutants2015
Out[19]:
In [20]:
from tqdm import tqdm
In [21]:
from collections import Counter
In [22]:
#worstStation = {}
#for index, dataRow in tqdm(pollutants2015.iterrows(), total=len(pollutants2015.index)):
# dataFromFile = pd.read_excel("../input/" + dataRow["filename"] + ".xlsx", skiprows=[1,2])
# dataFromFile = dataFromFile.rename(columns={"Kod stacji":"Godzina"})
# dataFromFile = dataFromFile.set_index("Godzina")
# worstStation[dataRow["pollutant"]] = dataFromFile.max().sort_values(ascending = False).index[0]
In [23]:
pollutants = importantPollutants
years = sorted(list(dataFiles["year"].unique()))
In [ ]:
pollutantsYears = dataFiles[(dataFiles["year"].isin(years)) & (dataFiles["resolution"] == "1g") &
(dataFiles["pollutant"].isin(pollutants))]
In [ ]:
bigDataFrame = pd.DataFrame()
for dataYear in years:
print(dataYear)
yearDataFrame = pd.DataFrame()
for index, dataRow in tqdm(pollutantsYears[pollutantsYears["year"] == dataYear].iterrows(), total=len(pollutantsYears[pollutantsYears["year"] == dataYear].index)):
data = pd.read_excel("../input/" + dataRow["filename"] + ".xlsx", skiprows=[1,2])
data = data.rename(columns={"Kod stacji":"Hour"})
year = int(dataRow["year"])
rng = pd.date_range(start = str(year) + '-01-01 01:00:00', end = str(year+1) + '-01-01 00:00:00', freq='H')
# workaround for 2006_PM2.5_1g, 2012_PM10_1g, 2012_O3_1g
try:
data["Hour"] = rng
except ValueError:
print("File {} has some mess with timestamps".format(dataRow["filename"]))
continue
data = data.set_index("Hour")
data = data.stack()
data = pd.DataFrame(data, columns=[dataRow["pollutant"]])
data.index.set_names(['Hour', 'Station'], inplace=True)
yearDataFrame = pd.concat([yearDataFrame, data], axis=1)
bigDataFrame = bigDataFrame.append(yearDataFrame)
In [ ]:
bigDataFrame.to_pickle("../output/bigDataFrame.pkl")
In [ ]:
bigDataFrame
Out[ ]:
In [ ]:
daskBigDataFrame = dd.from_pandas(bigDataFrame)