In [2]:
import pandas as pd
import numpy as np
In [2]:
%matplotlib inline
In [37]:
# Load the pickled frame only when the kernel does not already hold it, so the
# cell can be re-run without paying for the read again.
if 'bigDataFrame' not in globals():
    print("Read data.")
    bigDataFrame = pd.read_pickle("../output/bigDataFrame.pkl")
    # "PM2.5" -> "PM25": the classification step later builds the function
    # name "<column>qual", and "PM2.5qual" would not be a valid identifier.
    bigDataFrame.rename(columns={"PM2.5": "PM25"}, inplace=True)
else:
    print("Exist, do nothing!")
In [4]:
# Preview the first rows of the loaded frame.
bigDataFrame.head()
Out[4]:
In [5]:
# For each column, find the index label at which the column reaches its
# maximum within the selected time window.
pollutedPlaces = bigDataFrame['2015-01-01 00:00:00':'2015-12-31 23:00:00'].idxmax()
print(pollutedPlaces)
# Keep only the second element of every label, de-duplicated.
# NOTE(review): presumably this is the station code -- confirm the index layout.
pollutedPlaces = {label[1] for label in pollutedPlaces}
In [6]:
# Show the de-duplicated set built in the previous cell.
pollutedPlaces
Out[6]:
In [7]:
#reducedDataFrame = bigDataFrame['2015-01-01 00:00:00':'2015-12-31 23:00:00'].loc[(slice(None),pollutedPlaces), :]
In [8]:
# Restrict the data to the rows between the two timestamps (all stations).
# .copy() makes this an independent frame: later cells add "<pollutant>.desc"
# and "overall" columns to reducedDataFrame, and writing into a slice of
# bigDataFrame triggers SettingWithCopyWarning and may or may not propagate.
reducedDataFrame = (
    bigDataFrame['2015-01-01 00:00:00':'2015-12-31 23:00:00']
    .loc[(slice(None), slice(None)), :]
    .copy()
)
In [75]:
# Number of distinct values on the "Hour" index level; used later as the
# denominator when converting per-station counts into percentages.
hours = reducedDataFrame.index.get_level_values("Hour").unique().size
In [76]:
def C6H6qual(value):
    """Map a C6H6 reading to an ordered air-quality label.

    Bands (upper bounds are inclusive):
        [0, 5]   -> "1 Very good"
        (5, 10]  -> "2 Good"
        (10, 15] -> "3 Moderate"
        (15, 20] -> "4 Sufficient"
        (20, 50] -> "5 Bad"
        > 50     -> "6 Very bad"

    Args:
        value: Numeric reading (units are not stated in this notebook).

    Returns:
        The label of the band containing ``value``; ``np.nan`` for a negative
        reading; ``value`` itself when no comparison succeeds, i.e. when the
        input is NaN.
    """
    if value < 0.0:
        # ``np.nan`` rather than ``np.NaN``: the capitalised alias was removed
        # in NumPy 2.0 and raises AttributeError there.
        return np.nan
    bands = (
        (5.0, "1 Very good"),
        (10.0, "2 Good"),
        (15.0, "3 Moderate"),
        (20.0, "4 Sufficient"),
        (50.0, "5 Bad"),
    )
    for upper, label in bands:
        if value <= upper:
            return label
    if value > 50.0:
        return "6 Very bad"
    # Only NaN gets here: every comparison above is False for NaN.
    return value
def COqual(value):
    """Map a CO reading to an ordered air-quality label.

    Bands (upper bounds are inclusive):
        [0, 2]   -> "1 Very good"
        (2, 6]   -> "2 Good"
        (6, 10]  -> "3 Moderate"
        (10, 14] -> "4 Sufficient"
        (14, 20] -> "5 Bad"
        > 20     -> "6 Very bad"

    Args:
        value: Numeric reading (units are not stated in this notebook).

    Returns:
        The label of the band containing ``value``; ``np.nan`` for a negative
        reading; ``value`` itself when no comparison succeeds, i.e. when the
        input is NaN.
    """
    if value < 0.0:
        # ``np.nan`` rather than ``np.NaN``: the capitalised alias was removed
        # in NumPy 2.0 and raises AttributeError there.
        return np.nan
    bands = (
        (2.0, "1 Very good"),
        (6.0, "2 Good"),
        (10.0, "3 Moderate"),
        (14.0, "4 Sufficient"),
        (20.0, "5 Bad"),
    )
    for upper, label in bands:
        if value <= upper:
            return label
    if value > 20.0:
        return "6 Very bad"
    # Only NaN gets here: every comparison above is False for NaN.
    return value
def NO2qual(value):
    """Map an NO2 reading to an ordered air-quality label.

    Bands (upper bounds are inclusive):
        [0, 40]    -> "1 Very good"
        (40, 100]  -> "2 Good"
        (100, 150] -> "3 Moderate"
        (150, 200] -> "4 Sufficient"
        (200, 400] -> "5 Bad"
        > 400      -> "6 Very bad"

    Args:
        value: Numeric reading (units are not stated in this notebook).

    Returns:
        The label of the band containing ``value``; ``np.nan`` for a negative
        reading; ``value`` itself when no comparison succeeds, i.e. when the
        input is NaN.
    """
    if value < 0.0:
        # ``np.nan`` rather than ``np.NaN``: the capitalised alias was removed
        # in NumPy 2.0 and raises AttributeError there.
        return np.nan
    bands = (
        (40.0, "1 Very good"),
        (100.0, "2 Good"),
        (150.0, "3 Moderate"),
        (200.0, "4 Sufficient"),
        (400.0, "5 Bad"),
    )
    for upper, label in bands:
        if value <= upper:
            return label
    if value > 400.0:
        return "6 Very bad"
    # Only NaN gets here: every comparison above is False for NaN.
    return value
def O3qual(value):
    """Map an O3 reading to an ordered air-quality label.

    Bands (upper bounds are inclusive):
        [0, 30]    -> "1 Very good"
        (30, 70]   -> "2 Good"
        (70, 120]  -> "3 Moderate"
        (120, 160] -> "4 Sufficient"
        (160, 240] -> "5 Bad"
        > 240      -> "6 Very bad"

    Args:
        value: Numeric reading (units are not stated in this notebook).

    Returns:
        The label of the band containing ``value``; ``np.nan`` for a negative
        reading; ``value`` itself when no comparison succeeds, i.e. when the
        input is NaN.
    """
    if value < 0.0:
        # ``np.nan`` rather than ``np.NaN``: the capitalised alias was removed
        # in NumPy 2.0 and raises AttributeError there.
        return np.nan
    bands = (
        (30.0, "1 Very good"),
        (70.0, "2 Good"),
        (120.0, "3 Moderate"),
        (160.0, "4 Sufficient"),
        (240.0, "5 Bad"),
    )
    for upper, label in bands:
        if value <= upper:
            return label
    if value > 240.0:
        return "6 Very bad"
    # Only NaN gets here: every comparison above is False for NaN.
    return value
def PM10qual(value):
    """Map a PM10 reading to an ordered air-quality label.

    Bands (upper bounds are inclusive):
        [0, 20]    -> "1 Very good"
        (20, 60]   -> "2 Good"
        (60, 100]  -> "3 Moderate"
        (100, 140] -> "4 Sufficient"
        (140, 200] -> "5 Bad"
        > 200      -> "6 Very bad"

    Args:
        value: Numeric reading (units are not stated in this notebook).

    Returns:
        The label of the band containing ``value``; ``np.nan`` for a negative
        reading; ``value`` itself when no comparison succeeds, i.e. when the
        input is NaN.
    """
    if value < 0.0:
        # ``np.nan`` rather than ``np.NaN``: the capitalised alias was removed
        # in NumPy 2.0 and raises AttributeError there.
        return np.nan
    bands = (
        (20.0, "1 Very good"),
        (60.0, "2 Good"),
        (100.0, "3 Moderate"),
        (140.0, "4 Sufficient"),
        (200.0, "5 Bad"),
    )
    for upper, label in bands:
        if value <= upper:
            return label
    if value > 200.0:
        return "6 Very bad"
    # Only NaN gets here: every comparison above is False for NaN.
    return value
def PM25qual(value):
    """Map a PM25 reading to an ordered air-quality label.

    Bands (upper bounds are inclusive):
        [0, 12]   -> "1 Very good"
        (12, 36]  -> "2 Good"
        (36, 60]  -> "3 Moderate"
        (60, 84]  -> "4 Sufficient"
        (84, 120] -> "5 Bad"
        > 120     -> "6 Very bad"

    Args:
        value: Numeric reading (units are not stated in this notebook).

    Returns:
        The label of the band containing ``value``; ``np.nan`` for a negative
        reading; ``value`` itself when no comparison succeeds, i.e. when the
        input is NaN.
    """
    if value < 0.0:
        # ``np.nan`` rather than ``np.NaN``: the capitalised alias was removed
        # in NumPy 2.0 and raises AttributeError there.
        return np.nan
    bands = (
        (12.0, "1 Very good"),
        (36.0, "2 Good"),
        (60.0, "3 Moderate"),
        (84.0, "4 Sufficient"),
        (120.0, "5 Bad"),
    )
    for upper, label in bands:
        if value <= upper:
            return label
    if value > 120.0:
        return "6 Very bad"
    # Only NaN gets here: every comparison above is False for NaN.
    return value
def SO2qual(value):
    """Map an SO2 reading to an ordered air-quality label.

    Bands (upper bounds are inclusive):
        [0, 50]    -> "1 Very good"
        (50, 100]  -> "2 Good"
        (100, 200] -> "3 Moderate"
        (200, 350] -> "4 Sufficient"
        (350, 500] -> "5 Bad"
        > 500      -> "6 Very bad"

    Args:
        value: Numeric reading (units are not stated in this notebook).

    Returns:
        The label of the band containing ``value``; ``np.nan`` for a negative
        reading; ``value`` itself when no comparison succeeds, i.e. when the
        input is NaN.
    """
    if value < 0.0:
        # ``np.nan`` rather than ``np.NaN``: the capitalised alias was removed
        # in NumPy 2.0 and raises AttributeError there.
        return np.nan
    bands = (
        (50.0, "1 Very good"),
        (100.0, "2 Good"),
        (200.0, "3 Moderate"),
        (350.0, "4 Sufficient"),
        (500.0, "5 Bad"),
    )
    for upper, label in bands:
        if value <= upper:
            return label
    if value > 500.0:
        return "6 Very bad"
    # Only NaN gets here: every comparison above is False for NaN.
    return value
In [77]:
# Start from an empty frame; the per-pollutant percentage series are
# concatenated onto it column by column in the following cell.
descriptiveFrame = pd.DataFrame()
In [78]:
# For every pollutant column: label each reading with its quality band, then
# record, per station, the percentage of hours that fall into each band.
for pollutant in bigDataFrame.columns:
    # Resolve "<pollutant>qual" once per column instead of once per value.
    classify = globals()[pollutant + "qual"]
    reducedDataFrame[pollutant + ".desc"] = reducedDataFrame[pollutant].apply(classify)
    # dropna=False keeps a NaN bucket so missing/invalid readings are counted too.
    bandCounts = reducedDataFrame.groupby(level="Station")[pollutant + ".desc"].value_counts(dropna=False)
    # Vectorised form of the original per-element (x / float(hours)) * 100.
    tmpseries = (bandCounts / float(hours)) * 100
    descriptiveFrame = pd.concat([descriptiveFrame, tmpseries], axis=1)
In [79]:
# Distinct quality labels found on the second index level, in ascending order.
# NaN is dropped first: it is present because the counts were taken with
# dropna=False, sorted() cannot order NaN against strings on Python 3, and a
# NaN label could never match the equality test applied to these labels later.
qualities = sorted(descriptiveFrame.index.get_level_values(1).dropna().unique().tolist())
In [80]:
# Walk the labels in ascending order. Each pass overwrites "overall" on the rows
# where at least one pollutant carries that label, so the value left at the end
# is the highest-sorting (worst) label among the seven pollutants.
descColumns = ["C6H6.desc", "CO.desc", "NO2.desc", "O3.desc", "PM10.desc",
               "PM25.desc", "SO2.desc"]
for quality in qualities:
    hasQuality = (reducedDataFrame[descColumns] == quality).any(axis=1)
    reducedDataFrame.loc[hasQuality, "overall"] = quality
In [81]:
# Relabel the concatenated columns with the pollutant names. This relies on the
# columns having been appended in bigDataFrame.columns order, one per pollutant.
descriptiveFrame.columns = bigDataFrame.columns
In [82]:
# Per station, the percentage of hours spent in each overall quality band.
overallCounts = reducedDataFrame.groupby(level="Station")["overall"].value_counts(dropna=False)
# Name the series explicitly. The name pandas assigns to a value_counts result
# differs between versions (newer releases use "count"), so renaming column 0
# after the concat left the column mislabelled there and broke the later
# descriptiveFrame[...]["overall"] lookups.
overall = ((overallCounts / float(hours)) * 100).rename("overall")
descriptiveFrame = pd.concat([descriptiveFrame, overall], axis=1)
In [100]:
# Station with the largest share of hours labelled '6 Very bad' overall, and
# the station with the largest share labelled '1 Very good'.
worstPlace = descriptiveFrame["overall"].xs('6 Very bad', level=1).idxmax()
bestPlace = descriptiveFrame["overall"].xs('1 Very good', level=1).idxmax()
In [101]:
# Per-label percentage breakdown for the station selected as worstPlace.
descriptiveFrame.xs(worstPlace, level=0)
Out[101]:
In [102]:
# Per-label percentage breakdown for the station selected as bestPlace.
descriptiveFrame.xs(bestPlace, level=0)
Out[102]:
In [104]:
# Show both selected station identifiers as a tuple.
worstPlace, bestPlace
Out[104]:
In [73]:
# Station metadata workbook.
stations = pd.read_excel("../input/Metadane_wer20160914.xlsx")
# Localities of interest.
coolStation = [u'Gdańsk', u'Gdynia', u'Sopot', u'Kościerzyna']
# Rows whose locality column (u'Miejscowość') is one of the chosen places.
selectedStations = stations[stations[u'Miejscowość'].isin(coolStation)]
# Union of the current (u'Kod stacji') and old (u'Stary Kod stacji') station codes.
# The original referenced `selected_stations`, which is never defined, and so
# raised NameError; the frame built above is `selectedStations`.
stationCodes = set(list(selectedStations[u'Kod stacji'].values) + list(selectedStations[u'Stary Kod stacji'].values))
In [74]:
# Restrict the data to the selected time window and the chosen station codes.
# The codes are passed as a list because pandas no longer accepts a set as a
# .loc indexer. .copy() detaches the result from bigDataFrame, since other
# cells add columns to reducedDataFrame.
reducedDataFrame = (
    bigDataFrame['2015-01-01 01:00:00':'2016-01-01 00:00:00']
    .loc[(slice(None), list(stationCodes)), :]
    .copy()
)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: