In [2]:
import pandas as pd
import numpy as np

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [37]:
# Load the pickled frame only once per kernel session; re-running this cell
# reuses whatever is already bound to `bigDataFrame`.
# NOTE(review): pandas documents read_pickle as unsafe for untrusted files.
if 'bigDataFrame' not in globals():
    print("Read data.")
    # The "PM2.5" column is renamed to "PM25" so the column name can be used
    # to build the "<pollutant>qual" function names later in the notebook.
    bigDataFrame = pd.read_pickle("../output/bigDataFrame.pkl").rename(columns={"PM2.5": "PM25"})
else:
    print("Exist, do nothing!")


Exist, do nothing!

In [4]:
# Preview the first rows: one column per pollutant, rows indexed by (Hour, Station).
bigDataFrame.head()


Out[4]:
C6H6 CO NO2 O3 PM10 PM25 SO2
Hour Station
2000-01-01 01:00:00 Pm.a01a NaN NaN 25.0 NaN NaN NaN NaN
Pm.a03a NaN NaN NaN 38.0 NaN NaN NaN
2000-01-01 02:00:00 DsCzer02 NaN NaN 6.0 35.0 NaN NaN NaN
DsJelw05 NaN NaN 14.0 29.0 NaN NaN NaN
DsSniezka NaN NaN NaN 61.0 NaN NaN NaN

In [5]:
# For every pollutant column, find the (Hour, Station) index entry at which
# its maximum within the selected 2015 window occurs.
pollutedPlaces = bigDataFrame['2015-01-01 00:00:00':'2015-12-31 23:00:00'].idxmax()
print(pollutedPlaces)
# Reduce the (Hour, Station) tuples to the distinct station codes.
pollutedPlaces = {hourAndStation[1] for hourAndStation in pollutedPlaces}


C6H6    (2015-10-28 06:00:00, OpKKozBSmial)
CO      (2015-02-15 00:00:00, SlRybniBorki)
NO2     (2015-11-07 10:00:00, PmStaGdaLubi)
O3      (2015-07-04 12:00:00, LuZarySzyman)
PM10    (2015-12-31 20:00:00, SlRybniBorki)
PM25    (2015-03-22 23:00:00, MzLegZegrzyn)
SO2     (2015-05-21 23:00:00, MzPlocKroJad)
dtype: object

In [6]:
# Display the set of station codes that hold at least one pollutant maximum.
pollutedPlaces


Out[6]:
{u'LuZarySzyman',
 u'MzLegZegrzyn',
 u'MzPlocKroJad',
 u'OpKKozBSmial',
 u'PmStaGdaLubi',
 u'SlRybniBorki'}

In [7]:
#reducedDataFrame = bigDataFrame['2015-01-01 00:00:00':'2015-12-31 23:00:00'].loc[(slice(None),pollutedPlaces), :]

In [8]:
# Take every station and every hour in the 2015 window. An explicit copy
# replaces the former select-everything .loc indexer, so the derived columns
# added to reducedDataFrame later are written to an independent frame and
# never to a slice of bigDataFrame.
reducedDataFrame = bigDataFrame['2015-01-01 00:00:00':'2015-12-31 23:00:00'].copy()

In [75]:
# Number of distinct timestamps on the "Hour" index level of the selection;
# used later as the denominator when turning counts into percentages.
hours = reducedDataFrame.index.get_level_values("Hour").unique().size

In [76]:
def C6H6qual(value):
    """Translate a C6H6 reading into a labelled air-quality class.

    Parameters
    ----------
    value : float
        Measured C6H6 value.

    Returns
    -------
    str or float
        "1 Very good" for [0, 5], "2 Good" for (5, 10], "3 Moderate" for
        (10, 15], "4 Sufficient" for (15, 20], "5 Bad" for (20, 50] and
        "6 Very bad" above 50. A negative reading yields NaN; a NaN reading
        is returned unchanged.
    """
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    if value < 0.0:
        return np.nan
    # Bands are tested in ascending order, so only the upper bound is needed.
    if value <= 5.0:
        return "1 Very good"
    if value <= 10.0:
        return "2 Good"
    if value <= 15.0:
        return "3 Moderate"
    if value <= 20.0:
        return "4 Sufficient"
    if value <= 50.0:
        return "5 Bad"
    if value > 50.0:
        return "6 Very bad"
    # Reached only when every comparison was False, i.e. the input is NaN.
    return value

def COqual(value):
    """Translate a CO reading into a labelled air-quality class.

    Parameters
    ----------
    value : float
        Measured CO value.

    Returns
    -------
    str or float
        "1 Very good" for [0, 2], "2 Good" for (2, 6], "3 Moderate" for
        (6, 10], "4 Sufficient" for (10, 14], "5 Bad" for (14, 20] and
        "6 Very bad" above 20. A negative reading yields NaN; a NaN reading
        is returned unchanged.
    """
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    if value < 0.0:
        return np.nan
    # Bands are tested in ascending order, so only the upper bound is needed.
    if value <= 2.0:
        return "1 Very good"
    if value <= 6.0:
        return "2 Good"
    if value <= 10.0:
        return "3 Moderate"
    if value <= 14.0:
        return "4 Sufficient"
    if value <= 20.0:
        return "5 Bad"
    if value > 20.0:
        return "6 Very bad"
    # Reached only when every comparison was False, i.e. the input is NaN.
    return value
    
def NO2qual(value):
    """Translate an NO2 reading into a labelled air-quality class.

    Parameters
    ----------
    value : float
        Measured NO2 value.

    Returns
    -------
    str or float
        "1 Very good" for [0, 40], "2 Good" for (40, 100], "3 Moderate" for
        (100, 150], "4 Sufficient" for (150, 200], "5 Bad" for (200, 400] and
        "6 Very bad" above 400. A negative reading yields NaN; a NaN reading
        is returned unchanged.
    """
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    if value < 0.0:
        return np.nan
    # Bands are tested in ascending order, so only the upper bound is needed.
    if value <= 40.0:
        return "1 Very good"
    if value <= 100.0:
        return "2 Good"
    if value <= 150.0:
        return "3 Moderate"
    if value <= 200.0:
        return "4 Sufficient"
    if value <= 400.0:
        return "5 Bad"
    if value > 400.0:
        return "6 Very bad"
    # Reached only when every comparison was False, i.e. the input is NaN.
    return value

def O3qual(value):
    """Translate an O3 reading into a labelled air-quality class.

    Parameters
    ----------
    value : float
        Measured O3 value.

    Returns
    -------
    str or float
        "1 Very good" for [0, 30], "2 Good" for (30, 70], "3 Moderate" for
        (70, 120], "4 Sufficient" for (120, 160], "5 Bad" for (160, 240] and
        "6 Very bad" above 240. A negative reading yields NaN; a NaN reading
        is returned unchanged.
    """
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    if value < 0.0:
        return np.nan
    # Bands are tested in ascending order, so only the upper bound is needed.
    if value <= 30.0:
        return "1 Very good"
    if value <= 70.0:
        return "2 Good"
    if value <= 120.0:
        return "3 Moderate"
    if value <= 160.0:
        return "4 Sufficient"
    if value <= 240.0:
        return "5 Bad"
    if value > 240.0:
        return "6 Very bad"
    # Reached only when every comparison was False, i.e. the input is NaN.
    return value
    
def PM10qual(value):
    """Translate a PM10 reading into a labelled air-quality class.

    Parameters
    ----------
    value : float
        Measured PM10 value.

    Returns
    -------
    str or float
        "1 Very good" for [0, 20], "2 Good" for (20, 60], "3 Moderate" for
        (60, 100], "4 Sufficient" for (100, 140], "5 Bad" for (140, 200] and
        "6 Very bad" above 200. A negative reading yields NaN; a NaN reading
        is returned unchanged.
    """
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    if value < 0.0:
        return np.nan
    # Bands are tested in ascending order, so only the upper bound is needed.
    if value <= 20.0:
        return "1 Very good"
    if value <= 60.0:
        return "2 Good"
    if value <= 100.0:
        return "3 Moderate"
    if value <= 140.0:
        return "4 Sufficient"
    if value <= 200.0:
        return "5 Bad"
    if value > 200.0:
        return "6 Very bad"
    # Reached only when every comparison was False, i.e. the input is NaN.
    return value
    
def PM25qual(value):
    """Translate a PM25 reading into a labelled air-quality class.

    Parameters
    ----------
    value : float
        Measured PM25 value.

    Returns
    -------
    str or float
        "1 Very good" for [0, 12], "2 Good" for (12, 36], "3 Moderate" for
        (36, 60], "4 Sufficient" for (60, 84], "5 Bad" for (84, 120] and
        "6 Very bad" above 120. A negative reading yields NaN; a NaN reading
        is returned unchanged.
    """
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    if value < 0.0:
        return np.nan
    # Bands are tested in ascending order, so only the upper bound is needed.
    if value <= 12.0:
        return "1 Very good"
    if value <= 36.0:
        return "2 Good"
    if value <= 60.0:
        return "3 Moderate"
    if value <= 84.0:
        return "4 Sufficient"
    if value <= 120.0:
        return "5 Bad"
    if value > 120.0:
        return "6 Very bad"
    # Reached only when every comparison was False, i.e. the input is NaN.
    return value
    
def SO2qual(value):
    """Translate an SO2 reading into a labelled air-quality class.

    Parameters
    ----------
    value : float
        Measured SO2 value.

    Returns
    -------
    str or float
        "1 Very good" for [0, 50], "2 Good" for (50, 100], "3 Moderate" for
        (100, 200], "4 Sufficient" for (200, 350], "5 Bad" for (350, 500] and
        "6 Very bad" above 500. A negative reading yields NaN; a NaN reading
        is returned unchanged.
    """
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    if value < 0.0:
        return np.nan
    # Bands are tested in ascending order, so only the upper bound is needed.
    if value <= 50.0:
        return "1 Very good"
    if value <= 100.0:
        return "2 Good"
    if value <= 200.0:
        return "3 Moderate"
    if value <= 350.0:
        return "4 Sufficient"
    if value <= 500.0:
        return "5 Bad"
    if value > 500.0:
        return "6 Very bad"
    # Reached only when every comparison was False, i.e. the input is NaN.
    return value

In [77]:
# Start from an empty frame; the per-pollutant percentage columns are added to it below.
descriptiveFrame = pd.DataFrame()

In [78]:
# Classify every reading and tabulate, per station, the percentage of hours
# that falls into each quality class (missing readings are counted as their
# own NaN class because of dropna=False).
percentageColumns = []
for pollutant in bigDataFrame.columns:
    descColumn = pollutant + ".desc"
    # Resolve the classifier once per pollutant rather than once per row.
    # Requires a notebook-level function named "<pollutant>qual".
    classify = globals()[pollutant + "qual"]
    reducedDataFrame[descColumn] = reducedDataFrame[pollutant].apply(classify)
    classCounts = reducedDataFrame.groupby(level="Station")[descColumn].value_counts(dropna=False)
    # Vectorised equivalent of applying (x / hours) * 100 to every count.
    percentageColumns.append(classCounts / float(hours) * 100)
# One concat after the loop: the table is rebuilt from scratch, so re-running
# this cell no longer appends duplicate columns to an existing frame.
descriptiveFrame = pd.concat(percentageColumns, axis=1)

In [79]:
# Quality labels present in the table, ordered from best to worst (the labels
# start with a digit, so plain string sorting gives that order).
# NaN entries are dropped first: NaN cannot be ordered against strings on
# Python 3, and an equality test against NaN never matches a row anyway.
qualities = sorted(descriptiveFrame.index.get_level_values(1).dropna().unique().tolist())

In [80]:
# Per-pollutant class columns that feed the overall rating.
descColumns = ["C6H6.desc", "CO.desc", "NO2.desc", "O3.desc", "PM10.desc",
               "PM25.desc", "SO2.desc"]
# Walk the labels in the order given by `qualities`; a later label overwrites
# an earlier one, so each row ends up with the last label found in any column.
for quality in qualities:
    hasQuality = reducedDataFrame[descColumns].eq(quality).any(axis=1)
    reducedDataFrame.loc[hasQuality, "overall"] = quality

In [81]:
# Relabel the percentage columns with the pollutant names. The assignment is
# positional: it relies on the columns having been added in bigDataFrame.columns
# order and on both having the same length.
descriptiveFrame.columns = bigDataFrame.columns

In [82]:
# Percentage of hours each station spends in every overall quality class.
overallCounts = reducedDataFrame.groupby(level="Station")["overall"].value_counts(dropna=False)
# Name the series explicitly: the name value_counts gives its result differs
# between pandas versions, and the column must be called "overall" below.
overall = (overallCounts / float(hours) * 100).rename("overall")
descriptiveFrame = pd.concat([descriptiveFrame, overall], axis=1)

In [100]:
# Station with the largest share of hours rated "6 Very bad" overall, and the
# station with the largest share rated "1 Very good" overall.
overallShare = descriptiveFrame["overall"]
worstPlace = overallShare.xs('6 Very bad', level=1).idxmax()
bestPlace = overallShare.xs('1 Very good', level=1).idxmax()

In [101]:
# Percentage breakdown by quality class for the station selected as worstPlace.
descriptiveFrame.xs(worstPlace, level=0)


Out[101]:
C6H6 CO NO2 O3 PM10 PM25 SO2 overall
NaN 82.636986 1.632420 5.502283 3.276256 16.803653 5.468037 6.107306 NaN
1 Very good NaN 80.011416 74.783105 18.264840 24.828767 30.605023 76.347032 2.157534
2 Good NaN 0.993151 2.340183 46.232877 33.493151 30.091324 0.182648 48.070776
3 Moderate NaN NaN 0.011416 13.835616 5.547945 9.977169 NaN 24.703196
4 Sufficient NaN NaN NaN 0.970320 1.164384 3.607306 NaN 4.668950
5 Bad NaN NaN NaN 0.057078 0.502283 1.837900 NaN 1.952055
6 Very bad NaN NaN NaN NaN 0.296804 1.050228 NaN 1.084475

In [102]:
# Percentage breakdown by quality class for the station selected as bestPlace.
descriptiveFrame.xs(bestPlace, level=0)


Out[102]:
C6H6 CO NO2 O3 PM10 PM25 SO2 overall
NaN 99.520548 1.586758 6.746575 99.520548 0.662100 99.520548 0.422374 NaN
1 Very good NaN 97.910959 90.000000 NaN 77.397260 NaN 99.098174 77.465753
2 Good NaN 0.022831 2.762557 NaN 20.913242 NaN NaN 21.495434
3 Moderate NaN NaN 0.011416 NaN 0.399543 NaN NaN 0.410959
4 Sufficient NaN NaN NaN NaN 0.136986 NaN NaN 0.136986
5 Bad NaN NaN NaN NaN 0.011416 NaN NaN 0.011416

In [104]:
# Show the station codes picked as worst and best.
worstPlace, bestPlace


Out[104]:
(u'PmKosTargo12', u'PmSopBitPl06')

Analysis restricted to stations in Tricity and Kashubia


In [73]:
# Station metadata; keep only the stations located in the listed towns.
stations = pd.read_excel("../input/Metadane_wer20160914.xlsx")
coolStation = [u'Gdańsk', u'Gdynia', u'Sopot', u'Kościerzyna']
selectedStations = stations[stations[u'Miejscowość'].isin(coolStation)]
# Collect the codes from both the "Kod stacji" (station code) and
# "Stary Kod stacji" (old station code) columns. Missing entries are dropped
# so the set holds only real codes. The original referenced the undefined
# name `selected_stations` here, which raised NameError.
stationCodes = (set(selectedStations[u'Kod stacji'].dropna())
                | set(selectedStations[u'Stary Kod stacji'].dropna()))

In [74]:
# Restrict the data to the selected stations. A boolean mask replaces the set
# used as a .loc indexer: current pandas rejects sets as indexers, and the
# mask keeps the rows in index order and ignores codes absent from the data.
# NOTE(review): this window (01:00 .. 2016-01-01 00:00) differs from the
# 00:00 .. 23:00 window used for the all-station selection - confirm which
# one is intended.
# NOTE(review): the execution counts suggest this cell was run before the
# classification cells that precede it in the notebook - confirm which
# selection the displayed results were computed from.
yearFrame = bigDataFrame['2015-01-01 01:00:00':'2016-01-01 00:00:00']
isSelected = yearFrame.index.get_level_values("Station").isin(list(stationCodes))
reducedDataFrame = yearFrame[isSelected].copy()

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: