In [117]:
import os
op = os.path
import pandas as pd
from pandas import DataFrame
import re

In [2]:
pwd


Out[2]:
'/Users/liang/code/2013-RConf-ggplot2-intro/Rcode/play_air_map'

In [3]:
raw_data_dir = "101年 北部空品區"

In [4]:
raw_xls_list = [op.join(raw_data_dir, f) for f in os.listdir(raw_data_dir) if f.endswith('.xls')]

In [111]:
def data_parser(df, loc):
    for d in df.groupby('測項'):
        #print(d[1].iloc[300, :].values)
        df_part = d[1].iloc[:, 3:].convert_objects(convert_numeric=True)
        yield DataFrame({
            'loc': [loc] * 24,
            'pollutant': [d[0]] * 24,
            'hour': list(range(1, 25)),
            'value': df_part.mean()
        })

In [136]:
def parse_xls(file_path):
    df = pd.read_excel(file_path, "Sheet1")
    df_filtered = df[(df['測項'] == 'PM10')|(df['測項'] == 'SO2')|(df['測項'] == 'NO2')]
    loc = re.search(r"/101年(.*)站_2013.*", file_path, re.UNICODE).group(1)
    return pd.concat(list(data_parser(df_filtered, loc)))

In [137]:
# This step takes long time
df_list = [parse_xls(f) for f in raw_xls_list]

In [138]:
df_final = pd.concat(df_list)

In [139]:
df_final.to_csv("air_nothern_2012.csv", index=False)

In [ ]: