In [117]:
import os
op = os.path
import pandas as pd
from pandas import DataFrame
import re
In [2]:
pwd
Out[2]:
In [3]:
raw_data_dir = "101年 北部空品區"
In [4]:
raw_xls_list = [op.join(raw_data_dir, f) for f in os.listdir(raw_data_dir) if f.endswith('.xls')]
In [111]:
def data_parser(df, loc):
for d in df.groupby('測項'):
#print(d[1].iloc[300, :].values)
df_part = d[1].iloc[:, 3:].convert_objects(convert_numeric=True)
yield DataFrame({
'loc': [loc] * 24,
'pollutant': [d[0]] * 24,
'hour': list(range(1, 25)),
'value': df_part.mean()
})
In [136]:
def parse_xls(file_path):
df = pd.read_excel(file_path, "Sheet1")
df_filtered = df[(df['測項'] == 'PM10')|(df['測項'] == 'SO2')|(df['測項'] == 'NO2')]
loc = re.search(r"/101年(.*)站_2013.*", file_path, re.UNICODE).group(1)
return pd.concat(list(data_parser(df_filtered, loc)))
In [137]:
# This step takes long time
df_list = [parse_xls(f) for f in raw_xls_list]
In [138]:
df_final = pd.concat(df_list)
In [139]:
df_final.to_csv("air_nothern_2012.csv", index=False)
In [ ]: