In [1]:
%matplotlib inline
In [2]:
import os, re
import pandas as pd
import numpy as np
In [3]:
# Load every station CSV found under wdir into a nested mapping:
#     hash_date[<date part of file name>][<station part>] -> DataFrame
# File names are split as "<station>-<date>.<ext>": the text before the first
# "." is cut at its first "-" into the station id and the date key.
wdir = "./weather/"
hash_date = {}
for fn in os.listdir(wdir):
    # Test the real extension. A regex such as ".csv" treats the dot as a
    # wildcard and matches anywhere in the name (e.g. "xcsv-notes.txt").
    if not fn.endswith(".csv"):
        continue
    wfn = os.path.join(wdir, fn)
    stat_date = fn.split(".")[0]
    stat, _, mdate = stat_date.partition("-")
    # Create the per-date bucket on first sight of this date key.
    stations = hash_date.setdefault(mdate, {})
    # Keep the first file seen for a given station/date pair.
    if stat not in stations:
        stations[stat] = pd.read_csv(wfn, encoding="UTF8")
In [4]:
# List the date keys for which at least one station file was loaded.
# Written as a call with a single argument so it prints the same text on
# Python 2 and Python 3.
print(", ".join(hash_date.keys()))
In [5]:
# Build df_weather: one row per (date key, row label) holding the mean of the
# per-station maximum temperature, minimum temperature and rainfall, plus the
# largest wind value seen across stations.

# Station ids left out of the aggregation.
no_stat = [ '467620', '467300', '467550', '466950', 'C0S730','C0H9C0' ]

# Positions of the values read from each station row.
# NOTE(review): meanings are inferred from the variables they feed below;
# confirm them against the CSV header.
MAX_TEMP_COL = 9
MIN_TEMP_COL = 11
MAX_WIND_COL = 17
RAIN_COL = 22

# Collect plain records and build the frame once at the end; assigning
# df.loc[idx] row by row re-allocates the frame on every insert.
weather_records = []
for month in list(hash_date):
    stations = hash_date[month]
    # The first station stored for this date key supplies the row labels
    # that are aggregated (same choice as indexing keys()[0] on Python 2).
    reference_station = next(iter(stations))
    for dd in stations[reference_station].index:
        max_wind = 0
        avg_max_temp = []
        avg_min_temp = []
        avg_rain = []
        for stat in list(stations):
            if stat in no_stat:
                continue
            mpd = stations[stat]
            if dd not in mpd.index:
                continue
            # Normalise the text value u'0.0' to a numeric zero; every other
            # cell is passed through untouched.
            rows = [0. if x == u'0.0' else x for x in mpd.loc[dd]]
            if rows[MAX_WIND_COL] > max_wind:
                max_wind = rows[MAX_WIND_COL]
                # Diagnostic: report the station behind an unusually large reading.
                if max_wind > 10:
                    print("%s %s" % (stat, max_wind))
            avg_max_temp.append(rows[MAX_TEMP_COL])
            avg_min_temp.append(rows[MIN_TEMP_COL])
            avg_rain.append(rows[RAIN_COL])
        weather_records.append([
            "%s-%s" % (month, dd),
            np.average(avg_max_temp),
            np.average(avg_min_temp),
            np.average(avg_rain),
            max_wind,
        ])

df_weather = pd.DataFrame(
    weather_records,
    columns=['date', 'avg_max_temp', 'avg_min_temp', 'avg_rain', 'max_wind'],
)
In [6]:
# Quick visual check of the aggregated average maximum temperature.
df_weather.avg_max_temp.plot()
Out[6]:
In [7]:
# Persist the aggregated frame (the row index is written as the first column).
df_weather.to_csv(path_or_buf="testing_weather.csv")
In [8]:
# Reload the aggregated data from disk; no index column is designated, so the
# frame gets a fresh default integer index.
df_weather = pd.read_csv(filepath_or_buffer="testing_weather.csv")
In [9]:
# Plot the average max/min temperature with dates on the x axis.
ax = df_weather[['avg_max_temp', 'avg_min_temp']].plot()
# Pin the tick positions before labelling them. set_xticklabels on its own
# hands the labels, in order, to whatever ticks matplotlib happened to pick,
# so the dates would not line up with the plotted rows.
tick_step = max(1, len(df_weather) // 10)
tick_rows = list(range(0, len(df_weather), tick_step))
ax.set_xticks(df_weather.index[tick_rows])
ax.set_xticklabels(df_weather['date'].iloc[tick_rows], rotation=45)
Out[9]:
In [10]:
# Summary statistics of the average maximum temperature.
df_weather.avg_max_temp.describe()
Out[10]:
In [11]:
# Summary statistics of the average minimum temperature.
df_weather.avg_min_temp.describe()
Out[11]:
In [12]:
# Summary statistics of the average rainfall.
df_weather.avg_rain.describe()
Out[12]:
In [13]:
# Summary statistics of the maximum wind value.
df_weather.max_wind.describe()
Out[13]:
In [14]:
def label_max_temp(atemp, low_bound=None, high_bound=None):
    """Bucket an average maximum temperature into "L", "M" or "H".

    Args:
        atemp: the value to classify.
        low_bound: lower threshold; defaults to the 25% quartile of
            df_weather['avg_max_temp'].
        high_bound: upper threshold; defaults to the 75% quartile of
            df_weather['avg_max_temp'].

    Returns:
        "L" when atemp < low_bound, "M" when low_bound <= atemp < high_bound,
        "H" when atemp >= high_bound, and None when atemp cannot be ordered
        against the bounds (e.g. NaN).
    """
    if low_bound is None or high_bound is None:
        # A single describe() call supplies both quartiles.
        summary = df_weather['avg_max_temp'].describe()
        if low_bound is None:
            low_bound = summary['25%']
        if high_bound is None:
            high_bound = summary['75%']
    if atemp < low_bound:
        return "L"
    if atemp < high_bound:
        return "M"
    # ">=" so that a value sitting exactly on the upper quartile is labelled
    # instead of matching no branch.
    if atemp >= high_bound:
        return "H"
    return None
In [15]:
def label_min_temp(atemp, low_bound=None, high_bound=None):
    """Bucket an average minimum temperature into "L", "M" or "H".

    Args:
        atemp: the value to classify.
        low_bound: lower threshold; defaults to the 25% quartile of
            df_weather['avg_min_temp'].
        high_bound: upper threshold; defaults to the 75% quartile of
            df_weather['avg_min_temp'].

    Returns:
        "L" when atemp < low_bound, "M" when low_bound <= atemp < high_bound,
        "H" when atemp >= high_bound, and None when atemp cannot be ordered
        against the bounds (e.g. NaN).
    """
    if low_bound is None or high_bound is None:
        # A single describe() call supplies both quartiles.
        summary = df_weather['avg_min_temp'].describe()
        if low_bound is None:
            low_bound = summary['25%']
        if high_bound is None:
            high_bound = summary['75%']
    if atemp < low_bound:
        return "L"
    if atemp < high_bound:
        return "M"
    # ">=" so that a value sitting exactly on the upper quartile is labelled
    # instead of matching no branch.
    if atemp >= high_bound:
        return "H"
    return None
In [16]:
def label_avg_rain(atemp, low_bound=None, high_bound=None):
    """Bucket an average rainfall value into "L", "M" or "H".

    Args:
        atemp: the rainfall value to classify (the parameter name is kept
            for compatibility with existing callers).
        low_bound: lower threshold; defaults to the 25% quartile of
            df_weather['avg_rain'].
        high_bound: upper threshold; defaults to the 75% quartile of
            df_weather['avg_rain'].

    Returns:
        "L" when atemp < low_bound, "M" when low_bound <= atemp < high_bound,
        "H" when atemp >= high_bound, and None when atemp cannot be ordered
        against the bounds (e.g. NaN).
    """
    if low_bound is None or high_bound is None:
        # A single describe() call supplies both quartiles.
        summary = df_weather['avg_rain'].describe()
        if low_bound is None:
            low_bound = summary['25%']
        if high_bound is None:
            high_bound = summary['75%']
    if atemp < low_bound:
        return "L"
    if atemp < high_bound:
        return "M"
    # ">=" so that a value sitting exactly on the upper quartile is labelled
    # instead of matching no branch.
    if atemp >= high_bound:
        return "H"
    return None
In [17]:
def label_max_wind(atemp, low_bound=None, high_bound=None):
    """Bucket a maximum wind value into "L", "M" or "H".

    Args:
        atemp: the wind value to classify (the parameter name is kept for
            compatibility with existing callers).
        low_bound: lower threshold; defaults to the 25% quartile of
            df_weather['max_wind'].
        high_bound: upper threshold; defaults to the 75% quartile of
            df_weather['max_wind'].

    Returns:
        "L" when atemp < low_bound, "M" when low_bound <= atemp < high_bound,
        "H" when atemp >= high_bound, and None when atemp cannot be ordered
        against the bounds (e.g. NaN).
    """
    if low_bound is None or high_bound is None:
        # A single describe() call supplies both quartiles.
        summary = df_weather['max_wind'].describe()
        if low_bound is None:
            low_bound = summary['25%']
        if high_bound is None:
            high_bound = summary['75%']
    if atemp < low_bound:
        return "L"
    if atemp < high_bound:
        return "M"
    # ">=" so that a value sitting exactly on the upper quartile is labelled
    # instead of matching no branch.
    if atemp >= high_bound:
        return "H"
    return None
In [18]:
# Attach the L/M/H bucket of each row's average maximum temperature.
df_weather['lbl_max_temp'] = list(map(label_max_temp, df_weather['avg_max_temp']))
In [19]:
# Attach the L/M/H bucket of each row's average minimum temperature.
df_weather['lbl_min_temp'] = list(map(label_min_temp, df_weather['avg_min_temp']))
In [20]:
# Attach the L/M/H bucket of each row's average rainfall.
df_weather['lbl_avg_rain'] = list(map(label_avg_rain, df_weather['avg_rain']))
In [21]:
# Attach the L/M/H bucket of each row's maximum wind value.
df_weather['lbl_max_wind'] = list(map(label_max_wind, df_weather['max_wind']))
In [22]:
# Re-display the wind statistics next to the label counts that follow.
df_weather.max_wind.describe()
Out[22]:
In [23]:
# How many rows fall into each wind bucket.
df_weather.lbl_max_wind.value_counts()
Out[23]:
In [24]:
# Show the rows in date order. 'date' is text of the form "<date key>-<n>"
# with an unpadded trailing number, so a plain string sort would place
# "...-10" before "...-2"; split the number off and sort on it numerically.
date_parts = df_weather['date'].str.rsplit('-', n=1, expand=True)
(df_weather
 .assign(_date_key=date_parts[0], _row_number=date_parts[1].astype(int))
 .sort_values(['_date_key', '_row_number'])
 .drop(['_date_key', '_row_number'], axis=1))
Out[24]:
In [ ]: