In [1]:
%matplotlib inline

In [2]:
import os, re
import pandas as pd
import numpy as np

In [3]:
wdir = "./weather/"
hash_date = {}
for fn in os.listdir(wdir):
    if re.search(".csv", fn):
        wfn = wdir+fn
        stat_date = fn.split(".")[0]
        stat = stat_date.split("-")[0]
        mdate = "-".join(stat_date.split("-")[1:])
        
        # check 是否是有 date 
        if mdate in hash_date:
            1
        else:
            hash_date[mdate] = {}
        
        # check 是否是有 stat 
        if stat in hash_date[mdate]:
            1
        else:
            hash_date[mdate][stat] = pd.read_csv(wfn, encoding="UTF8")

In [4]:
print ", ".join(hash_date.keys())


2014-09, 2014-08, 2014-05, 2014-04, 2014-07, 2014-06, 2014-01, 2014-03, 2014-02, 2013-04, 2013-08, 2013-09, 2015-10, 2015-11, 2013-05, 2013-06, 2013-07, 2013-01, 2013-02, 2013-03, 2015-12, 2016-10, 2016-11, 2016-12, 2015-06, 2015-07, 2015-04, 2015-05, 2015-02, 2015-03, 2015-01, 2014-12, 2014-10, 2014-11, 2015-08, 2015-09, 2013-12, 2013-11, 2013-10, 2016-07, 2016-06, 2016-05, 2016-04, 2016-03, 2016-02, 2016-01, 2016-00, 2016-09, 2016-08

In [5]:
no_stat = [ '467620', '467300', '467550', '466950', 'C0S730','C0H9C0' ]

df_weather = pd.DataFrame(columns=('date', 'avg_max_temp', 'avg_min_temp', 'avg_rain', 'max_wind'))
idx = 0

for month in hash_date.keys()[:]:
    for dd in hash_date[month][ hash_date[month].keys()[0] ].index:
        
        max_wind = 0
        avg_max_temp = []
        avg_min_temp = []
        avg_rain = []
        for stat in hash_date[month].keys()[:]:
            
            if stat in no_stat:
                continue 
                
            mpd = hash_date[month][stat]
            if dd in mpd.index:
                rows = [ x if not x == u'0.0' else 0. for x in mpd.loc[dd]]
                
                if rows[17] > max_wind:
                    max_wind = rows[17]
                    if max_wind>10:
                        print stat, max_wind
                avg_max_temp.append( rows[9])
                avg_min_temp.append( rows[11])
                avg_rain.append( rows[22])
        df_weather.loc[idx] = ["%s-%s"%(month, dd), np.average(avg_max_temp), np.average(avg_min_temp), np.average(avg_rain), max_wind]
        idx = idx + 1


/opt/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:20: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
467050 10.3
467050 10.3
467770 10.1
467050 10.6
467050 12.6
467050 10.7
467050 10.1
467050 11.4
467590 11.6
467050 11.0
467050 11.7
467050 11.5
467050 10.2
467050 11.9
467770 10.1
467050 11.0
467050 10.3
467050 10.1
467050 10.3
466910 10.8
467050 11.5
467050 10.4
467050 10.1
467050 10.3
467050 11.0
467050 11.5
467050 10.3
467050 10.2
467050 10.1
467050 11.2
467050 11.0
467050 10.5
467050 11.7
467050 11.6
467770 11.1
467050 11.0
467770 11.2
467050 11.5
467350 11.1
467060 16.3
467770 10.2
C0M710 10.1
467060 13.2
467050 14.6
467770 14.7
467050 10.6
C0D560 10.1
467050 10.2
467660 11.7
C0X310 12.0
467050 11.3
467050 10.3
467050 10.2
467050 11.0
467050 11.2
467050 11.2
C0K291 10.1
467050 10.9
467770 11.9
467050 10.2

In [6]:
df_weather['avg_max_temp'].plot()


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f992bfe7ed0>

In [7]:
df_weather.to_csv("testing_weather.csv")

In [8]:
df_weather = pd.read_csv("testing_weather.csv")

In [9]:
ax = df_weather[['avg_max_temp', 'avg_min_temp']].plot()
ax.set_xticklabels(df_weather.date)


Out[9]:
[<matplotlib.text.Text at 0x7f9928fc5650>,
 <matplotlib.text.Text at 0x7f9928f6de50>,
 <matplotlib.text.Text at 0x7f9928f67110>,
 <matplotlib.text.Text at 0x7f9928ef4a90>,
 <matplotlib.text.Text at 0x7f9928f06210>,
 <matplotlib.text.Text at 0x7f9928f06950>,
 <matplotlib.text.Text at 0x7f9928f0e0d0>,
 <matplotlib.text.Text at 0x7f9928f0e810>]

In [10]:
df_weather['avg_max_temp'].describe()


Out[10]:
count    1312.000000
mean       23.253370
std         4.979950
min         6.702500
25%        19.318690
50%        24.093119
75%        27.524567
max        31.835833
Name: avg_max_temp, dtype: float64

In [11]:
df_weather['avg_min_temp'].describe()


Out[11]:
count    1312.000000
mean       16.604408
std         4.490012
min         3.461345
25%        12.635548
50%        17.508903
75%        20.493597
max        23.673333
Name: avg_min_temp, dtype: float64

In [12]:
df_weather['avg_rain'].describe()


Out[12]:
count    1312.000000
mean        6.025007
std        13.125653
min         0.000000
25%         0.393332
50%         1.760844
75%         6.229387
max       172.504348
Name: avg_rain, dtype: float64

In [13]:
df_weather['max_wind'].describe()


Out[13]:
count    1312.000000
mean        5.865930
std         2.250408
min         2.100000
25%         4.000000
50%         5.500000
75%         7.500000
max        16.300000
Name: max_wind, dtype: float64

In [14]:
def label_max_temp(atemp):
    low_bound = df_weather['avg_max_temp'].describe()['25%']
    high_bound = df_weather['avg_max_temp'].describe()['75%']
    
    if atemp < low_bound:
        return "L"
    if atemp >= low_bound and atemp < high_bound:
        return "M"
    if atemp > high_bound:
        return "H"

In [15]:
def label_min_temp(atemp):
    low_bound = df_weather['avg_min_temp'].describe()['25%']
    high_bound = df_weather['avg_min_temp'].describe()['75%']
    
    if atemp < low_bound:
        return "L"
    if atemp >= low_bound and atemp < high_bound:
        return "M"
    if atemp > high_bound:
        return "H"

In [16]:
def label_avg_rain(atemp):
    low_bound = df_weather['avg_rain'].describe()['25%']
    high_bound = df_weather['avg_rain'].describe()['75%']
    
    if atemp < low_bound:
        return "L"
    if atemp >= low_bound and atemp < high_bound:
        return "M"
    if atemp > high_bound:
        return "H"

In [17]:
def label_max_wind(atemp):
    low_bound = df_weather['max_wind'].describe()['25%']
    high_bound = df_weather['max_wind'].describe()['75%']
    
    if atemp < low_bound:
        return "L"
    if atemp >= low_bound and atemp < high_bound:
        return "M"
    if atemp > high_bound:
        return "H"

In [18]:
df_weather['lbl_max_temp'] = [label_max_temp(x) for x in df_weather['avg_max_temp']]

In [19]:
df_weather['lbl_min_temp'] = [label_min_temp(x) for x in df_weather['avg_min_temp']]

In [20]:
df_weather['lbl_avg_rain'] = [label_avg_rain(x) for x in df_weather['avg_rain']]

In [21]:
df_weather['lbl_max_wind'] = [label_max_wind(x) for x in df_weather['max_wind']]

In [22]:
df_weather['max_wind'].describe()


Out[22]:
count    1312.000000
mean        5.865930
std         2.250408
min         2.100000
25%         4.000000
50%         5.500000
75%         7.500000
max        16.300000
Name: max_wind, dtype: float64

In [23]:
df_weather['lbl_max_wind'].value_counts()


Out[23]:
M    653
L    327
H    319
Name: lbl_max_wind, dtype: int64

In [24]:
df_weather.sort_values(['date'])


Out[24]:
Unnamed: 0 date avg_max_temp avg_min_temp avg_rain max_wind lbl_max_temp lbl_min_temp lbl_avg_rain lbl_max_wind
517 517 2013-01-0 11.743158 7.882105 3.627368 3.8 L L M L
518 518 2013-01-1 15.709474 10.197895 0.774737 5.5 L L M M
527 527 2013-01-10 15.695789 9.425263 2.901053 6.2 L L M M
528 528 2013-01-11 14.658947 11.333684 10.854737 8.3 L L H H
529 529 2013-01-12 14.546316 10.003158 11.464211 6.6 L L H M
530 530 2013-01-13 13.172632 8.042105 0.397895 8.3 L L M H
531 531 2013-01-14 16.256842 7.766316 0.005263 5.2 L L L M
532 532 2013-01-15 16.474737 8.344211 2.792632 5.0 L L M M
533 533 2013-01-16 12.858947 7.944211 1.929474 8.7 L L M H
534 534 2013-01-17 11.618947 7.433684 0.176842 7.9 L L L H
535 535 2013-01-18 15.727368 7.592632 0.138947 5.8 L L L M
536 536 2013-01-19 17.133684 9.269474 2.295789 4.1 L L M M
519 519 2013-01-2 13.447368 10.011579 9.847368 9.0 L L H H
537 537 2013-01-20 18.596842 10.765263 1.484211 3.4 L L M L
538 538 2013-01-21 17.998947 11.635789 0.366316 5.5 L L L M
539 539 2013-01-22 15.248421 11.616842 1.245263 7.3 L L M M
540 540 2013-01-23 16.349474 10.982105 0.385263 5.4 L L L M
541 541 2013-01-24 13.811579 10.100000 1.717895 8.7 L L M H
542 542 2013-01-25 14.873684 9.676842 1.147368 6.0 L L M M
543 543 2013-01-26 13.496842 9.575789 1.889474 7.8 L L M H
544 544 2013-01-27 14.854737 8.892632 0.281053 6.3 L L L M
545 545 2013-01-28 16.565263 8.896842 0.042105 3.7 L L L L
546 546 2013-01-29 17.130526 9.069474 0.226316 4.9 L L L M
520 520 2013-01-3 14.110526 9.501053 5.434737 7.2 L L M M
547 547 2013-01-30 18.287368 9.113684 0.000000 4.3 L L L M
521 521 2013-01-4 14.681053 10.364211 4.203158 6.9 L L M M
522 522 2013-01-5 16.835789 11.356842 2.687368 6.0 L L M M
523 523 2013-01-6 16.027368 11.806316 1.596842 7.0 L L M M
524 524 2013-01-7 14.078947 11.498947 7.029474 6.3 L L H M
525 525 2013-01-8 12.255789 9.985263 7.702105 8.3 L L H H
... ... ... ... ... ... ... ... ... ... ...
1108 1108 2016-07-13 30.635000 23.353333 1.634167 7.3 H H M M
1109 1109 2016-07-14 30.307500 23.150833 0.095833 8.9 H H L H
1110 1110 2016-07-15 30.099167 22.730000 0.600000 8.5 H H M H
1111 1111 2016-07-16 30.790000 23.463333 2.955833 5.1 H H M M
1112 1112 2016-07-17 30.262500 22.252500 10.691667 4.2 H H H M
1113 1113 2016-07-18 29.883333 21.750000 7.045833 5.8 H H H M
1114 1114 2016-07-19 29.780833 21.763333 1.669167 6.9 H H M M
1097 1097 2016-07-2 30.057500 21.993333 8.795833 6.1 H H H M
1115 1115 2016-07-20 29.335000 21.722500 0.889167 6.2 H H M M
1116 1116 2016-07-21 31.013333 23.029167 1.050000 5.3 H H M M
1117 1117 2016-07-22 31.088333 22.627500 0.191667 4.1 H H L M
1118 1118 2016-07-23 30.424167 22.732500 0.270833 2.9 H H L L
1119 1119 2016-07-24 29.905833 22.707500 0.729167 3.9 H H M L
1120 1120 2016-07-25 30.083333 22.594167 0.508333 3.1 H H M L
1121 1121 2016-07-26 31.258333 22.956667 0.583333 6.4 H H M M
1122 1122 2016-07-27 31.710000 22.810000 1.312500 5.3 H H M M
1123 1123 2016-07-28 31.499167 22.705000 3.995833 3.0 H H M L
1124 1124 2016-07-29 30.570000 22.405000 2.965833 4.4 H H M M
1098 1098 2016-07-3 30.404167 22.035000 4.891667 4.8 H H M M
1125 1125 2016-07-30 29.334167 21.688333 5.228333 7.3 H H M M
1099 1099 2016-07-4 30.235000 22.248333 0.186667 5.3 H H L M
1100 1100 2016-07-5 31.521667 23.673333 3.024167 4.6 H H M M
1101 1101 2016-07-6 29.758333 22.907500 4.900000 7.8 H H M H
1102 1102 2016-07-7 25.826667 21.942500 73.234167 12.0 M H H H
1103 1103 2016-07-8 28.195000 22.473333 41.610833 7.0 H H H M
1104 1104 2016-07-9 28.353333 22.444167 27.856667 5.6 H H H M
1308 1308 2016-08-0 28.482500 22.454167 22.309167 7.0 H H H M
1309 1309 2016-08-1 30.759167 22.625000 9.970833 5.1 H H H M
1310 1310 2016-08-2 30.290833 22.627500 1.808333 3.9 H H M L
1311 1311 2016-08-3 30.984167 23.248333 0.645833 3.9 H H M L

1312 rows × 10 columns


In [ ]: