In [26]:

    
import os
import datetime as dt
import time
import re

# Inclusive range of days to process; one '<YYYYMMDD>.html' file is read per day.
startdate = dt.date(2016, 7, 7)
enddate = dt.date(2016, 9, 15)

r = re.compile('[0-9]+')  # run of digits (not used by the code below)
d = re.compile('>[ ]*([0-9]+[.]?[0-9]*)[ ]*<')  # number sitting between '>' and '<'
t = re.compile('[0-9]{1,2}:[0-9]{2} [AP]M')  # time string such as "12:53 PM"


def extract_wind_records(lines):
    """Collect the time and wind-speed strings found in one page's lines.

    A line containing 'heading-cell' together with a time string starts a
    new row.  The wind speed is taken from the 5th line after that heading,
    provided the line is a 'data-cell' carrying the mph unit span.

    Args:
        lines: iterable of text lines (an open file works).

    Returns:
        A ``(times, windspeeds)`` tuple of two lists of strings.  The lists
        have different lengths when a row's wind speed is absent or has no
        numeric value; callers should treat that as a failed extraction.
    """
    times = []
    windspeeds = []
    lines_after_heading = 40  # any value larger than 5 means "no heading seen yet"

    for line in lines:
        lines_after_heading += 1

        # The wind speed is on the 5th line after the heading cell.
        if lines_after_heading == 5:
            if 'data-cell' in line and '<span class="table-unit">mph</span>' in line:
                value = d.search(line)
                # A cell without a number is left out instead of raising;
                # the resulting length mismatch flags the page to the caller.
                if value is not None:
                    windspeeds.append(value.group(1))
                continue

        # Otherwise look for the heading cell that carries the row's time.
        if 'heading-cell' in line:
            heading_time = t.search(line)
            if heading_time is not None:
                times.append(heading_time.group())
                lines_after_heading = 0

    return times, windspeeds


if __name__ == '__main__':
    curdate = startdate

    while curdate <= enddate:
        datestr = str(curdate).replace('-', '')
        filename = datestr + '.html'

        with open(filename, 'r') as html:
            times, windspeeds = extract_wind_records(html)

        if len(windspeeds) != len(times):
            # Sizes differ, so the extraction cannot be trusted: skip this day.
            print('times and wind speed records have diffrent sizes for: ',filename,' ...skipping...')
        else:
            # Sizes match, so we assume the data was extracted correctly.
            with open('histwind', 'a') as out:
                # 'timestr' rather than 'time', so the imported time module
                # is not rebound to a string.
                for timestr, windspeed in zip(times, windspeeds):
                    out.write(str(curdate) + ' ' + timestr + ',' + windspeed + '\n')

        # Always advance, including after a skipped day; a 'continue' before
        # this line would retry the same file forever.
        curdate += dt.timedelta(days=1)