In [26]:
import os
import datetime as dt
import time
import re
# Extract hourly wind speeds from saved daily weather pages (one HTML file per
# day, named YYYYMMDD.html in the current directory) and append them to the
# text file 'histwind' as lines of the form "YYYY-MM-DD H:MM AM,<speed>".

# Inclusive range of days to process.
startdate = dt.date(2016, 7, 7)
enddate = dt.date(2016, 9, 15)

# Integer string. Unused in this script; kept in case other code relies on it.
r = re.compile('[0-9]+')
# First integer or decimal number sitting directly between '>' and '<'
# (optionally padded with spaces), i.e. the text of an HTML element.
d = re.compile('>[ ]*([0-9]+[.]?[0-9]*)[ ]*<')
# 12-hour clock time such as "12:53 AM".
t = re.compile('[0-9]{1,2}:[0-9]{2} [AP]M')


def parse_wind_page(lines):
    """Return ``(times, windspeeds)`` extracted from one day's HTML lines.

    The page is scanned line by line. A line containing 'heading-cell' and a
    time string starts a record; the wind speed is expected exactly five
    lines after it, on a 'data-cell' line carrying the "mph" unit span.

    Args:
        lines: iterable of text lines (an open file works).

    Returns:
        A tuple of two lists of strings: the time of each record and the wind
        speed values found. The lists differ in length when a record has no
        numeric wind speed, which lets the caller detect an incomplete page.
    """
    times = []
    windspeeds = []
    # Any start value >= 5 works: it guarantees the "5th line after a
    # heading" test cannot fire before the first heading has been seen.
    lines_after_heading = 40
    for line in lines:
        lines_after_heading += 1
        if (lines_after_heading == 5
                and 'data-cell' in line
                and '<span class="table-unit">mph</span>' in line):
            value = d.search(line)
            # A cell without a number (e.g. text instead of a speed) is
            # skipped rather than crashing on None.group().
            if value is not None:
                windspeeds.append(value.group(1))
                continue
        if 'heading-cell' in line:
            found_time = t.search(line)
            if found_time is not None:
                times.append(found_time.group())
                lines_after_heading = 0
    return times, windspeeds


curdate = startdate
while curdate <= enddate:
    windspeeds = []
    times = []
    datestr = str(curdate).replace('-', '')
    filename = datestr + '.html'
    try:
        with open(filename, 'r') as html:
            times, windspeeds = parse_wind_page(html)
    except FileNotFoundError:
        # One missing day should not abort the whole date range.
        print('no file found for: ', filename, ' ...skipping...')
    else:
        if len(windspeeds) != len(times):
            # A mismatch means the page did not have the expected layout, so
            # times and values cannot be paired reliably.
            print('times and wind speed records have different sizes for: ',
                  filename, ' ...skipping...')
        else:
            # Sizes match: assume the extraction is correct and store it.
            with open('histwind', 'a') as out:
                out.writelines(
                    str(curdate) + ' ' + timestamp + ',' + windspeed + '\n'
                    for timestamp, windspeed in zip(times, windspeeds)
                )
    # Always advance to the next day, including when the day was skipped;
    # otherwise a single bad file would make the loop spin forever.
    curdate += dt.timedelta(days=1)
In [ ]: