In [ ]:
from imports import *
In [ ]:
import re
import datetime as dt
import time
from collections import defaultdict
import json
import requests
import requests_cache
from requests_cache import CachedSession
ss = CachedSession('pollen_darksky', backend='sqlite')
from bs4 import BeautifulSoup
# requests_cache.install_cache('pollen_darksky', backend='sqlite')
%matplotlib inline
# %mkdir cache
import joblib; mem = joblib.Memory(cachedir='cache2')  # disk cache for the scraped/parsed pollen tables ('location=' in newer joblib)
In [ ]:
from requests_cache.backends.redis import RedisDict
rd = RedisDict('red-cache')  # parsed Dark Sky JSON, keyed by request URL (filled by sync_rdd below)
import torch as T
%load_ext line_profiler
In [ ]:
import util.pollen_utils; reload(util.pollen_utils); from util.pollen_utils import (
pollen_url, pollen_date2df as pollen_date2df_, pollen_data as pollen_data_
)
import util.utils; reload(util.utils); from util.utils import (
check_one2one, yrmths, flatten_multindex, ends_with,
batchify, test_batchify, collapse_dims, test_collapse_dims_,
batch_getterer,
to_sub_seqs, test_to_sub_seqs,
ravel, repackage_hidden, mse
)
test_to_sub_seqs()
test_batchify()
pollen_date2df = mem.cache(pollen_date2df_)
pollen_data = mem.cache(pollen_data_)
test_collapse_dims_(T)
In [ ]:
pd.options.display.max_columns = 40
In [ ]:
# u = pollen_url.format(year=2014, month=1)
for yr in range(2000, 2018):
for m in range(1, 13):
        u = pollen_url.format(year=yr, month=m)
r = requests.get(u)
print(yr, m, end='; ')
In [ ]:
# Sometimes the Jan calendar will have counts from days at the
# beginning of Feb or the end of the previous Dec. Just checking
# that they agree with the numbers in those months' calendars
# before dropping the duplicate dates (a standalone sketch of this
# check follows this cell).
poldf = pollen_data(yrmths)
check_one2one(poldf, 'Date', 'Count')
poldf = poldf.drop_duplicates('Date').reset_index(drop=1)
poldf.loc[poldf.Count == -1, 'Count'] = np.nan
poldf = poldf.dropna(axis=0)
poldf.Date = pd.to_datetime(poldf.Date)
poldf = poldf.assign(
Yr =lambda x: x.Date.dt.year,
M =lambda x: x.Date.dt.month,
D =lambda x: x.Date.dt.day,
Doy=lambda x: x.Date.dt.dayofyear,
)
poldf['Dayi'] = (poldf.Date - poldf.Date.min()).dt.days
poldf['Prev_cnt'] = poldf.Count.shift(1).fillna(method='ffill').fillna(0) #.interpolate()
poldf['Prev_cnt_null'] = poldf.Dayi.shift() != (poldf.Dayi - 1)
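In [ ]:
# A standalone sketch (an assumption, not the util.utils implementation) of the
# invariant check_one2one is relied on for above: every Date maps to exactly one
# distinct Count, so drop_duplicates('Date') cannot discard conflicting data.
def dates_map_to_single_count(df):
    return df.groupby('Date').Count.nunique().le(1).all()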
In [ ]:
from pandas.util.testing import assert_frame_equal
In [ ]:
# poldf2 is the copy read back from cache/pollen.fth a couple of cells below;
# compare the freshly scraped frame against it as a sanity check.
No = len(poldf2)
In [ ]:
assert_frame_equal(poldf.reset_index(drop=1)[:No], poldf2.reset_index(drop=1))
In [ ]:
feather.write_dataframe(poldf, 'cache/pollen.fth')
In [ ]:
poldf2 = feather.read_dataframe('cache/pollen.fth')
# Evict the stale April 2017 calendar from the HTTP cache before re-fetching
u = pollen_url.format(year=2017, month=4)
ss.cache.delete_url(u)
r = requests.get(u)
# Clear the joblib caches so the calendars get re-parsed
!rm -r cache/joblib/util/pollen_utils/pollen_date2df/
!rm -r cache/joblib/util/pollen_utils/pollen_data/
for yr in range(2000, 2018):
for m in range(1, 13):
        u = pollen_url.format(year=yr, month=m)
r = requests.get(u)
print(yr, m, end='; ')
r = requests.get(u)
soup = BeautifulSoup(r.content, "lxml")
sel = 'div.calendar-row.calendar-row-4 > div > div > span.count > a'
soup.select_one(sel)
In [ ]:
with open('KEY.txt', 'r') as f:
    key = f.read().strip()

def mkds_url(sdate):
    """Dark Sky Time Machine URL for noon (local) on `sdate` at the Atlanta lat/lon."""
    loc = '33.7490,-84.3880'
    url_temp = 'https://api.darksky.net/forecast/{key}/{loc},{time}?exclude=flags'
    return url_temp.format(key=key, loc=loc, time=mktime(sdate))
In [ ]:
def date_rng_gen(start):
    """Yield dates walking backward one day at a time from `start`, one year per chunk."""
    if isinstance(start, str):
        start = dt.datetime.strptime(start, '%Y-%m-%d')
    while 1:
        rng = gen_yr_rng(start, backward=True)
        start = rng[-1]
        for d in rng[:-1]:
            yield d


def gen_yr_rng(start, backward=True):
    """Daily date range for the year ending at `start`, reversed when `backward`."""
    yr = start.year
    dprev = start.replace(year=yr - 1)
    rng = pd.date_range(start=dprev, end=start)
    if backward:
        return rng[::-1]
    return rng
def test_date_rng_gen():
ts = list(it.islice(date_rng_gen('2017-03-01'), 400))
a, b = ts[0], ts[-1]
assert (pd.date_range(b, a)[::-1] == ts).all()
s = pd.Series(ts)
assert s.value_counts(normalize=0).max() == 1
def mktime(s, hour=12):
    """Unix timestamp (seconds) for date `s`, pinned to `hour` (noon by default)."""
    try:
        d = dt.datetime.strptime(s, '%Y-%m-%d')
    except TypeError:
        d = s  # already a date/datetime
    if hour:
        d = d.replace(hour=hour)
    f = time.mktime(d.timetuple())
    return int(f)
test_date_rng_gen()
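In [ ]:
# Quick illustration (assumes itertools is available as `it` via the star import,
# as the test above does): the generator walks backward a day at a time.
list(it.islice(date_rng_gen('2017-03-01'), 3))
# -> three Timestamps: 2017-03-01, 2017-02-28, 2017-02-27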
In [ ]:
def camel2score(s):
    """Convert a camelCase field name to Capitalized_snake_case."""
    return s[0].upper() + ''.join([('_' + c.lower()) if c.isupper() else c for c in s[1:]])


def parse_data(j):
    """Split a Dark Sky response into (hourly DataFrame, daily dict, currently dict, metadata)."""
    hrs = j.pop('hourly')['data']
    js = json.dumps(hrs)
    hrdf = pd.read_json(js)  # .rename(columns=camel2score)
    _dl = j.pop('daily')
    [[dl]] = _dl.values()
    cr = j.pop('currently')
    # Shape check disabled since the hourly frame isn't always (24, 15):
    # assert hrdf.shape == (24, 15), 'Hr shape: {}'.format(hrdf.shape)
    assert sorted(j) == ['latitude', 'longitude', 'offset', 'timezone']
    return hrdf, dl, cr, j
# hrdf, dl, cr, j = parse_data(r.json())
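In [ ]:
# Sanity check (illustrative field name, not taken from a response above):
# camel2score maps Dark Sky's camelCase names to the Capitalized_snake_case used below.
assert camel2score('precipIntensity') == 'Precip_intensity'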
In [ ]:
today = '2017-04-05'
In [ ]:
# Pull new: fetch any Dark Sky days not already in the Redis dict or the sqlite HTTP cache
for d in date_rng_gen(today):
if d.month == d.day == 1:
print(d.date())
u = mkds_url(d)
if (u in rd) or ss.cache.has_url(u):
print('.', end='')
continue
r = ss.get(u)
if r.status_code == 403:
print('Forbidden')
break
if r.status_code != 200:
print(d, r.status_code)
In [ ]:
r = ss.get(u)
In [ ]:
r.url
In [ ]:
def sync_rdd(ss, rd, stdate='2017-03-01'):
    """Copy parsed JSON for already-cached Dark Sky responses from the sqlite-backed
    session `ss` into the Redis dict `rd`, walking backward from `stdate`."""
n = 0
for d in date_rng_gen(stdate):
if d.month == d.day == 1:
print(d.date(), end=' ')
u = mkds_url(d)
if u in rd:
continue
if not ss.cache.has_url(u):
break
r = ss.get(u)
assert u == r.url
rd[r.url] = r.json()
print('.', end='')
n += 1
return n
sync_rdd(ss, rd, stdate=today)
In [ ]:
def accum(stdate='2017-03-01'):
"""Roll through the dates, pull out cached
requests, and add parsed data to list"""
dat = []
for i, d in enumerate(date_rng_gen(stdate)):
# if i > 20:
# break
if d.month == d.day == 1:
print(d.date())
u = mkds_url(d)
# if ss.cache.has_url(u):
# r = ss.get(u)
# dat.append(parse_data(r.json()))
if u in rd:
j = rd[u]
parsed = parse_data(j)
dat.append(parsed)
# print('.', end='')
else:
break
    return dat
%lprun -f pd.read_json accum(stdate='2017-03-01')
In [ ]:
dat = accum(stdate='2017-04-04')
In [ ]:
hrdfs, dls, crs, _ = zip(*dat)
In [ ]:
dailydf = pd.read_json(json.dumps(dls)).rename(columns=camel2score)
In [ ]:
def concat(dfs):
    """Row-wise concatenation of frames that may have differing columns;
    columns missing from a frame are padded with None."""
all_cols = {c for df in dfs for c in df}
col_dat = defaultdict(list)
for cname in all_cols:
for df in dfs:
l = len(df)
col_dat[cname].extend(df.get(cname, [None] * l))
return DataFrame(col_dat)
%time hr_df = concat(hrdfs).rename(columns=camel2score)
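In [ ]:
# Why a hand-rolled concat: the hourly frames don't all share the same columns
# (Dark Sky omits some fields, e.g. precipType, when they don't apply), so missing
# columns are padded with None. Tiny illustration with made-up frames:
a = DataFrame({'x': [1, 2]})
b = DataFrame({'x': [3], 'y': [4]})
concat([a, b])  # -> x: [1, 2, 3], y: [None, None, 4]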
In [ ]:
feather.write_dataframe(dailydf, 'cache/dark_day.fth')
# feather.write_dataframe(hr_df, 'cache/dark_hr.fth')
In [ ]:
hr_dat[:2]
In [ ]:
def rep_with_dummies_(df, col):
    """Replace categorical column `col` with one 0/1 indicator column per category."""
    df = df.copy()
    newcs = pd.get_dummies(df[col])
    for c in newcs:
        df[c] = newcs[c]
    return df.drop(col, axis=1)


def rep_with_dummies(df, cols):
    """Apply rep_with_dummies_ for each column in `cols`."""
    for c in cols:
        df = rep_with_dummies_(df, c)
    return df
hr_dat2 = rep_with_dummies(hr_dat, ['Icon', 'Summary', 'Precip_type'])
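In [ ]:
# Note on the design: pd.get_dummies(df, columns=cols) is the built-in near-equivalent,
# but it prefixes the indicator columns (Icon_rain, ...) whereas rep_with_dummies_ keeps
# the bare category values as column names. Tiny illustration with a made-up frame:
ex = DataFrame({'Icon': ['rain', 'clear-day'], 'Temp': [50.0, 61.0]})
rep_with_dummies(ex, ['Icon']).columns.tolist()        # ['Temp', 'clear-day', 'rain']
pd.get_dummies(ex, columns=['Icon']).columns.tolist()  # ['Temp', 'Icon_clear-day', 'Icon_rain']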
In [ ]:
# NaN != NaN, so this marks the missing entries in each column
ns = ~(hr_dat == hr_dat)
ns.sum()
In [ ]:
ncols = ns.sum()[ns.sum() > 0]  # columns with any missing values
In [ ]:
cn = hr_dat.eval('Cloud_cover != Cloud_cover').astype(int)  # 1 where Cloud_cover is missing
In [ ]:
hr_dat2.corrwith(cn).sort_values(ascending=True)  # which features co-occur with missing Cloud_cover
In [ ]:
hr_dat2[:3]