To access the data and run the notebooks, you must perform the following steps:
1) Navigate to http://tiny.cc/JOC-XX-XXXX, where XX-XXXX is the manuscript number on Manuscript Central.
2) Download the file named data.tar.gz. It has the following checksum information:
3) Extract (gunzip and untar) the folder 'data' into the mcs_future folder. These files represent post-processed data rather than the entire dataset, because the three datasets and the raw derived data are very large and the training and classification take a long time to run. However, applying the methods of this paper to the publicly available data will give you the same results. If you downloaded the entire GitHub project, the directories for the data folder and its subfolders should be:
/MCS/mcs_future/data/raw_data
/MCS/mcs_future/data/shapefiles
/MCS/mcs_future/data/slice_data
/MCS/mcs_future/data/wrf_data
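For reference, the archive can be verified and unpacked with Python's standard library. This is a minimal sketch: the checksum algorithm (md5) and the extraction destination are assumptions to adjust for your setup, and the expected digest should be taken from the checksum information published with the manuscript.
In [ ]:
import hashlib
import tarfile

# Compute the archive's digest and compare it by eye against the
# checksum published with the manuscript (algorithm assumed to be md5)
with open("data.tar.gz", "rb") as f:
    print("md5:", hashlib.md5(f.read()).hexdigest())

# Unpack the 'data' folder into mcs_future (path assumes the repository
# layout shown above; adjust to your checkout location)
with tarfile.open("data.tar.gz", "r:gz") as tar:
    tar.extractall(path="MCS/mcs_future")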
In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import sys
sys.path.append('C:/users/ahaberlie1/documents/github/MCS/')
data_dir = "../data/slice_data/"
data = {'mcs': {'OBS': None, 'CTRL': None, 'PGW': None},
        'qlcs': {'OBS': None, 'CTRL': None, 'PGW': None},
        'non_qlcs': {'OBS': None, 'CTRL': None, 'PGW': None}}

for subset in ['mcs', 'qlcs', 'non_qlcs']:
    filename = data_dir + "new_{}_project.csv".format(subset)
    df = pd.read_csv(filename)
    df['datetime'] = pd.to_datetime(df.datetime)
    df = df.set_index('datetime')
    for dataset in ['OBS', 'CTRL', 'PGW']:
        data[subset][dataset] = df[df.run.values == dataset].copy()
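As an optional sanity check that the CSVs loaded and split as expected, the slice and storm counts for each subset/dataset pair can be printed (a minimal sketch against the `data` dictionary built above; the `storm_num` column is the one used throughout the cells below):
In [ ]:
# Report the number of slices and unique storms per subset/dataset pair
for subset, runs in data.items():
    for dataset, df_ in runs.items():
        print(subset, dataset,
              "slices:", len(df_),
              "storms:", df_.storm_num.nunique())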
In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import cartopy
import cartopy.crs as ccrs
import cartopy.io.shapereader as shpreader
def draw_states(ax):
    # Draw U.S. state boundaries on the given map axis
    shapename = 'admin_1_states_provinces_lakes_shp'
    states_shp = shpreader.natural_earth(resolution='50m',
                                         category='cultural', name=shapename)
    reader = shpreader.Reader(states_shp)
    for state, info in zip(reader.geometries(), reader.records()):
        if info.attributes['admin'] == 'United States of America':
            ax.add_geometries([state], ccrs.PlateCarree(),
                              facecolor='None', edgecolor='k')

def draw_midwest(ax):
    # Draw the Midwest analysis-domain outline on the given map axis
    shapename = "../data/shapefiles/map/midwest_outline_latlon_grids"
    shp = shpreader.Reader(shapename)
    for outline, info in zip(shp.geometries(), shp.records()):
        ax.add_geometries([outline], ccrs.PlateCarree(),
                          facecolor='None', edgecolor='k', linewidth=4)

def generate_view(w_lon, e_lon, s_lat, n_lat, from_proj, to_proj):
    # Build a map axis in to_proj, set its extent, and draw U.S. states.
    # Callers pass the extent as (west, east, south, north); from_proj is
    # unused here but kept for call-site compatibility.
    view = plt.axes([0, 0, 1, 1], projection=to_proj)
    view.set_extent([w_lon, e_lon, s_lat, n_lat])
    draw_states(view)
    return view
In [3]:
from netCDF4 import Dataset
#grid lats and lons
nc = Dataset("../data/wrf_data/RALconus4km_wrf_constants.nc")
lons = nc.variables['XLONG'][:,:]
lats = nc.variables['XLAT'][:,:]
In [4]:
sys.path.append('C:/users/ahaberlie/documents/github/MCS/mcs_future')
import cartopy
import cartopy.crs as ccrs
from mcs_future.utils.mapping_help import wrf_to_lon_lat
import matplotlib.pyplot as plt
from netCDF4 import Dataset
%matplotlib inline
plt.rcParams['figure.figsize'] = 10, 10
from_proj = ccrs.PlateCarree()
to_proj = ccrs.AlbersEqualArea(central_longitude=-95, central_latitude=38.0000)
view = generate_view(-120, -73, 20, 50, from_proj, to_proj)
subset = 'mcs'
dataset = 'OBS'
df_ = data[subset][dataset].copy()
df_ = df_[df_.index.month.isin([6,7,8])]
for sid, swath in df_.groupby('storm_num'):
    xp = np.array([np.mean([x1, x2]) for (x1, x2) in zip(swath.xmin.values, swath.xmax.values)])
    yp = np.array([np.mean([y1, y2]) for (y1, y2) in zip(swath.ymin.values, swath.ymax.values)])
    xp, yp = wrf_to_lon_lat(lons, lats, xp.astype(int), yp.astype(int))
    view.plot(xp, yp, 'k-', lw=0.5, transform=from_proj)
view.set_title("Subset: " + subset + " Dataset: " + dataset)
In [5]:
import cartopy
import cartopy.crs as ccrs
# NOWrad_to_lon_lat is used in the OBS branch below; it is assumed to live
# in the same module as wrf_to_lon_lat
from mcs_future.utils.mapping_help import wrf_to_lon_lat, NOWrad_to_lon_lat
import matplotlib.pyplot as plt
from netCDF4 import Dataset
%matplotlib inline
plt.rcParams['figure.figsize'] = 10, 10
from_proj = ccrs.PlateCarree()
to_proj = ccrs.AlbersEqualArea(central_longitude=-95, central_latitude=38.0000)
view = generate_view(-120, -73, 20, 50, from_proj, to_proj)
subset = 'mcs'
dataset = 'CTRL'
df_ = data[subset][dataset]
df_ = df_[df_.index.month.isin([6,7,8])]
for sid, swath in df_.groupby('storm_num'):
    xp = np.array([np.mean([x1, x2]) for (x1, x2) in zip(swath.xmin.values, swath.xmax.values)])
    yp = np.array([np.mean([y1, y2]) for (y1, y2) in zip(swath.ymin.values, swath.ymax.values)])
    if dataset == 'OBS':
        xp, yp = NOWrad_to_lon_lat(xp, yp)
    else:
        xp, yp = wrf_to_lon_lat(lons, lats, xp.astype(int), yp.astype(int))
    view.plot(xp, yp, 'k-', lw=0.5, transform=from_proj)
view.set_title("Subset: " + subset + " Dataset: " + dataset)
In [6]:
import sys
import geopandas as gpd
from mcs_future.utils.mapping_help import get_point_subset
mw_shp_dir = "../data/shapefiles/map/"
wrf_dir = "../data/wrf_data/"
wrf_data_file = "RALconus4km_wrf_constants.nc"
mw_data = {'mcs': {'OBS': None, 'CTRL': None, 'PGW': None},
           'qlcs': {'OBS': None, 'CTRL': None, 'PGW': None},
           'non_qlcs': {'OBS': None, 'CTRL': None, 'PGW': None}}

outline = gpd.read_file(mw_shp_dir + "midwest_outline_latlon_grids.shp")

for sub_key, sub_value in data.items():
    for dset_key, dset_val in sub_value.items():
        mw_data[sub_key][dset_key] = get_point_subset(dset_val.copy(),
                                                      outline, wrf_dir + wrf_data_file)
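get_point_subset is provided by the repository and is not reproduced here; conceptually, it retains only the slices that fall within the Midwest outline. As a rough illustration, a hypothetical centroid-based point-in-polygon filter with shapely (not the repository's actual implementation) might look like:
In [ ]:
from shapely.geometry import Point

def centroid_in_polygon_subset(df_, outline_gdf, lons, lats):
    # Hypothetical sketch of a spatial filter; the repository's
    # get_point_subset may differ in detail
    xc = ((df_.xmin.values + df_.xmax.values) // 2).astype(int)
    yc = ((df_.ymin.values + df_.ymax.values) // 2).astype(int)
    # Map grid indices to lon/lat via the WRF coordinate arrays
    pts = [Point(lons[y, x], lats[y, x]) for x, y in zip(xc, yc)]
    poly = outline_gdf.unary_union
    return df_[[poly.contains(p) for p in pts]]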
In [7]:
view = generate_view(-120, -73, 20, 50, from_proj, to_proj)
subset = 'qlcs'
dataset = 'OBS'
df_ = mw_data[subset][dataset].copy()
df_ = df_[df_.index.month.isin([6,7,8])]
for sid, swath in df_.groupby('storm_num'):
    xp = np.array([np.mean([x1, x2]) for (x1, x2) in zip(swath.xmin.values, swath.xmax.values)])
    yp = np.array([np.mean([y1, y2]) for (y1, y2) in zip(swath.ymin.values, swath.ymax.values)])
    xp, yp = wrf_to_lon_lat(lons, lats, xp.astype(int), yp.astype(int))
    view.plot(xp, yp, 'k-', lw=0.5, transform=from_proj)
view.set_title("Midwest -- Subset: " + subset + " Dataset: " + dataset)
draw_midwest(view)
In [8]:
view = generate_view(-120, -73, 20, 50, from_proj, to_proj)
subset = 'qlcs'
dataset = 'CTRL'
df_ = mw_data[subset][dataset]
df_ = df_[df_.index.month.isin([6,7,8])]
for sid, swath in df_.groupby('storm_num'):
    xp = np.array([np.mean([x1, x2]) for (x1, x2) in zip(swath.xmin.values, swath.xmax.values)])
    yp = np.array([np.mean([y1, y2]) for (y1, y2) in zip(swath.ymin.values, swath.ymax.values)])
    if dataset == 'OBS':
        xp, yp = NOWrad_to_lon_lat(xp, yp)
    else:
        xp, yp = wrf_to_lon_lat(lons, lats, xp.astype(int), yp.astype(int))
    view.plot(xp, yp, 'k-', lw=0.5, transform=from_proj)
view.set_title("Midwest -- Subset: " + subset + " Dataset: " + dataset)
draw_midwest(view)
In [9]:
import datetime

def calc_day_count(dframe):
    # Resample to daily bins and count the rows in each day
    days = dframe.resample('D').count()
    # Return the number of days with at least one row
    return np.sum(days['storm_num'].values > 0)
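To illustrate what calc_day_count measures, and why the cells below shift timestamps forward by six hours before counting, here is a small synthetic example (hypothetical data):
In [ ]:
# Three slices spanning an evening and the following early morning
demo = pd.DataFrame({'storm_num': [1, 1, 2]},
                    index=pd.to_datetime(['2010-06-10 18:00',
                                          '2010-06-10 21:00',
                                          '2010-06-11 03:00']))
print(calc_day_count(demo))  # 2 -- the slices touch two calendar days

# Adding 6 hours centers 18z on 00z, so the overnight event
# counts as a single day
demo.index = demo.index + datetime.timedelta(hours=6)
print(calc_day_count(demo))  # 1 -- all slices now fall on 2010-06-11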
In [233]:
# Read in all observed QLCS events
dataset = 'OBS'
subset = 'qlcs'
# Work on a copy of this dataset: we will modify the index
# (a datetime) to simplify the day-count calculation
a = mw_data[subset][dataset].copy()
# Add 6 hours to the datetime so 18z is now centered at 00z
a.index = a.index + datetime.timedelta(hours=6)
# Set the year, month, and range of days to examine
year = 2010
month = 6
day_range = np.array(list(range(10, 16)))
# Make a temporary dataframe for this month/year
b = a[(a.index.year == year) & (a.index.month == month) & (a.index.day.isin(day_range))]
# Select 8 or fewer days; otherwise the color list below runs out
colors = ['b', 'c', 'g', 'k', 'm', 'r', 'grey', 'y']
lines = []
labels = []
#set up view centered on midwest
view = generate_view(-105, -80, 30, 50, from_proj, to_proj)
#plot tracks by day
for color, day in zip(colors, day_range):
    df_ = b[b.index.day == day]
    for sid, sw in df_.groupby('storm_num'):
        if len(sw) > 0:
            xp = np.array([np.mean([x1, x2]) for (x1, x2) in zip(sw.xmin.values, sw.xmax.values)])
            yp = np.array([np.mean([y1, y2]) for (y1, y2) in zip(sw.ymin.values, sw.ymax.values)])
            xp, yp = wrf_to_lon_lat(lons, lats, xp.astype(int), yp.astype(int))
            view.plot(xp, yp, 'x', markersize=15, color=color, transform=from_proj)
            view.plot(xp, yp, '-', lw=3, color=color, transform=from_proj,
                      label=subset.upper() + " " + str(sid) + " " + str(year) + "-" + str(month).zfill(2) + "-" + str(day).zfill(2))
view.set_title(subset.upper() + " tracks (colored by day)")
draw_midwest(view)
c = a[(a.index.year == year) & (a.index.month == month) & (a.index.day.isin(day_range))]
print(subset + " day counts:", calc_day_count(c),
      "; fraction of days examined:", calc_day_count(c) / len(day_range))
print(subset + " swath counts:", c.storm_num.nunique())
view.legend(loc=3)
Out[233]:
In [199]:
import numpy as np
for sub_key in ['mcs', 'qlcs', 'non_qlcs']:
    for dset_key in ['OBS', 'CTRL', 'PGW']:
        # Copy the desired subset/dataset pair
        dset_val = mw_data[sub_key][dset_key].copy()
        # Get a running count of slices per storm
        dset_val['storm_run'] = dset_val.groupby(['storm_num']).cumcount()
        # Shift 18z to 00z for analysis purposes
        dset_val.index = dset_val.index + datetime.timedelta(hours=6)
        total_count = calc_day_count(dset_val)
        seasons = {'djf': (12, 1, 2), 'mam': (3, 4, 5), 'jja': (6, 7, 8), 'son': (9, 10, 11)}
        for season_id, season_vals in seasons.items():
            dset_ = dset_val[dset_val.index.month.isin(season_vals)].copy()
            perc = "{0:.2f}".format(100 * (calc_day_count(dset_) / total_count))
            seasons[season_id] = perc
        print(sub_key, dset_key, "Total Count:", total_count,
              ", % DJF", seasons['djf'], ', % MAM', seasons['mam'],
              ', % JJA', seasons['jja'], ', % SON', seasons['son'], '\n')
In [12]:
from calendar import month_name as mname
col_names = ['key'] + [str(x) for x in range(1, 13)]
months = {x:[] for x in col_names}
for sub_key in ['mcs', 'qlcs', 'non_qlcs']:
    for dset_key in ['OBS', 'CTRL', 'PGW']:
        dset_val = mw_data[sub_key][dset_key].copy()
        # Shift 18z to 00z for analysis purposes
        dset_val.index = dset_val.index + datetime.timedelta(hours=6)
        # Total count of event days for this subset/dataset pair
        total_count = calc_day_count(dset_val)
        for month in range(1, 13):
            # Only data from this month
            dset_ = dset_val[dset_val.index.month == month].copy()
            # Append the percentage to the appropriate dictionary location
            months[str(month)].append(100 * (calc_day_count(dset_) / total_count))
        months['key'].append(sub_key + "_" + dset_key)

df = pd.DataFrame.from_dict(months)
df = df.set_index('key')

for gid, group in df.groupby(df.index):
    max_idx = mname[int(group.idxmax(axis=1).values[0])]
    min_idx = mname[int(group.idxmin(axis=1).values[0])]
    max_val = group.max(axis=1).values[0]
    min_val = group.min(axis=1).values[0]
    print(gid, "\nMax Month:", max_idx, "\nMax Percentage:", "{:0.2f}".format(max_val), "%",
          "\nMin Month:", min_idx, "\nMin Percentage:", "{:0.2f}".format(min_val), "%",
          "\nMax Percentage Diff", "{:0.2f}".format(max_val - min_val), "%\n")
df[col_names[1:]]
Out[12]:
In [191]:
import datetime
from pandas.plotting import register_matplotlib_converters as pdtc
import matplotlib.dates as mdates
import warnings
import matplotlib.cbook
warnings.filterwarnings("ignore",category=matplotlib.cbook.mplDeprecation)
pdtc()
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = 15, 30
plt.rcParams['xtick.labelsize'] = 20
plt.rcParams['ytick.labelsize'] = 20
plt.rcParams['axes.labelsize'] = 20
month_doy = {'Jan':1, 'Feb':32, 'Mar':60,
'Apr':91, 'May':121, 'Jun':152,
'Jul':182, 'Aug':213, 'Sep':244,
'Oct':274, 'Nov':305, 'Dec':335}
ax = plt.subplot(1, 1, 1)
drange = pd.date_range(start=datetime.datetime(2000, 10, 1), end=datetime.datetime(2013, 12, 31), freq='D')
colors = [plt.cm.plasma(x/3) for x in range(0, 3)]
def draw_cumu_labels(ax):
    # Legend, shaded season bands, grid, and y-axis label for the
    # cumulative-count panels
    ax.legend(loc=4, prop={'size': 20})
    ax.axvspan(datetime.datetime(2012, 1, 1), datetime.datetime(2012, 3, 1), facecolor='0.5', alpha=0.1)
    ax.axvspan(datetime.datetime(2012, 6, 1), datetime.datetime(2012, 9, 1), facecolor='0.5', alpha=0.1)
    ax.axvspan(datetime.datetime(2012, 12, 1), datetime.datetime(2012, 12, 31), facecolor='0.5', alpha=0.1)
    ax.grid()
    ax.set_ylabel("Cumulative Count")
    return ax
aug_obs = 0
may_obs = 0
feb_obs = 0
aug_ctrl = 0
may_ctrl = 0
feb_ctrl = 0
for subplot, subset in enumerate(['mcs', 'qlcs', 'non_qlcs']):
    ctrl_ = None
    obs_ = None
    maxyear = 0
    for dset_num, dataset in enumerate(['OBS', 'CTRL', 'PGW']):
        ax = plt.subplot(3, 1, subplot + 1)
        # Get run and dataset from the Midwest subset
        df_ = mw_data[subset][dataset].copy()
        df_.index = df_.index + datetime.timedelta(hours=6)
        # Drop the partial years at the ends of the record
        df_ = df_[~((df_.index.year == 2000) | (df_.index.year == 2013))]
        years = df_.groupby(df_.index.year)
        dset = []
        for yid, year in years:
            a = year.resample('D').count()
            rng = pd.DataFrame(index=pd.date_range(start=str(yid) + '-01-01', end=str(yid) + '-12-31', freq='1D'))
            rng = rng.join(a[['storm_num']])
            rng = rng.fillna(0)
            # Binarize: 1 if the day had at least one slice, else 0
            rng['storm_num'] = 1 * (rng['storm_num'].values > 0)
            dset.append(rng.cumsum())
        dset = pd.concat(dset)
        dset['doy'] = dset.index.dayofyear
        p25 = []
        p75 = []
        mean = []
        for did, doy in dset.groupby('doy'):
            p25.append(np.percentile(doy.storm_num, 25, interpolation='linear'))
            p75.append(np.percentile(doy.storm_num, 75, interpolation='linear'))
            mean.append(np.mean(doy.storm_num))
        # Day 366 exists only in leap years; carry the day-365 value forward
        mean[-1] = mean[-2]
        p25[-1] = p25[-2]
        p75[-1] = p75[-2]
        ax.fill_between(rng.index.to_pydatetime(), p25, p75, color=colors[dset_num], alpha=0.2)
        ax.plot(rng.index.to_pydatetime(), mean, '-', color=colors[dset_num], linewidth=4, label=dataset)
        draw_cumu_labels(ax)
        if dataset == 'OBS':
            aug_obs = dset[(dset.index.month == 8) & (dset.index.day == 31)]['storm_num'].mean()
            feb_obs = []
            for year in range(2001, 2013):
                if year not in [2004, 2008, 2012]:
                    feb_obs.append(dset[(dset.index.year == year) &
                                        (dset.index.month == 2) &
                                        (dset.index.day == 28)]['storm_num'].values[0])
                else:
                    # Leap years: use 29 February
                    feb_obs.append(dset[(dset.index.year == year) &
                                        (dset.index.month == 2) &
                                        (dset.index.day == 29)]['storm_num'].values[0])
            feb_obs = np.mean(feb_obs)
            may_obs = dset[(dset.index.month == 5) & (dset.index.day == 31)]['storm_num'].mean()
        elif dataset == 'CTRL':
            aug_ctrl = dset[(dset.index.month == 8) & (dset.index.day == 31)]['storm_num'].mean()
            feb_ctrl = []
            for year in range(2001, 2013):
                if year not in [2004, 2008, 2012]:
                    feb_ctrl.append(dset[(dset.index.year == year) &
                                         (dset.index.month == 2) &
                                         (dset.index.day == 28)]['storm_num'].values[0])
                else:
                    feb_ctrl.append(dset[(dset.index.year == year) &
                                         (dset.index.month == 2) &
                                         (dset.index.day == 29)]['storm_num'].values[0])
            feb_ctrl = np.mean(feb_ctrl)
            may_ctrl = dset[(dset.index.month == 5) & (dset.index.day == 31)]['storm_num'].mean()
    # Cumulative-count differences between OBS and CTRL for this subset
    jan_sep_dif_obs = aug_obs
    mar_may_dif_obs = may_obs - feb_obs
    aug_jun_dif_obs = aug_obs - may_obs
    jan_sep_dif_ctrl = aug_ctrl
    mar_may_dif_ctrl = may_ctrl - feb_ctrl
    aug_jun_dif_ctrl = aug_ctrl - may_ctrl
    print("JJA OBS: ", subset, aug_jun_dif_obs)
    print("JJA CTRL: ", subset, aug_jun_dif_ctrl)
    print("Jan-Sept OBS - CTRL Diff for:", subset, jan_sep_dif_obs - jan_sep_dif_ctrl)
    print("MAM OBS - CTRL Diff for:", subset, mar_may_dif_obs - mar_may_dif_ctrl)
    print("JJA OBS - CTRL Diff for:", subset, aug_jun_dif_obs - aug_jun_dif_ctrl)
    ax.set_xlim(rng.index[0], rng.index[-1])
    ax.set_ylabel("Cumulative " + subset.upper() + " Days", fontsize=20)
    # Tick each month and label it with its abbreviated name
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
plt.savefig("Figure1.tif", dpi=400, bbox_inches='tight')
In [16]:
col_names = ['key'] + [str(x) for x in range(0, 24)]
hours = {x: [] for x in col_names}

for sub_key in ['mcs', 'qlcs', 'non_qlcs']:
    for dset_key in ['OBS', 'CTRL', 'PGW']:
        dset_val = mw_data[sub_key][dset_key].copy()
        dset_val = dset_val[dset_val.index.month.isin([6, 7, 8])]
        # 92 JJA days x 13 years = 1196 possible event days
        total_count = 1196
        for hr in range(0, 24):
            dset_ = dset_val[dset_val.index.hour == hr]
            hours[str(hr)].append(100 * (calc_day_count(dset_) / total_count))
        hours['key'].append(sub_key + "_" + dset_key)

df = pd.DataFrame.from_dict(hours)
df = df.set_index('key')

for gid, group in df.groupby(df.index):
    max_idx = group.idxmax(axis=1).values[0]
    min_idx = group.idxmin(axis=1).values[0]
    max_val = group.max(axis=1).values[0]
    min_val = group.min(axis=1).values[0]
    print(gid, "\nMax:", max_idx.zfill(2), "z - Max %", max_val,
          "\nMin:", min_idx.zfill(2), "z - Min %", min_val, '\nMax % Diff', max_val - min_val, '\n')
df
Out[16]:
In [198]:
shrs = [str(x) for x in range(0, 24)]
col_names = ['subset', 'dset'] + shrs
hours = {x:[] for x in col_names}
for sub_key in ['mcs', 'qlcs', 'non_qlcs']:
    for dset_key in ['OBS', 'CTRL', 'PGW']:
        dset_val = mw_data[sub_key][dset_key].copy()
        dset_val = dset_val[dset_val.index.month.isin([6, 7, 8])].copy()
        for hr in range(0, 24):
            dset_ = dset_val[dset_val.index.hour == hr].copy()
            hours[str(hr)].append(calc_day_count(dset_))
        hours['subset'].append(sub_key)
        hours['dset'].append(dset_key)

df = pd.DataFrame.from_dict(hours)

for sset in ['mcs', 'qlcs', 'non_qlcs']:
    obs = df[(df.dset == 'OBS') & (df.subset == sset)]
    ctrl = df[(df.dset == 'CTRL') & (df.subset == sset)]
    diff = np.abs(obs[shrs].values - ctrl[shrs].values)
    max_idx = np.argmax(diff)
    min_idx = np.argmin(diff)
    # Express the differences as days per year over the 13-year record
    max_val = np.max(diff) / 13
    min_val = np.min(diff) / 13
    print(sset, "\nMax:", max_idx, "z - Max ", max_val,
          "\nMin:", min_idx, "z - Min ", min_val)
print("\nTotals")
print("\n", df[col_names[:10]])
print("\n", df[col_names[:2] + col_names[10:18]])
print("\n", df[col_names[:2] + col_names[18:]])
In [20]:
plt.rcParams['figure.figsize'] = 15, 30
plt.rcParams['xtick.labelsize'] = 20
plt.rcParams['ytick.labelsize'] = 20
plt.rcParams['axes.labelsize'] = 20
def draw_hour_labels(ax, lab):
    ax.set_xlim(0, 23)
    ax.legend(prop={'size': 20})
    ax.set_ylabel(lab + " Days Per Hour")
    ax.set_xlabel("Hour of Day (UTC)")
    ax.set_xticks(list(range(0, 24)))
    #ax.set_title(lab + " Swath Count by Hour of Day", fontsize=20)
    ax.grid()
    return ax
colors = [plt.cm.plasma(x/3) for x in range(0, 3)]
for subplot, subset in enumerate(['mcs', 'qlcs', 'non_qlcs']):
    for dset_num, dataset in enumerate(['OBS', 'CTRL', 'PGW']):
        ax = plt.subplot(3, 1, subplot + 1)
        df_ = mw_data[subset][dataset].copy()
        df_ = df_[df_.index.month.isin([6, 7, 8])].copy()
        years = df_.groupby(df_.index.year)
        # One row per year of record, one column per hour of day
        year_data = np.zeros((13, 24), dtype=int)
        for yid, year in years:
            hour_data = []
            for hid in range(0, 24):
                hour = year[year.index.hour == hid]
                hour_data.append(calc_day_count(hour))
            year_data[yid - 2001, :] = np.array(hour_data)
        p25 = np.percentile(year_data, 25, axis=0)
        p75 = np.percentile(year_data, 75, axis=0)
        plt.fill_between(list(range(24)), p25, p75, color=colors[dset_num], alpha=0.2)
        plt.plot(np.mean(year_data, axis=0), '-', color=colors[dset_num], linewidth=4, label=dataset)
        ax = draw_hour_labels(ax, subset.upper())
plt.savefig("Figure2.tif", dpi=400, bbox_inches='tight')