I pulled the data from the USGS Advanced National Seismic System Comprehensive Earthquake Catalog.
In [ ]:
import datetime as dt
import graphlab as gl
sf = gl.SFrame.read_csv('raw_data/global_earthquakes.csv', verbose=False)
In [ ]:
sf.show()
In [ ]:
useful_columns = ['time', 'latitude', 'longitude', 'mag', 'type', 'location']
sf = sf[useful_columns]
In [ ]:
mask = sf['type'] == 'nuclear explosion'
sf[mask]
In [ ]:
mask = sf['type'] == 'earthquake'
sf = sf[mask]
sf = sf.remove_column('type')
print "Number of earthquake events:", sf.num_rows()
In [ ]:
sf['time'] = sf['time'].str_to_datetime(str_format='%Y-%m-%dT%H:%M:%s%ZP')
sf['time'] = sf['time'].apply(lambda x: x.replace(tzinfo=None))
In [ ]:
quakes = gl.TimeSeries(sf, index='time')
In [ ]:
quakes.print_rows(3)
In [ ]:
quakes[4:7].print_rows()
In [ ]:
quakes[['latitude', 'longitude']].print_rows(3)
In [ ]:
start = dt.datetime(2014, 5, 1)
end = dt.datetime(2014, 5, 2)
quakes.slice(start, end).print_rows(3)
In [ ]:
print "Earliest timestamp:", quakes.min_time
print "Latest timestamp:", quakes.max_time
print "Timestamp range:", quakes.range
In [ ]:
print "Index column:", quakes.index_col_name
print "Value columns:", quakes.value_col_names
In [ ]:
print quakes.index[:3]
In [ ]:
big_one = quakes.argmax('mag')
quakes[big_one]
In [ ]:
sf2 = quakes.to_sframe()
print type(sf2)
In [ ]:
sf_recent = gl.SFrame.read_csv('raw_data/global_earthquakes_recent.csv', verbose=False)
# Trim away the columns we're not interested in.
sf_recent = sf_recent[useful_columns]
# Remove any non-earthquake events.
mask = sf_recent['type'] == 'earthquake'
sf_recent = sf_recent[mask]
sf_recent = sf_recent.remove_column('type')
# Convert the timestamp to a `datetime` type.
sf_recent['time'] = sf_recent['time'].str_to_datetime(str_format='%Y-%m-%dT%H:%M:%s%ZP')
sf_recent['time'] = sf_recent['time'].apply(lambda x: x.replace(tzinfo=None))
# Convert to a `TimeSeries` object.
recent_quakes = gl.TimeSeries(sf_recent, index='time')
recent_quakes.print_rows(3)
SFrame.append.TimeSeries.union enforces time order.
In [ ]:
all_quakes = quakes.union(recent_quakes)
print all_quakes.min_time
print all_quakes.max_time
In [ ]:
grp = quakes.group('location')
print grp
The group_info SFrame tells us what the group names are and how many observations are in each group.
In [ ]:
grp.group_info().topk('group_size', k=8)
The get_group method lets us isolate just the observations for any group.
In [ ]:
oklahoma_quakes = grp.get_group('Oklahoma')
oklahoma_quakes.print_rows(3)
The date_part of a TimeSeries object let's us specify components of a datetime value.
In [ ]:
grp = quakes.group(quakes.date_part.HOUR)
hour_counts = grp.group_info()
hour_counts.print_rows(5)
In [ ]:
import matplotlib.pyplot as plt
%matplotlib notebook
plt.style.use('ggplot')
fig, ax = plt.subplots()
ax.bar(hour_counts['time.hour'], hour_counts['group_size'], color='dodgerblue')
ax.set_xlabel('Hour of the day')
ax.set_ylabel('Number of earthquakes')
fig.show()
Four things happen with the resample method:
In [ ]:
import graphlab.aggregate as agg
daily_stats = quakes.resample(period=dt.timedelta(days=1),
upsample_method='none',
downsample_method={'count': agg.COUNT('latitude'),
'avg_mag': agg.MEAN('mag'),
'max_mag': agg.MAX('mag')})
daily_stats['count'] = daily_stats['count'].fillna(0)
daily_stats.print_rows(5)
In [ ]:
def compute_daily_stats(data):
daily = data.resample(period=dt.timedelta(days=1),
upsample_method='none',
downsample_method={'count': agg.COUNT('latitude'),
'avg_mag': agg.MEAN('mag'),
'max_mag': agg.MAX('mag')})
daily['count'] = daily['count'].fillna(0)
return daily
In [ ]:
# Save the daily counts and recent daily counts.
daily_stats.save('working_data/global_daily_stats.ts')
compute_daily_stats(recent_quakes).save('working_data/global_daily_update.ts')
In [ ]:
# Filter just the Oklahoma data from the recent events.
grp = recent_quakes.group('location')
recent_oklahoma_quakes = grp.get_group('Oklahoma')
In [ ]:
# Compute daily stats for the Oklahoma quake events.
compute_daily_stats(oklahoma_quakes).save('working_data/ok_daily_stats.ts')
compute_daily_stats(recent_oklahoma_quakes).save('working_data/ok_daily_update.ts')