In [1]:
from distributed import Executor, hdfs, progress, wait, s3
# Connect to the distributed scheduler listening on localhost:8786. The bare
# `e` on the last line displays the executor's summary as the cell output.
e = Executor('localhost:8786')
e


Out[1]:
<Executor: scheduler=localhost:8786 workers=16 threads=16>

In [2]:
# Read every export file in the bucket whose name matches 201401*. The files
# are tab-separated and have no header row, so columns are numbered, not named.
df = s3.read_csv(
    's3://blaze-data/gdelt/csv/201401*.export.csv',
    header=None,
    sep='\t',
)


Setting global dask scheduler to use distributed

In [3]:
# Persist the dataframe through the executor and rebind `df` to the result, so
# the cells below work against the persisted collection.
df = e.persist(df)

In [4]:
# Show a progress display for the persisted dataframe.
progress(df)

In [5]:
# Preview the first five raw rows. Column labels are plain integers because the
# CSV was read with header=None.
df.head(5)


Out[5]:
0 1 2 3 4 5 6 7 8 9 ... 48 49 50 51 52 53 54 55 56 57
0 281451675 20040104 200401 2004 2004.011 NaN NaN NaN NaN NaN ... GM 1 Germany GM GM 51.00000 9.000 GM 20140101 http://www.bankinfosecurity.com/nsa-reacts-to-...
1 281451676 20040104 200401 2004 2004.011 IDN INDONESIA IDN NaN NaN ... NaN 4 Jakarta, Jakarta Raya, Indonesia ID ID04 -6.17444 106.829 -2679652 20140101 http://www.gulf-times.com/asean-philippines/18...
2 281451677 20040104 200401 2004 2004.011 IDN INDONESIA IDN NaN NaN ... NaN 4 Bali, Jawa Timur, Indonesia ID ID08 -7.10460 112.337 10205777 20140101 http://www.themalaymailonline.com/world/articl...
3 281451678 20040104 200401 2004 2004.011 IDN INDONESIA IDN NaN NaN ... 10205777 4 Jakarta, Jakarta Raya, Indonesia ID ID04 -6.17444 106.829 -2679652 20140101 http://www.skynews.com.au/world/article.aspx?i...
4 281451679 20040104 200401 2004 2004.011 IDN INDONESIA IDN NaN NaN ... AS 4 Jakarta, Jakarta Raya, Indonesia ID ID04 -6.17444 106.829 -2679652 20140101 http://www.skynews.com.au/world/article.aspx?i...

5 rows × 58 columns


In [6]:
# Keep only the columns needed for the map, selected by position. The label
# beside each index is the name it is given when the columns are renamed.
gts = df[[
    1,   # Date
    26,  # Code
    0,   # ID
    51,  # Country
    3,   # Year
    53,  # Latitude
    54,  # Longitude
]]

In [7]:
# Replace the positional column labels with readable names. The order must
# match the order in which the columns were selected into `gts`.
gts.columns = [
    'Date',
    'Code',
    'ID',
    'Country',
    'Year',
    'Latitude',
    'Longitude',
]

In [8]:
# Check that the renamed columns line up with the expected values.
gts.head()


Out[8]:
Date Code ID Country Year Latitude Longitude
0 20040104 30 281451675 GM 2004 51.00000 9.000
1 20040104 190 281451676 ID 2004 -6.17444 106.829
2 20040104 190 281451677 ID 2004 -7.10460 112.337
3 20040104 190 281451678 ID 2004 -6.17444 106.829
4 20040104 190 281451679 ID 2004 -6.17444 106.829

In [9]:
# Keep only the events whose Year is 2014. The preview of the raw data shows
# rows from other years (e.g. 2004) in the same files.
gts = gts[gts.Year == 2014]

In [10]:
# Confirm that only Year == 2014 rows remain.
gts.head()


Out[10]:
Date Code ID Country Year Latitude Longitude
2246 20140101 10 281454155 AF 2014 34.5167 69.1833
2247 20140101 10 281454156 TS 2014 34.0000 9.0000
2248 20140101 20 281454157 AF 2014 34.5167 69.1833
2249 20140101 30 281454158 AF 2014 34.5167 69.1833
2250 20140101 36 281454159 AF 2014 33.0000 65.0000

In [11]:
# Event codes to keep when filtering on the `Code` column.
# NOTE(review): these look like CAMEO codes for economic cooperation / aid
# events (0211, 0231, 0311, 0331, 061, 071) with leading zeros lost because the
# column was parsed as integers -- confirm against the GDELT/CAMEO codebook.
event_codes = [
    211,
    231,
    311,
    331,
    61,
    71,
]

In [12]:
# Keep only the events whose Code is one of the selected event codes.
gts = gts[gts.Code.isin(event_codes)]

In [13]:
# Confirm that only rows with the selected event codes remain.
gts.head()


Out[13]:
Date Code ID Country Year Latitude Longitude
2334 20140101 61 281454243 GR 2014 39.0000 22.0000
2335 20140101 61 281454244 US 2014 38.5111 -96.8005
2339 20140101 71 281454248 AL 2014 41.0000 20.0000
2447 20140101 61 281454356 CH 2014 39.9289 116.3880
2448 20140101 71 281454357 AS 2014 -27.0000 133.0000

In [14]:
# Keep only the events whose Country code is 'US'.
gts = gts[gts.Country == 'US']

In [15]:
# Confirm that only 'US' rows remain.
# NOTE(review): some rows show Latitude/Longitude of 0.0 -- presumably missing
# coordinates; verify before trusting their position on the map.
gts.head()


Out[15]:
Date Code ID Country Year Latitude Longitude
2335 20140101 61 281454244 US 2014 38.5111 -96.8005
2522 20140101 61 281454431 US 2014 0.0000 0.0000
2524 20140101 61 281454433 US 2014 0.0000 0.0000
2626 20140101 311 281454535 US 2014 38.4199 -117.1220
2857 20140101 231 281454766 US 2014 38.0000 -97.0000

In [16]:
import numpy as np
# Materialise both coordinate columns with a single compute(). Converting each
# dask column with its own np.array() call evaluates the whole filter chain
# twice; computing once halves that work and guarantees that `lat` and `lon`
# come from the same result, row for row.
coords = gts[['Latitude', 'Longitude']].compute()
lat = np.array(coords.Latitude)
lon = np.array(coords.Longitude)

In [17]:
from bokeh.io import output_notebook, output_file, show
from bokeh.models import (
  GMapPlot, GMapOptions, ColumnDataSource, Circle, DataRange1d, PanTool, WheelZoomTool, BoxSelectTool
)

In [18]:
# Google-Maps background: a road map centred on lat 30.29 / lng -97.73 at zoom
# level 11. The plot is titled "Austin".
map_options = GMapOptions(
    lat=30.29,
    lng=-97.73,
    map_type="roadmap",
    zoom=11,
)

# Both axes get their own DataRange1d instance.
plot = GMapPlot(
    title="Austin",
    map_options=map_options,
    x_range=DataRange1d(),
    y_range=DataRange1d(),
)

In [19]:
# Wrap the coordinate arrays in a Bokeh data source. Glyphs refer to them by
# the column names "lat" and "lon".
source = ColumnDataSource(data={"lat": lat, "lon": lon})

In [20]:
# Draw each event as a blue, mostly opaque dot with no outline, positioned by
# the "lon"/"lat" columns of `source`. add_glyph stays on the last line so the
# renderer it returns is shown as the cell output.
circle = Circle(
    x="lon",
    y="lat",
    size=15,
    fill_color="blue",
    fill_alpha=0.8,
    line_color=None,
)
plot.add_glyph(source, circle)


Out[20]:
<bokeh.models.renderers.GlyphRenderer at 0x7f6cb5a61090>

In [21]:
# Render Bokeh output inline in the notebook. To write a standalone HTML page
# instead, call output_file('map_plot.html') here.
output_notebook()


BokehJS successfully loaded.

In [22]:
# Add pan, wheel-zoom and box-select tools to the map.
plot.add_tools(
    PanTool(),
    WheelZoomTool(),
    BoxSelectTool(),
)

In [23]:
# Display the finished map plot.
show(plot)


Out[23]:
<bokeh.io._CommsHandle at 0x7f6cb69a8bd0>