In [26]:
# Kabul is RC Capital
from IPython.display import Image
map_image_path = 'data/map.jpg'
Image(filename=map_image_path)
Out[26]:
In [27]:
# Pandas contains useful functions for data structures with "relational" or "labeled" data
import pandas

# Column names for the raw CSV (it has no header row), as suggested
# by WikiLeaks: https://wikileaks.org/afg/
# by the Guardian: http://www.theguardian.com/world/datablog/2010/jul/25/wikileaks-afghanistan-data
header = [
    'ReportKey',                     # find messages and also to reference them
    'DateOccurred', 'EventType',
    'Category',                      # describes what kind of event the message is about
    'TrackingNumber', 'Title',       # internal tracking number and title
    'Summary',                       # actual description of the event
    'Region',                        # broader region of the event, RC = regional command
    'AttackOn',                      # who was attacked during an event
    'ComplexAttack',                 # attack was a larger operation requiring more planning, coordination and preparation
    'ReportingUnit', 'UnitName', 'TypeOfUnit',   # information on the military unit that authored the report
    'FriendlyWounded', 'FriendlyKilled', 'HostNationWounded', 'HostNationKilled', 'CivilianWounded', 'CivilianKilled',
    'EnemyWounded', 'EnemyKilled', 'EnemyDetained',   # who was killed/wounded/captured
    'MilitaryGridReferenceSystem', 'Latitude', 'Longitude',   # location
    'OriginatorGroup', 'UpdatedByGroup',   # message originated from or was updated by
    'CommandersCriticalInformationRequirements',
    'Significant',                   # analyzed and evaluated by special group in command centre
    'Affiliation',                   # event was of friendly, neutral or enemy nature
    'DisplayColor',                  # enemy activity - RED, friendly activity - BLUE, afghan/neutral activity (accidents, drugs etc.) - GREEN
    'ClassificationLevel'            # classification level of the message, e.g.: Secret
]

data = pandas.read_csv('data/afg.csv', header=None, names=header)
# normalize case in these columns, see problems: https://wardiaries.wikileaks.org/search/?sort=date
for column in ('Category', 'Title'):
    data[column] = data[column].str.lower()
data.head()
Out[27]:
Extract a glossary of military terms from the Guardian: http://www.theguardian.com/world/datablog/2010/jul/25/wikileaks-afghanistan-war-logs-glossary
In [28]:
# generate the glossary
import bs4, lxml, re, requests
link = 'http://www.theguardian.com/world/datablog/2010/jul/25/wikileaks-afghanistan-war-logs-glossary'
response = requests.get(link)
try:
if not response.ok:
print 'HTTP error {} trying to fetch Guradian glossary: {}'.format(response.status_code, link)
else:
glossary = dict()
soup = bs4.BeautifulSoup(response.content, 'lxml')
glossary_table = soup.find('table')
for row in glossary_table.find_all('tr'):
cells = row.find_all("td")
if len(cells) == 2:
if cells[0].string:
key = str(cells[0].string.strip().lower())
content = cells[1].text
glossary[key] = content
except requests.exceptions.ConnectionError as e:
'Connection error {} on {}'.format(e, link)
print glossary['afg']
In [29]:
data['DateOccurred'] = pandas.to_datetime(data['DateOccurred'])
data['Year'] = [date.year for date in data['DateOccurred']]
data['Hour'] = [date.hour for date in data['DateOccurred']]
#Number of rows/columns
print "Number of rows: %d" % data.shape[0]
print "Number of columns: %d" % data.shape[1]
date_range = set()
for date in data['DateOccurred']:
date_range.add(date.year)
print "\nYears:\n"
print list(date_range)
#Ocurrences of categories
print "\nNumber of unique categories: %d" %len(set(data['Category']))
#Distribution of categoriesn_occurrences[0:20]
n_occurrences = data['Category'].value_counts()
print "\nMost commonly occurring categories of crime:\n"
print n_occurrences.head()
print "\nMost commonly occurring category of crime is %s with %d" % (n_occurrences.argmax(), n_occurrences.max())
print "\nLeast commonly occurring category of crime is %s with %d" % (n_occurrences.argmin(), n_occurrences.min())
In [30]:
# plot distribution of categories (TOP 50)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
# keep only the 50 most frequent categories so the chart stays readable
n_occurrences_top = n_occurrences[0:50]

def barplot(series, title, figsize, ylabel, flag, rotation):
    """Draw a bar chart of `series` and return the matplotlib Axes.

    Parameters:
    series   -- pandas Series; values are bar heights, index supplies x labels
    title    -- chart title
    figsize  -- (width, height) tuple forwarded to pandas .plot
    ylabel   -- label for the y axis
    flag     -- whether the x axis (ticks and labels) is visible at all
    rotation -- rotation angle, in degrees, for the x tick labels
    """
    # bar chart, not a histogram: the series is already aggregated counts
    ax = series.plot(kind='bar',
                     title=title,
                     figsize=figsize,
                     fontsize=13)
    ax.set_ylabel(ylabel)
    # hide the whole x axis when flag is False
    ax.get_xaxis().set_visible(flag)
    # use the series index as x labels, rotated for readability
    ax.set_xticklabels(series.index, rotation=rotation)
    return ax

barplot(n_occurrences_top, 'Category occurrences', figsize=(14, 6), ylabel='category count', flag=True, rotation=90);
In [31]:
focus_categories = n_occurrences.index[0:8]
print focus_categories
In [32]:
def yearly_category_distribution(data, focus_categories, column='Year'):
    """Plot, per category, how report counts distribute over `column`.

    Parameters:
    data             -- war-diary DataFrame (needs 'Category' and `column`)
    focus_categories -- iterable of category names, one subplot each
    column           -- column whose value counts are plotted (default 'Year',
                        so existing callers are unaffected)
    """
    for index, category in enumerate(focus_categories, start=1):
        # restrict to reports of this category
        subset = data[data['Category'] == category]
        # count reports per value and plot them in ascending order
        counts = subset[column].value_counts().sort_index()
        plt.subplot(7, 2, index)
        barplot(counts, category, figsize=(20, 35), ylabel='category count', flag=True, rotation=0)

yearly_category_distribution(data, focus_categories)
In [33]:
def hourly_category_distribution(data, focus_categories):
    """Plot, per category, how report counts distribute over the hour of day."""
    for index, category in enumerate(focus_categories, start=1):
        # restrict to reports of this category
        subset = data[data['Category'] == category]
        # count reports per hour and plot them in clock order (0-23)
        hour_counts = subset['Hour'].value_counts().sort_index()
        plt.subplot(7, 2, index)
        barplot(hour_counts, category, figsize=(20, 35), ylabel='category count', flag=True, rotation=0)

hourly_category_distribution(data, focus_categories)
Casualties and wounded recorded per year. A bar plot with one bar for each of the categories:
(MACHINE LEARNING: KNN)
One Map of Afghanistan with incidents (color scheme DisplayColor: enemy activity - RED, friendly activity - BLUE, afghan/neutral activity - GREEN).
(MACHINE LEARNING: Cluster K-Means)
Map with 5 regions and choose one category and cluster all incidents during the war, see how many died at specific incident.
(MACHINE LEARNING: Decision Tree/Random Forest)
x-axis: morning=6-10, midday=11-14, afternoon=15-17, evening=18-23 y-axis: prediction score of each category in scatterplot, bubble size according to all incidents in that area
EXTRA:
Word clouds for categories or summary.
An example of all enemy attacks below.
In [34]:
import geoplotlib
from geoplotlib.utils import BoundingBox
def geo_plot(geodata):
    """
    Plot the given coordinates as a kernel-density heat map over map tiles.

    geodata -- dict with 'lat' and 'lon' lists of equal length (decimal degrees)
    """
    # Fit the bounding box to the extremes of the data.
    # geoplotlib's BoundingBox signature is (north, west, south, east):
    # west is the MINIMUM longitude and east the MAXIMUM. The original passed
    # max(lon) as west and min(lon) as east, mirroring the view horizontally.
    geoplotlib.set_bbox(
        BoundingBox(
            max(geodata['lat']),   # north
            min(geodata['lon']),   # west
            min(geodata['lat']),   # south
            max(geodata['lon'])    # east
        ));
    # kernel density estimation visualization
    geoplotlib.kde(geodata, bw=5, cut_below=1e-3, cmap='hot', alpha=170)
    # google tiles with lyrs=y ... hybrid (satellite + labels)
    geoplotlib.tiles_provider({
        'url': lambda zoom, xtile, ytile: 'https://mt1.google.com/vt/lyrs=y&hl=en&x=%d&y=%d&z=%d' % (xtile, ytile, zoom ),
        'tiles_dir': 'DTU-social_data',
        'attribution': 'DTU 02806 Social Data Analysis and Visualization'
    })
# only enemy activity ('RED'), restricted to coordinates plausibly inside Afghanistan
in_box = (data.Latitude < 38) & (data.Latitude > 30) & (data.Longitude > 55) & (data.Longitude < 75)
include = in_box & (data.DisplayColor == 'RED')
# build the lat/lon dict geoplotlib expects
enemy = data.loc[include]
geodata = {
    "lat": enemy.Latitude.tolist(),
    "lon": enemy.Longitude.tolist()
}
geo_plot(geodata)
geoplotlib.inline();
In [35]:
print len(data[data.ClassificationLevel.isin(['secret'])])
In [ ]: