In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns
# Apply the 'fivethirtyeight' matplotlib style sheet to the figures drawn below.
plt.style.use('fivethirtyeight')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Request high-resolution ('retina') output from the inline backend.
%config InlineBackend.figure_format = 'retina'
In [7]:
# Open the Excel workbook (path is relative to the notebook's working
# directory) and keep the handle as xls_file so a sheet can be parsed from it.
xls_file = pd.ExcelFile('globalterrorismdb_0616dist.xlsx')
# Echo the handle to confirm the workbook opened.
xls_file
Out[7]:
In [5]:
# Load the workbook's 'Data' sheet as a dataframe; every later cell reads
# from dfwhole.
dfwhole = xls_file.parse('Data')
# Display the frame as the cell output.
dfwhole
Out[5]:
In [6]:
# Write the full dataframe out as a UTF-8 encoded CSV file.
dfwhole.to_csv(path_or_buf='gtd1970_2015.csv', encoding='utf8')
In [7]:
# Number of rows in the full dataset.
dfwhole.shape[0]
Out[7]:
In [8]:
# Non-null entries per column, to see which columns are sparsely populated.
dfwhole.count(axis=0)
Out[8]:
In [176]:
# Every column label of the dataset, as a plain Python list.
dfwhole.columns.tolist()
Out[176]:
In [163]:
# Rows per country, most frequent first.
# Idea: look at the MENA and Asia regions, since 4 of the top 5 countries
# listed are in those regions.
dfwhole.country_txt.value_counts()
Out[163]:
In [109]:
# Peek at the first two rows of the dataset.
dfwhole.iloc[:2]
Out[109]:
In [73]:
# Data for the prior: rows with attacktype1 == 3 in region 6 up to and
# including the year 2000 (bombings/explosions in South Asia, per the
# write-up at the end of this notebook), counted per (year, country) pair.
# The chain is wrapped in parentheses so it can span several lines; a line
# ending in a bare "." is a SyntaxError.
df_prior = (
    dfwhole[(dfwhole.attacktype1 == 3) & (dfwhole.region == 6) & (dfwhole.iyear <= 2000)]
    .groupby(['iyear', 'country_txt'])
    .attacktype1.count()
    .values  # NumPy array of counts -- not a DataFrame, despite the name
)
In [71]:
## Confirming it worked: display the array of per-(year, country) counts.
df_prior
Out[71]:
In [77]:
# Mean of the per-(year, country) counts; used as the centre of the prior.
df_prior_mean = df_prior.mean()
# print(...) with parentheses runs on both Python 2 and Python 3; the bare
# `print x` statement is a SyntaxError on Python 3.
print(df_prior_mean)
In [65]:
# Matching incidents per country for the prior window.
# df_prior is a bare NumPy array of counts (its definition ends in .values)
# and has no .groupby, so the same filter is applied to dfwhole again here.
(
    dfwhole[(dfwhole.attacktype1 == 3) & (dfwhole.region == 6) & (dfwhole.iyear <= 2000)]
    .groupby('country_txt')
    .attacktype1.count()
)
Out[65]:
In [117]:
# Non-null attacktype1 entries for every (year, country) pair in the full dataset.
dfwhole.groupby(['iyear', 'country_txt'])['attacktype1'].count()
Out[117]:
In [78]:
# Standard deviation of the per-(year, country) counts; used as the spread
# of the prior.
df_prior_std = df_prior.std()
# Function-call form of print so the cell also runs on Python 3.
print(df_prior_std)
In [92]:
# Names under which the model cells read the prior's centre and spread.
mean_prior_mean, mean_prior_std = df_prior_mean, df_prior_std
In [82]:
# Observed data for country code 4 (Afghanistan, per the write-up at the end
# of this notebook): attacktype1 == 3 incidents after 2000, counted per year.
# Parenthesised so the chain can span lines; a trailing "." alone is a
# SyntaxError.
afp = (
    dfwhole[(dfwhole.attacktype1 == 3) & (dfwhole.country == 4) & (dfwhole.iyear > 2000)]
    .groupby(['iyear'])
    .attacktype1.count()
    .values  # NumPy array with one count per year
)
In [84]:
# Display the per-year counts computed for country code 4.
afp
Out[84]:
In [86]:
# Mean of the per-year counts in afp.
afp_mean = afp.mean()
# Function-call form of print so the cell also runs on Python 3.
print(afp_mean)
In [87]:
# Standard deviation of the per-year counts in afp.
afp_std = afp.std()
# Function-call form of print so the cell also runs on Python 3.
print(afp_std)
In [88]:
# Observed data for country code 153 (Pakistan, per the write-up at the end
# of this notebook): attacktype1 == 3 incidents after 2000, counted per year.
# Parenthesised so the chain can span lines; a trailing "." alone is a
# SyntaxError.
pakp = (
    dfwhole[(dfwhole.attacktype1 == 3) & (dfwhole.country == 153) & (dfwhole.iyear > 2000)]
    .groupby(['iyear'])
    .attacktype1.count()
    .values  # NumPy array with one count per year
)
In [89]:
# Mean of the per-year counts in pakp.
pakp_mean = pakp.mean()
# Function-call form of print so the cell also runs on Python 3.
print(pakp_mean)
In [90]:
## Setting up std for Pakistan: standard deviation of the per-year counts.
pakp_std = pakp.std()
# Function-call form of print so the cell also runs on Python 3.
print(pakp_std)
In [95]:
# Create the model and give both groups the same Normal prior on their mean,
# centred on mean_prior_mean with spread mean_prior_std.
# The statements must be indented under `with` so they register on `model`;
# mu is passed by keyword to make the prior's centre explicit.
with pm.Model() as model:
    groupPk_mean = pm.Normal('Bombings_Pak_mean', mu=mean_prior_mean, sd=mean_prior_std)
    groupAfg_mean = pm.Normal('Bombings_Afg_mean', mu=mean_prior_mean, sd=mean_prior_std)
In [96]:
# Bounds of the Uniform prior placed on each group's standard deviation.
# NOTE(review): if the yearly counts in pakp / afp vary by more than 100
# (compare pakp_std and afp_std), this upper bound truncates the posterior --
# confirm the bound against the data.
std_prior_lower = 0.01
std_prior_upper = 100.0
# Indented under `with model:` so both variables are added to the existing model.
with model:
    groupPak_std = pm.Uniform('Bombings_Pak_std', lower=std_prior_lower, upper=std_prior_upper)
    groupAfg_std = pm.Uniform('Bombings_Afg_std', lower=std_prior_lower, upper=std_prior_upper)
In [97]:
# Likelihoods: each group's observed per-year counts are modelled as Normal
# around that group's mean with that group's standard deviation.
# Indented under `with model:` so they attach to the existing model.
with model:
    groupPak = pm.Normal('Bombings_Pak', mu=groupPk_mean, sd=groupPak_std, observed=pakp)
    groupAfg = pm.Normal('Bombings_Afg', mu=groupAfg_mean, sd=groupAfg_std, observed=afp)
In [98]:
# Derived quantities tracked in the trace (Pakistan minus Afghanistan):
# difference of the means, difference of the stds, and the effect size,
# i.e. the mean difference divided by the root of the average of the two
# variances.
# Indented under `with model:` so they attach to the existing model.
with model:
    diff_of_means = pm.Deterministic('difference of means', groupPk_mean - groupAfg_mean)
    diff_of_stds = pm.Deterministic('difference of stds', groupPak_std - groupAfg_std)
    effect_size = pm.Deterministic(
        'effect size',
        diff_of_means / np.sqrt((groupPak_std**2 + groupAfg_std**2) / 2))
In [99]:
# Draw 25000 samples with 4 parallel jobs.
# Indented under `with model:` so the sampler picks up the model from context.
# TODO(review): no random seed is set, so results differ from run to run;
# consider passing a seed once the installed PyMC3 version's handling of
# random_seed with njobs > 1 is confirmed.
with model:
    trace = pm.sample(25000, njobs=4)
In [100]:
# Posterior plots of each group's mean and std, skipping the first 3000
# samples of the trace (presumably as burn-in).
pm.plot_posterior(
    trace[3000:],
    varnames=[
        'Bombings_Pak_mean',
        'Bombings_Afg_mean',
        'Bombings_Pak_std',
        'Bombings_Afg_std',
    ],
    color='#87ceeb'
)
Out[100]:
In [101]:
# Posterior plots of the derived comparisons against a reference value of 0,
# skipping the first 3000 samples of the trace (presumably as burn-in).
pm.plot_posterior(
    trace[3000:],
    varnames=[
        'difference of means',
        'difference of stds',
        'effect size',
    ],
    ref_val=0,
    color='#87ceeb'
)
Out[101]:
In [102]:
# Summary of the derived comparisons, skipping the first 3000 samples of the
# trace (presumably as burn-in).
pm.summary(
    trace[3000:],
    varnames=[
        'difference of means',
        'difference of stds',
        'effect size',
    ]
)
In [53]:
# All rows of the dataset recorded for Afghanistan.
afgtd = dfwhole.loc[dfwhole['country_txt'] == 'Afghanistan']
In [59]:
# Histogram of the Afghanistan rows by year.
plt.hist(afgtd.iyear)
Out[59]:
In [55]:
# All rows of the dataset recorded for Pakistan.
pkgtd = dfwhole.loc[dfwhole['country_txt'] == 'Pakistan']
In [56]:
# Peek at the last two Pakistan rows.
pkgtd.iloc[-2:]
Out[56]:
In [66]:
# The five most frequent attack types among the Pakistan rows.
pkgtd.attacktype1_txt.value_counts().iloc[:5]
Out[66]:
In [58]:
# Histogram of the Pakistan rows by year.
plt.hist(pkgtd.iyear)
Out[58]:
Methodology and Analysis Global Terrorism Database
Methodology:
My Bayesian approach was to build a prior from all bombings/explosions in South Asia from 1970 through 2000, using Pakistan and Afghanistan as my two populations. For the two countries, I used bombings/explosions from 2001 through 2015. Through EDA, I saw that the number of these attacks was similar for both countries, spiking in the last years of the dataset (2010-2015). These attacks placed Pakistan second (with 12,768) in overall bombings/explosions and Afghanistan fourth (with 9,690). Three of the top 5 countries are classified as South Asia in the dataset. Even though India is third (with 9,940), I chose Afghanistan and Pakistan because of the connection between them, to estimate whether or not there is a difference between the two populations. Given the model results, I would seek an alternative analysis. The means of both populations were high and close together, which did not show how different they are. The summaries of the posterior distributions of the parameters were not statistically significant: the difference between Pakistan and Afghanistan, given the prior I created, was insignificant. If run again, India would be considered as an option alongside Pakistan. Another option is to use Iraq and Afghanistan as the two populations to see if the Bayesian approach is more successful.
Predicting the 1993 bombings/explosions:
I chose the 3 years before 1993 (1990-1992) and the 3 years after (1994-1996) and calculated the mean number of bombings/explosions over those six years. It was 1,436. I also took the number of bombings/explosions over all years (75,963) and divided it by the number of years (44) in the dataset (excluding 1993). This result was 1,726. I would say that an average of these two figures may be overfitting to this dataset. Three years before and after provides enough data to produce a sound mean to use for 1993. The overall mean of 1,726 gives a safe range for the data. The best estimate for the number of bombings/explosions for 1993 is 1,436.