In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from IPython.display import display
from dateutil import parser
plt.style.use('ggplot')
%matplotlib inline
In [2]:
%%time
# Load the monthly asylum-seeker table.
seekers = pd.read_csv('asylum_seekers_monthly.csv')

# Fill missing values with a consistent value, and replace cells that are
# exactly '*' with '' as well, so both are coerced to NaN by to_numeric below.
seekers = seekers.fillna(value="").replace(['*'], [''])

# Convert 'Year' and 'Value' to numeric; anything unparseable becomes NaN.
seekers['Year'] = pd.to_numeric(seekers['Year'], errors='coerce')
seekers['Value'] = pd.to_numeric(seekers['Value'], errors='coerce')


def parse_date(row):
    """Build a datetime for day 01 of the row's 'Year' and 'Month' fields.

    The two fields are formatted into a '<Year> <Month> 01' string and handed
    to dateutil's parser.
    """
    yyyymm = ('%s %s 01' % (row['Year'], row['Month']))
    return parser.parse(yyyymm)


seekers['YYYYMM'] = seekers.apply(parse_date, axis=1)

# index=False: the frame still has its default positional index, and this file
# is read back in again afterwards. Writing the index would add a spurious
# 'Unnamed: 0' column on every run of this cell.
# NOTE(review): this overwrites the input file in place; consider writing the
# cleaned table to a separate path so the raw data stays recoverable.
seekers.to_csv('asylum_seekers_monthly.csv', index=False)
In [3]:
# Re-load the saved table. CSV stores YYYYMM as plain text, so ask pandas to
# parse it back into datetimes; otherwise the monthly plots further down treat
# the dates as string categories instead of a real time axis.
seekers = pd.read_csv('asylum_seekers_monthly.csv', parse_dates=['YYYYMM'])
seekers.head()
Out[3]:
In [4]:
# Monthly asylum-seeker totals for Iraq and Syria over the whole data set.
monthly = (
    seekers[seekers['Origin'].isin(['Syrian Arab Rep.', 'Iraq'])]
    # YYYYMM may still be text if it was read from CSV without date parsing;
    # to_datetime leaves real datetimes unchanged. A datetime index lets both
    # lines share one time axis instead of being drawn by row position.
    .assign(YYYYMM=lambda frame: pd.to_datetime(frame['YYYYMM']))
    # Non-inplace set_index: mutating a filtered slice in place triggers
    # SettingWithCopyWarning.
    .set_index('YYYYMM')
)
total = monthly.groupby(['Origin', 'YYYYMM'])[['Value']].sum()

# One frame per origin, indexed by month, with a single 'value' column.
df_iraq = total.xs('Iraq', level='Origin').rename(columns={'Value': 'value'})
df_syria = total.xs('Syrian Arab Rep.', level='Origin').rename(columns={'Value': 'value'})

fig, ax = plt.subplots(figsize=(12, 8))
df_iraq.plot(ax=ax)
df_syria.plot(ax=ax)
ax.set_title("Asylum-Seeker Monthly Counts by Country (2000-2016)")
ax.set_xlabel('Month')
ax.set_ylabel('Number of Refugee')
# Each frame contributes one line, in plotting order.
ax.legend(['Iraq', 'Syria'])
Out[4]:
In [5]:
# Yearly asylum-seeker totals for Iraq and Syria.
# Group the *filtered* rows: grouping the full `seekers` table would throw the
# origin filter away and aggregate every origin for nothing.
filter_df = (
    seekers[seekers['Origin'].isin(['Iraq', 'Syrian Arab Rep.'])]
    .groupby(['Origin', 'Year'])[['Value']]
    .sum()
)
iraq = filter_df.query("Origin == 'Iraq'")
syria = filter_df.query("Origin == 'Syrian Arab Rep.'")

fig, ax = plt.subplots(figsize=(12, 8))
# Take the x values from each series' own 'Year' index level so x and y always
# have the same length and order. seekers.Year.unique() is in order of
# appearance, may contain NaN, and spans all origins, so it is not guaranteed
# to line up with the per-origin totals.
ax.plot(iraq.index.get_level_values('Year'), iraq['Value'], label='Iraq')
ax.plot(syria.index.get_level_values('Year'), syria['Value'], label='Syria')
ax.set_title("Asylum-Seeker Counts by Country (2000-2016)")
ax.set_xlabel('Year')
ax.set_ylabel('Number of Refugee')
# One tick per year that actually appears in the plotted data.
ax.set_xticks(filter_df.index.get_level_values('Year').unique())
ax.legend(frameon=False)
Out[5]:
In [6]:
# Monthly asylum-seeker totals for Iraq and Syria, plotted for months after
# 2012-01-01. df_iraq / df_syria keep the full history: one 'value' column,
# indexed by 'YYYYMM'.
monthly = (
    seekers[seekers['Origin'].isin(['Syrian Arab Rep.', 'Iraq'])]
    # YYYYMM may still be text if it was read from CSV without date parsing;
    # to_datetime leaves real datetimes unchanged. A datetime index lets both
    # lines share one time axis instead of being drawn by row position.
    .assign(YYYYMM=lambda frame: pd.to_datetime(frame['YYYYMM']))
    # Non-inplace set_index: mutating a filtered slice in place triggers
    # SettingWithCopyWarning.
    .set_index('YYYYMM')
)
total = monthly.groupby(['Origin', 'YYYYMM'])[['Value']].sum()

df_iraq = total.xs('Iraq', level='Origin').rename(columns={'Value': 'value'})
df_syria = total.xs('Syrian Arab Rep.', level='Origin').rename(columns={'Value': 'value'})

recent_iraq = df_iraq.loc[df_iraq.index > '2012-01-01']
recent_syria = df_syria.loc[df_syria.index > '2012-01-01']

fig, ax = plt.subplots(figsize=(12, 8))
recent_iraq.plot(ax=ax)
recent_syria.plot(ax=ax)
# The plotted data starts after 2012-01-01, so the title says so.
ax.set_title("Asylum-Seeker Monthly Counts by Country (since 2012)")
ax.set_xlabel('Month')
ax.set_ylabel('Number of Refugee')
# Each frame contributes one line, in plotting order.
ax.legend(['Iraq', 'Syria'])

recent_iraq.head()
Out[6]:
In [7]:
######################################################################
# Load the labelled article windows and derive a numeric target plus a
# flattened keyword string per row.
data_train = pd.read_csv('result_all_windows_labels.csv')

# Cleanup - drop rows with no label and the three labels that are not modelled.
# .copy() makes the result an independent frame, so the column assignments
# below do not write into a slice of the frame returned by read_csv
# (SettingWithCopyWarning).
excluded_labels = ['environmental', 'religious', 'economical']
data_train = data_train[
    data_train['label'].notnull() & ~data_train['label'].isin(excluded_labels)
].copy()

# Integer code for each label that is kept.
label_cat = {'violence/terrorism': 1, 'misc': 2, 'political': 3}
print(label_cat)


def to_category(x):
    """Return the integer code for label `x`.

    Raises KeyError if `x` is not a key of `label_cat`.
    """
    return label_cat[x]


# Map the label column directly; no need to build a full row object per record.
data_train['target'] = data_train['label'].map(to_category)
data_train['target'].plot.hist(alpha=0.5)

# Get corpus by joining all keywords: columns 2..31 (by position) of each row
# are joined with single spaces.
# NOTE(review): str.join raises TypeError on non-string cells (e.g. NaN) -
# confirm these columns are always fully populated.
keyword_columns = data_train.iloc[:, 2:32]
texts = [u' '.join(row) for row in keyword_columns.itertuples(index=False, name=None)]

data_train['topicFlat'] = texts
labels = data_train['target']
data_train['topicFlat'].head()
Out[7]:
In [8]:
# --- Figure 1: asylum-seeker counts restricted to 2012-01 .. 2017-06 ---------
df_syria = df_syria.loc[(df_syria.index > '2012-01-01') & (df_syria.index < '2017-06-01')]
df_iraq = df_iraq.loc[(df_iraq.index > '2012-01-01') & (df_iraq.index < '2017-06-01')]

fig, ax = plt.subplots(figsize=(12, 8))
df_iraq.plot(ax=ax)
df_syria.plot(ax=ax)
# Title and legend describe what is actually drawn: two lines (Iraq, Syria)
# over the filtered 2012-2017 window.
ax.set_title("Asylum-Seeker Monthly Counts by Country (2012-2017)")
ax.set_xlabel('Month')
ax.set_ylabel('Number of Refugee')
ax.legend(['Iraq', 'Syria'])


# --- Monthly counts of articles labelled 'violence/terrorism' ----------------
def parse_window_date(row):
    """Build a datetime for day 01 of the year/month encoded in the row's first column.

    The first column (by position) is split on '_'; the first two pieces are
    used as year and month. Named differently from the `parse_date` helper
    defined for the asylum-seeker table so it does not shadow it.
    """
    # .iloc[0]: explicit positional access. row[0] relies on the deprecated
    # integer-key fallback of a label-indexed Series.
    parts = row.iloc[0].split('_')
    return parser.parse('%s %s 01' % (parts[0], parts[1]))


# assign() returns a new frame instead of writing a column into a slice.
data_train = data_train.assign(YYYYMM=data_train.apply(parse_window_date, axis=1))
data_train = data_train[data_train.label == 'violence/terrorism']

df = data_train.groupby(['YYYYMM']).size()
df = df[df.index < '2017-06-01']
# NOTE(review): counts are multiplied by 1000 while the axis label below still
# reads 'Number of Articles' - presumably to bring them to the scale of the
# asylum-seeker counts; confirm the intent.
df = df * 1000

# --- Figure 2: article counts -------------------------------------------------
fig, ax = plt.subplots(figsize=(12, 8))
df.plot(ax=ax)
ax.set_title("Monthly Counts of articles related to violence (2012-2017)")
ax.set_xlabel('Month')
ax.set_ylabel('Number of Articles')
ax.legend(['Violence'])
Out[8]:
In [9]:
def _with_datetime_index(series):
    """Return a copy of `series` whose index has been passed through pd.to_datetime."""
    converted = series.copy()
    converted.index = pd.to_datetime(converted.index)
    return converted


# Align the three monthly series on their date index and keep only months
# present in all of them. Attaching the article counts as a bare array
# (df.values) pairs rows by position: that raises when the lengths differ and
# silently mismatches months when any month is missing from one series.
corre_df = pd.concat(
    {
        'syria': _with_datetime_index(df_syria['value']),
        'iraq': _with_datetime_index(df_iraq['value']),
        'violence': _with_datetime_index(df),
    },
    axis=1,
    join='inner',
)

# Pairwise correlation between the three monthly series, shown as a heatmap.
fig = plt.figure(figsize=(12, 8))
corr = corre_df.corr()
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)
corre_df.head()
Out[9]:
In [ ]: