In [170]:
import viz
import datetime
import numpy as np
import pandas as pd
import pycountry as pyc
from sklearn import cluster
from sklearn import metrics
from sklearn import linear_model
from tqdm import tqdm_notebook as tqdm
from imp import reload
from matplotlib import pyplot as plt
from viz import world_map_figure
import plotly
import plotly.plotly as py
from plotly.graph_objs import Choropleth, Bar, Scatter
from plotly.offline import init_notebook_mode, iplot
from IPython.display import display, HTML
In [2]:
# Enable plotly's offline notebook mode so the iplot() calls in later cells can render.
init_notebook_mode(connected=True)
In [3]:
# Load the country/count table; header=None means the CSV has no header row, so names are given here.
most_common_countries = pd.read_csv('../data/viz/most_common_countries.csv', header=None, names=['Country', 'Count'])
In [4]:
# Number of leading rows of the country table shown in the bar chart.
show_top = 10
In [5]:
# Share of each row in the total count. Despite the column name this is a fraction (not multiplied by 100).
most_common_countries['Percentage'] = most_common_countries['Count'] / most_common_countries['Count'].sum()
In [433]:
# Bar chart of the first `show_top` rows of the country table, drawn on a
# fully transparent background.
most_common_countries_data_plot = [
    Bar(
        x=most_common_countries['Country'].head(show_top),
        y=most_common_countries['Percentage'].head(show_top),
    )
]
most_common_countries_fig = dict(
    data=most_common_countries_data_plot,
    layout=dict(
        title='Most Common Countries',
        paper_bgcolor='rgba(0, 0, 0, 0)',
        plot_bgcolor='rgba(0, 0, 0, 0)',
        xaxis=dict(title='Country'),
        yaxis=dict(title='Proportion of total countries'),
    ),
)
iplot(most_common_countries_fig)
In [524]:
#print(plotly.offline.plot(most_common_countries_fig, include_plotlyjs=False, output_type='div'))
In [8]:
# Load the URL/count table; header=None means the CSV has no header row, so names are given here.
most_unknown_websites = pd.read_csv('../data/viz/most_unknown_websites.csv', header=None, names=['URL', 'Count'])
In [9]:
# Number of leading rows of the website table shown in the bar chart (re-assigns the same value as before).
show_top = 10
In [10]:
# Share of each row in the total count. Despite the column name this is a fraction (not multiplied by 100).
most_unknown_websites['Percentage'] = most_unknown_websites['Count'] / most_unknown_websites['Count'].sum()
In [502]:
# Bar chart of the first `show_top` rows of the website table, drawn on a
# fully transparent background.
most_unknown_websites_data_plot = [
    Bar(
        x=most_unknown_websites['URL'].head(show_top),
        y=most_unknown_websites['Percentage'].head(show_top),
    )
]
most_unknown_websites_fig = dict(
    data=most_unknown_websites_data_plot,
    layout=dict(
        title='Most Frequent Unknown Websites',
        paper_bgcolor='rgba(0, 0, 0, 0)',
        plot_bgcolor='rgba(0, 0, 0, 0)',
        xaxis=dict(title='Website'),
        yaxis=dict(title='Proportion of total websites'),
    ),
)
iplot(most_unknown_websites_fig)
In [525]:
#print(plotly.offline.plot(most_unknown_websites_fig, include_plotlyjs=False, output_type='div'))
In [13]:
def select_events(df, feature, selector):
    '''Return the rows of `df` for which `selector(df[feature])` is truthy, element-wise.

    `selector` is called once per value of the `feature` column and the
    resulting Series is used to index `df`.

    Example of use : select_events(selected_df, 'EventCode', lambda x: x[:2] == '08')
    '''
    keep_mask = df[feature].apply(selector)
    return df[keep_mask]
In [14]:
# alpha_3 codes of every country known to pycountry, as a list and as a set for fast membership tests.
all_cca = [c.alpha_3 for c in pyc.countries]
all_cca_set = set(all_cca)
In [15]:
# Study window: months from start_date (inclusive) up to end_date (exclusive).
start_date = datetime.datetime(2015, 3, 1)
end_date = datetime.datetime(2017, 12, 1)
# Count calendar months exactly. The previous `days * 12 // 365` estimate
# gave the right answer for these two dates but can drift for other windows.
n_months = (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month)
# 'YYYY_MM' labels, one per month of the window, in chronological order.
dates = [
    "{}_{:02d}".format(start_date.year + offset // 12, offset % 12 + 1)
    for offset in range(start_date.month - 1, start_date.month - 1 + n_months)
]
dates_set = set(dates)
In [16]:
# Load the event table that every later cell filters and aggregates.
df = pd.read_csv('../data/final_data.csv', encoding='utf-8')
In [17]:
# Keep only events whose target country code is one of pycountry's alpha_3 codes.
df = select_events(df, 'Target_CountryCode', lambda x: x in all_cca_set)
In [18]:
# Keep only events whose source country code is one of pycountry's alpha_3 codes.
df = select_events(df, 'Source_CountryCode', lambda x: x in all_cca_set)
In [19]:
# Build a 'YYYY_MM' key from the string form of Day: characters 0-3, an underscore, characters 4-5.
# Assumes Day is formatted as YYYYMMDD — TODO confirm against the data file.
# NOTE(review): df is the result of a boolean filter, so this assignment may trigger pandas'
# SettingWithCopyWarning — verify.
df['Year_Month'] = df['Day'].apply(str).apply(lambda x: x[:4] + '_' + x[4:6])
In [20]:
# Restrict the events to the months listed in dates_set.
df = select_events(df, 'Year_Month', lambda x: x in dates_set)
In [22]:
# Re-import the local viz module so that edits made to it are picked up without restarting the kernel.
# NOTE(review): world_map_figure was imported by name, so it presumably still refers to the old
# function object after this reload — confirm.
reload(viz)
Out[22]:
In [275]:
# Ten evenly spaced stops running from dark red (0.0) to dark blue (1.0).
colorscale_perso = [
    [0.0, 'rgb(165,0,38)'],
    [0.1111111111111111, 'rgb(215,48,39)'],
    [0.2222222222222222, 'rgb(244,109,67)'],
    [0.3333333333333333, 'rgb(253,174,97)'],
    [0.4444444444444444, 'rgb(254,224,144)'],
    [0.5555555555555556, 'rgb(224,243,248)'],
    [0.6666666666666666, 'rgb(171,217,233)'],
    [0.7777777777777778, 'rgb(116,173,209)'],
    [0.8888888888888888, 'rgb(69,117,180)'],
    [1.0, 'rgb(49,54,149)'],
]
# Two-stop blue -> white scale passed to the world-map figures.
# The hex colours now carry their leading '#', which is the standard CSS hex
# notation; the bare '0066CC' / 'FFFFFF' forms are not valid colour strings
# for plotly's Python-side validation.
colorscale_perso1 = [[0.0, '#0066CC'], [1, '#FFFFFF']]
# Fixed two defects: a trailing comma turned this into a 1-tuple wrapping the
# list, and the first colour contained literal double quotes inside the string.
default_colorscale = [
    [0, "rgb(5, 10, 172)"],
    [0.35, "rgb(40, 60, 190)"],
    [0.5, "rgb(70, 100, 245)"],
    [0.6, "rgb(90, 120, 245)"],
    [0.7, "rgb(106, 137, 247)"],
    [1, "rgb(220, 220, 220)"],
]
In [24]:
# Pivot: one row per target country, one column per month; each cell is the median AvgTone
# of the matching events (np.median, not a mean).
df_tone_target = pd.pivot_table(df, values='AvgTone', index=['Target_CountryCode'], columns=['Year_Month'], aggfunc=np.median)
In [277]:
# Colour-range bounds for the map: zmin is the largest of the per-month minima and
# zmax the smallest of the per-month maxima.
zmin_tone_target = df_tone_target.min().max()
zmax_tone_target = df_tone_target.max().min()
# Displayed as the cell output.
zmin_tone_target, zmax_tone_target
Out[277]:
In [290]:
# World-map figure of the median AvgTone per target country, with `dates` as frame titles.
# Only countries with a value in every month are kept (dropna), and txt_fn turns an
# alpha_3 code into the pycountry country name.
figure_tone_target = world_map_figure(title='Average Tone Evolution',
title_colorscale='Median<br>Average Tone',
frames_title=dates,
df=df_tone_target.dropna().reset_index(),
locations_col='Target_CountryCode',
txt_fn=lambda code: pyc.countries.get(alpha_3=code).name,
zmin=zmin_tone_target,
zmax=zmax_tone_target,
colorscale=colorscale_perso1)
# validate=False is forwarded to plotly's iplot.
iplot(figure_tone_target, validate=False)
In [526]:
#print(plotly.offline.plot(figure_tone_target, include_plotlyjs=False, output_type='div'))
In [283]:
# GoldsteinScale multiplied by the number of mentions ("pondered" presumably means weighted).
# NOTE(review): no later visible cell reads this column; the Goldstein pivot uses the raw
# 'GoldsteinScale' column instead — confirm which one was intended.
df['GoldsteinScalePondered'] = df['GoldsteinScale'] * df['NumMentions']
In [358]:
# Pivot: one row per target country, one column per month; each cell is the mean of the
# raw GoldsteinScale values of the matching events.
df_gs_target = pd.pivot_table(df, values='GoldsteinScale', index=['Target_CountryCode'], columns=['Year_Month'], aggfunc=np.mean)
In [359]:
# Colour-range bounds for the map: the median of the per-month minima and the median of
# the per-month maxima.
zmin_gs_target = df_gs_target.min().median()
zmax_gs_target = df_gs_target.max().median()
# Displayed as the cell output.
zmin_gs_target, zmax_gs_target
Out[359]:
In [361]:
# World-map figure of the Goldstein scale per target country, with `dates` as
# frame titles. Only countries with a value in every month are kept (dropna).
figure_gs_target = world_map_figure(
    title='Goldstein Scale Evolution',
    # df_gs_target holds the plain mean of the raw GoldsteinScale column, so
    # the colourbar is labelled as a mean (it previously read "Median Pondered").
    title_colorscale='Mean <br> Goldstein Scale',
    frames_title=dates,
    df=df_gs_target.dropna().reset_index(),
    locations_col='Target_CountryCode',
    # Hover text: pycountry name for the alpha_3 code.
    txt_fn=lambda code: pyc.countries.get(alpha_3=code).name,
    zmin=zmin_gs_target,
    zmax=zmax_gs_target,
    colorscale=colorscale_perso1,
)
iplot(figure_gs_target, validate=False)
In [527]:
#print(plotly.offline.plot(figure_gs_target, include_plotlyjs=False, output_type='div'))
In [35]:
# IPython line magic; presumably sends a "finished" notification through the telepyth extension.
# NOTE(review): no %load_ext for telepyth is visible in this notebook — confirm it is loaded
# before this cell runs on a fresh kernel.
%telepyth 'REALLY DONE 2'
Out[35]:
In [292]:
def tone_focus_on(df, code):
    """Mean AvgTone per source country and month for events targeting `code`.

    Parameters
    ----------
    df : DataFrame with at least the columns 'Target_CountryCode',
        'Source_CountryCode', 'Year_Month' and 'AvgTone'.
    code : value compared against 'Target_CountryCode'.

    Returns
    -------
    DataFrame indexed by 'Source_CountryCode' with one column per
    'Year_Month'. Source countries missing a value for any month are dropped.
    """
    # Vectorised comparison instead of a Python-level apply over every row.
    df_target = df[df['Target_CountryCode'] == code]
    # The string aggregator is equivalent to np.mean and avoids the pandas
    # deprecation warning raised when a NumPy callable is passed.
    tone_by_source = pd.pivot_table(
        df_target,
        values='AvgTone',
        index=['Source_CountryCode'],
        columns=['Year_Month'],
        aggfunc='mean',
    )
    return tone_by_source.dropna()
In [293]:
# Per-source-country, per-month mean AvgTone of the events whose target is the USA.
df_tone_usa = tone_focus_on(df, 'USA')
In [297]:
# Colour-range bounds for the map: zmin is the largest of the per-month minima and
# zmax the smallest of the per-month maxima.
zmin_tone_usa = df_tone_usa.min().max()
zmax_tone_usa = df_tone_usa.max().min()
# Displayed as the cell output.
zmin_tone_usa, zmax_tone_usa
Out[297]:
In [298]:
# Re-import the local viz module so that edits made to it are picked up without restarting the kernel.
reload(viz)
Out[298]:
In [299]:
# World-map figure of the tone used by each source country about the USA,
# with `dates` as frame titles.
figure_tone_us = world_map_figure(
    title='AvgTone toward/against USA - Evolution',
    # tone_focus_on aggregates with the mean, so the colourbar is labelled as
    # a mean (it previously read "Median").
    title_colorscale='Mean<br>AvgTone',
    frames_title=dates,
    df=df_tone_usa.reset_index(),
    locations_col='Source_CountryCode',
    # Hover text: pycountry name for the alpha_3 code.
    txt_fn=lambda code: pyc.countries.get(alpha_3=code).name,
    zmin=zmin_tone_usa,
    zmax=zmax_tone_usa,
    colorscale=colorscale_perso1,
)
iplot(figure_tone_us, validate=False)
In [528]:
#print(plotly.offline.plot(figure_tone_us, include_plotlyjs=False, output_type='div'))
In [337]:
def approx(y, degree=6):
    """Fit a Lasso-regularised polynomial in the sample position to `y`.

    The regressors are the powers 0 .. degree-1 of the positions
    0 .. len(y)-1, so `degree` is the number of polynomial terms and the
    highest power used is degree-1. The powers are used as-is (no centring
    or scaling) and the Lasso model keeps its default settings.

    Returns a tuple (fitted values at the same positions, fitted `coef_`).
    """
    positions = np.arange(len(y))[:, np.newaxis]
    design = np.hstack([positions ** power for power in range(degree)])
    smoothing_model = linear_model.Lasso()
    smoothing_model.fit(design, y)
    fitted = smoothing_model.predict(design)
    return fitted, smoothing_model.coef_
In [323]:
def colors(c):
    """Return the line colour for country name `c` as an 'rgb(...)' string.

    'France' and 'Switzerland' have dedicated colours; any other value is black.
    """
    dedicated = (
        ('France', 'rgb(22, 96, 167)'),
        ('Switzerland', 'rgb(205, 12, 24)'),
    )
    for country_name, rgb in dedicated:
        if c == country_name:
            return rgb
    return 'rgb(0, 0, 0)'
In [324]:
def build_trace(x, y, name, polynomial_approx, mode='lines'):
    """Build one Scatter trace for the series `y` plotted against `x`.

    When `polynomial_approx` is truthy, `y` is replaced by the fitted values
    returned by `approx(y)` and ' (approx)' is appended to the trace name.
    The line colour is `colors(name)`, i.e. it depends on the unsuffixed name.
    """
    suffix = ''
    if polynomial_approx:
        y, _ = approx(y)
        suffix = ' (approx)'
    line_style = {'color': colors(name)}
    return Scatter(x=x, y=y, mode=mode, name=name + suffix, line=line_style)
In [493]:
def traces_to_fig(title, traces, xaxis='', yaxis=''):
    """Wrap `traces` in a figure dict with a titled layout on a transparent background.

    `title` is the figure title; `xaxis` and `yaxis` are the axis titles.
    The returned dict has the keys 'data' (the `traces` object itself) and 'layout'.
    """
    transparent = 'rgba(0, 0, 0, 0)'
    layout = dict(
        title=title,
        paper_bgcolor=transparent,
        plot_bgcolor=transparent,
        xaxis=dict(title=xaxis),
        yaxis=dict(title=yaxis),
    )
    return dict(data=traces, layout=layout)
In [500]:
def trends_to_fig(df, countries, title, xaxis, yaxis, centered=False, polynomial_approx=False):
    """Build a line-chart figure with one trend per requested country.

    Parameters
    ----------
    df : DataFrame indexed by alpha_3 country code with one column per month
        (column labels such as '2015_03').
    countries : iterable of alpha_3 codes to plot; each must be in `df.index`.
    title, xaxis, yaxis : figure title and axis titles.
    centered : when true, subtract each column's mean (taken over all rows of
        `df`) before plotting.
    polynomial_approx : when true, add a second, smoothed trace per country.

    Returns
    -------
    The figure dict produced by `traces_to_fig`.
    """
    y = df.copy()
    if centered:
        y -= y.mean()
    # Take the x axis from the frame's own columns so labels always line up
    # with the plotted values, instead of relying on the global `dates` list.
    x_labels = [str(column).replace('_', '/') for column in y.columns]
    # Raw trace first, then the smoothed one when requested.
    variants = [False, True] if polynomial_approx else [False]
    # Iterate over the `countries` argument; the previous version ignored it
    # and read the global `some_countries` instead.
    traces = [
        build_trace(x_labels, y.loc[code], pyc.countries.get(alpha_3=code).name, smoothed)
        for code in countries
        for smoothed in variants
    ]
    return traces_to_fig(title, traces, xaxis, yaxis)
In [501]:
# alpha_3 codes of the source countries whose trend is plotted.
some_countries = ['CHE', 'FRA', 'MEX']
title = 'Trends in the Average Tone used to relate the events happening in the USA(common trend removed) (with approximation)'
xaxis = 'Date'
yaxis = 'Average Tone'
# centered=True subtracts the per-month mean; polynomial_approx=True requests the smoothed traces.
fig_trends = trends_to_fig(df_tone_usa, some_countries, title, xaxis, yaxis, centered=True, polynomial_approx=True)
iplot(fig_trends)
In [529]:
#print(plotly.offline.plot(fig_trends, include_plotlyjs=False, output_type='div'))
In [80]:
def df_to_weights(df, countries):
    """Return polynomial-fit coefficients describing each country's trend.

    Parameters
    ----------
    df : DataFrame with one row per country code and one numeric column per
        time step.
    countries : container of codes; rows whose index is not in it are skipped.

    Returns
    -------
    dict mapping each retained code to the `coef_` array returned by
    `approx(..., degree=4)` for that row, after the row has been standardised
    and the standardised column-mean profile ("bias") has been subtracted.

    Notes
    -----
    `df` is left untouched. The previous version applied in-place operations
    to rows of `df.values`, which silently rewrote the caller's data (so a
    second call on the same frame gave different results) and fails when the
    array is read-only.
    """
    weights_by_code = {}
    # Common profile: column means, standardised across the time steps.
    bias = df.mean()
    bias -= bias.mean()
    bias /= bias.std()
    bias_values = bias.values
    for code, row in zip(df.index, df.values):
        if code not in countries:
            continue
        # Work on a private float copy so the caller's frame is not modified.
        y = np.array(row, dtype=float)
        y -= y.mean()
        y /= y.std()
        y -= bias_values
        _, weights = approx(y, degree=4)
        weights_by_code[code] = weights
    return weights_by_code
In [400]:
def many_df_to_weights(df_list):
    """Concatenate per-frame trend coefficients for the countries common to all frames.

    Parameters
    ----------
    df_list : list of DataFrames indexed by country code, each accepted by
        `df_to_weights`.

    Returns
    -------
    (codes, weights) : `codes` is an array of the country codes present in
    every frame; `weights` is an array with one row per code, made of that
    code's coefficients from each frame laid end to end in `df_list` order.
    Both arrays are empty when `df_list` is empty or no code is shared.
    """
    # Intersect all indexes in one go. The previous loop used "the set is
    # empty" to detect the first frame, so an intersection that genuinely
    # became empty was reset to the next frame's index.
    index_sets = [set(frame.index) for frame in df_list]
    all_countries = set.intersection(*index_sets) if index_sets else set()

    pieces_by_code = {}
    for frame in df_list:
        new_weights = df_to_weights(frame, all_countries)
        for code, w in new_weights.items():
            if code in all_countries:
                pieces_by_code.setdefault(code, []).append(w)

    codes = np.array(list(pieces_by_code.keys()))
    # Join each code's per-frame coefficient vectors into one flat vector.
    weights = np.array([np.concatenate(pieces) for pieces in pieces_by_code.values()])
    return codes, weights
In [401]:
def many_df_to_labels(df_list, n_clusters=2, random_state=None):
    """Cluster countries by their trend coefficients with spectral clustering.

    Parameters
    ----------
    df_list : list of DataFrames forwarded to `many_df_to_weights`.
    n_clusters : number of clusters to form (default 2, the value that was
        previously hard-coded).
    random_state : forwarded to SpectralClustering; pass an int to make the
        labelling repeatable between runs. The default (None) keeps the
        previous unseeded behaviour.

    Returns
    -------
    (codes, labels) : the country codes and the cluster label assigned to each.
    """
    codes, weights = many_df_to_weights(df_list)
    clusters = cluster.SpectralClustering(n_clusters=n_clusters, random_state=random_state)
    labels = clusters.fit(weights).labels_
    return codes, labels
In [403]:
# One clustering per target country; each result becomes a single column named
# after the target, and the columns are inner-joined on the source-country code.
all_clusters_df = None
for c in ('USA', 'CHN', 'FRA'):
    codes, cluster_idx = many_df_to_labels([tone_focus_on(df, c)])
    clusters_df = pd.DataFrame(cluster_idx, index=codes, columns=[c])
    all_clusters_df = (
        clusters_df
        if all_clusters_df is None
        else pd.concat([all_clusters_df, clusters_df], axis=1, join='inner')
    )
In [404]:
# (number of countries present in every clustering, number of target columns)
all_clusters_df.shape
Out[404]:
In [422]:
# Re-import the local viz module so that edits made to it are picked up without restarting the kernel.
reload(viz)
Out[422]:
In [522]:
# Two-colour step scale: values in [0, 0.1] are red and values in [0.1, 1] are blue,
# so cluster label 0 is drawn red and cluster label 1 blue.
cs = [
[0, 'rgb(255, 0, 0)'],
[0.1, 'rgb(255, 0, 0)'],
[0.1, 'rgb(0, 0, 255)'],
[1.0, 'rgb(0, 0, 255)']
]
# NOTE(review): frames_title is ['CHN', 'FRA', 'USA'] while the columns of all_clusters_df
# are created in the order USA, CHN, FRA. Whether world_map_figure matches frames to
# columns by name or by position is not visible here — confirm the frames are labelled correctly.
# NOTE(review): 'colobar' looks like a typo for 'colorbar'; the scale is presumably hidden
# anyway because showscale=False.
map_clusters = world_map_figure(title='Spectral Clustering',
title_colorscale='colobar',
frames_title=['CHN', 'FRA', 'USA'],
df=all_clusters_df.reset_index(),
locations_col='index',
txt_fn=lambda code: pyc.countries.get(alpha_3=code).name,
zmin=0,
zmax=1,
colorscale=cs,
showscale=False,
)
iplot(map_clusters, validate=False)
In [530]:
#print(plotly.offline.plot(map_clusters, include_plotlyjs=False, output_type='div'))
In [504]:
# Single clustering on the coefficients of four targets at once (USA, RUS, FRA, UKR),
# restricted by many_df_to_weights to the source countries present for all four.
codes, cluster_idx = many_df_to_labels([tone_focus_on(df, c) for c in ['USA', 'RUS', 'FRA', 'UKR']])
clusters_df_grouped = pd.DataFrame(cluster_idx, index=codes, columns=['USA RUS FRA UKR'])
In [519]:
# World map of the grouped clustering: a single frame, coloured with the `cs` step scale.
# NOTE(review): 'colobar' looks like a typo for 'colorbar'; the scale is presumably hidden
# anyway because showscale=False.
map_clusters_grouped = world_map_figure(title='Spectral Clustering<br>(aggregated with USA RUS FRA UKR)',
title_colorscale='colobar',
frames_title=['USA RUS FRA UKR'],
df=clusters_df_grouped.reset_index(),
locations_col='index',
txt_fn=lambda code: pyc.countries.get(alpha_3=code).name,
zmin=0,
zmax=1,
colorscale=cs,
showscale=False,
)
iplot(map_clusters_grouped, validate=False)
In [531]:
#print(plotly.offline.plot(map_clusters_grouped, include_plotlyjs=False, output_type='div'))
In [92]:
def get_silhouette(X, i):
    """Silhouette score (euclidean) of a spectral clustering of `X` into `i` clusters.

    `X` is passed to SpectralClustering.fit as the sample matrix; the
    resulting labels are scored against the same `X`.
    """
    fitted = cluster.SpectralClustering(n_clusters=i).fit(X)
    return metrics.silhouette_score(X, fitted.labels_, metric='euclidean')
In [186]:
# Country codes that occur both as a source and as a target. Sorted so the
# list (and every loop over it) has the same order on every kernel run; a bare
# list(set(...)) of strings can change order between interpreter sessions.
source_and_target_countries = sorted(
    set(df['Source_CountryCode'].values) & set(df['Target_CountryCode'].values)
)
In [206]:
# For every country that is both a source and a target, compute the coefficient
# matrix of the tone directed at it (one row per source country). The entries
# of all_weights are in the same order as source_and_target_countries.
all_weights = [
    many_df_to_weights([tone_focus_on(df, code)])[1]
    for code in tqdm(source_and_target_countries)
]
In [418]:
# Silhouette scores for 2..5 clusters, computed only for targets whose weight
# matrix has more than 100 rows. Each entry is (country code, array of 4 scores).
all_silhouettes = [
    (c, np.array([get_silhouette(w, i) for i in range(2, 6)]))
    for w, c in tqdm(list(zip(all_weights, source_and_target_countries)))
    if len(w) > 100
]
In [517]:
# Silhouette score as a function of the number of clusters (2..8) for the
# tone directed at Australia.
x = list(range(2, 9))
df_tone = tone_focus_on(df, 'AUS')
# Build the feature matrix once and reuse it for every cluster count. The
# previous version rebuilt it inside the loop, which was wasteful and, if
# df_to_weights modifies its input in place, scored each cluster count on
# different features.
weights_aus = many_df_to_weights([df_tone])[1]
y = np.array([get_silhouette(weights_aus, i) for i in x])
del df_tone
In [518]:
# Plot the silhouette score (y) against the number of clusters (x).
traces = [Scatter(x=x, y=y)]
fig = traces_to_fig('Silhouette', traces, 'Number of clusters', 'Silhouette score')
iplot(fig)
In [532]:
#print(plotly.offline.plot(fig, include_plotlyjs=False, output_type='div'))