In [170]:
import viz
import datetime

import numpy as np
import pandas as pd
import pycountry as pyc

from sklearn import cluster
from sklearn import metrics
from sklearn import linear_model
from tqdm import tqdm_notebook as tqdm
from imp import reload
from matplotlib import pyplot as plt
from viz import world_map_figure

import plotly
import plotly.plotly as py
from plotly.graph_objs import Choropleth, Bar, Scatter
from plotly.offline import init_notebook_mode, iplot
from IPython.display import display, HTML

In [2]:
# Enable inline rendering for the iplot() calls below; connected=True has plotly
# load plotly.js from its CDN instead of embedding the library in the notebook.
init_notebook_mode(connected=True)


MOST FREQUENT COUNTRIES


In [3]:
# Header-less two-column CSV; the columns are named 'Country' and 'Count' here.
most_common_countries = pd.read_csv('../data/viz/most_common_countries.csv', header=None, names=['Country', 'Count'])

In [4]:
# Number of leading rows kept for the bar chart.
show_top = 10

In [5]:
# Share of the total count; despite the column name this is a fraction in [0, 1], not a percent.
most_common_countries['Percentage'] = most_common_countries['Count'] / most_common_countries['Count'].sum()

In [433]:
# Bar chart of the first `show_top` rows: country on x, share of all counts on y.
most_common_countries_data_plot = [
    Bar(
        x=most_common_countries['Country'].head(show_top),
        y=most_common_countries['Percentage'].head(show_top),
    )
]

# Transparent backgrounds so the figure blends into the host page.
most_common_countries_fig = dict(
    data=most_common_countries_data_plot,
    layout=dict(
        title='Most Common Countries',
        paper_bgcolor='rgba(0, 0, 0, 0)',
        plot_bgcolor='rgba(0, 0, 0, 0)',
        xaxis=dict(title='Country'),
        yaxis=dict(title='Proportion of total countries'),
    ),
)

iplot(most_common_countries_fig)



In [524]:
#print(plotly.offline.plot(most_common_countries_fig, include_plotlyjs=False, output_type='div'))

MOST FREQUENT UNKNOWN WEBSITES


In [8]:
# Header-less two-column CSV; the columns are named 'URL' and 'Count' here.
most_unknown_websites = pd.read_csv('../data/viz/most_unknown_websites.csv', header=None, names=['URL', 'Count'])

In [9]:
# Number of leading rows kept for the bar chart (re-assigned; same value as in the countries section).
show_top = 10

In [10]:
# Share of the total count; despite the column name this is a fraction in [0, 1], not a percent.
most_unknown_websites['Percentage'] = most_unknown_websites['Count'] / most_unknown_websites['Count'].sum()

In [502]:
# Bar chart of the first `show_top` rows: website on x, share of all counts on y.
most_unknown_websites_data_plot = [
    Bar(
        x=most_unknown_websites['URL'].head(show_top),
        y=most_unknown_websites['Percentage'].head(show_top),
    )
]

# Transparent backgrounds so the figure blends into the host page.
most_unknown_websites_fig = dict(
    data=most_unknown_websites_data_plot,
    layout=dict(
        title='Most Frequent Unknown Websites',
        paper_bgcolor='rgba(0, 0, 0, 0)',
        plot_bgcolor='rgba(0, 0, 0, 0)',
        xaxis=dict(title='Website'),
        yaxis=dict(title='Proportion of total websites'),
    ),
)

iplot(most_unknown_websites_fig)



In [525]:
#print(plotly.offline.plot(most_unknown_websites_fig, include_plotlyjs=False, output_type='div'))

LOAD ALL DATA


In [13]:
def select_events(df, feature, selector):
    '''Keep the rows of `df` for which `selector(df[feature])` is truthy, evaluated row by row.

    Example of use : select_events(selected_df, 'EventCode', lambda x: x[:2] == '08')
    '''
    keep = df[feature].apply(selector)
    return df[keep]

In [14]:
# ISO 3166-1 alpha-3 codes known to pycountry: as a set for fast membership tests, and as a list.
all_cca_set = {country.alpha_3 for country in pyc.countries}
all_cca = [country.alpha_3 for country in pyc.countries]

In [15]:
start_date = datetime.datetime(2015, 3, 1)
end_date = datetime.datetime(2017, 12, 1)

# Number of calendar months in [start_date, end_date). Computed from year/month
# fields rather than from a day count, which under-counts short spans
# (e.g. Feb 1 -> Mar 1 is 28 days, and 28 * 12 // 365 == 0).
n_months = (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month)

# 'YYYY_MM' label for every month of the studied period, in chronological order.
dates = []
for i in range(n_months):
    index = start_date.month - 1 + i
    month = index % 12 + 1
    year = start_date.year + index // 12
    date = "{}_{:02d}".format(year, month)
    dates.append(date)

dates_set = set(dates)

In [16]:
# Full event table; the cells below read the columns Day, Source_CountryCode,
# Target_CountryCode, AvgTone, GoldsteinScale and NumMentions.
df = pd.read_csv('../data/final_data.csv', encoding='utf-8')

In [17]:
# Drop events whose target country is not a known ISO alpha-3 code.
df = select_events(df, 'Target_CountryCode', all_cca_set.__contains__)

In [18]:
# Drop events whose source country is not a known ISO alpha-3 code.
df = select_events(df, 'Source_CountryCode', all_cca_set.__contains__)

In [19]:
# 'YYYY_MM' key built from the first six characters of the stringified Day value.
day_as_text = df['Day'].astype(str)
df['Year_Month'] = day_as_text.str[:4] + '_' + day_as_text.str[4:6]

In [20]:
# Keep only events whose month falls inside the studied period.
df = select_events(df, 'Year_Month', dates_set.__contains__)

WORLD MAP


In [22]:
# `world_map_figure` was imported by name, so reloading the module alone would
# leave the old function bound; rebind it from the freshly reloaded module.
world_map_figure = reload(viz).world_map_figure
viz


Out[22]:
<module 'viz' from 'C:\\Users\\Greg\\Programming\\Python\\ada2017\\project\\src\\viz.py'>

In [275]:
# Ten-step diverging scale (red -> yellow -> blue) with evenly spaced stops.
colorscale_perso = [
    [0.0, 'rgb(165,0,38)'],
    [0.1111111111111111, 'rgb(215,48,39)'],
    [0.2222222222222222, 'rgb(244,109,67)'],
    [0.3333333333333333, 'rgb(253,174,97)'],
    [0.4444444444444444, 'rgb(254,224,144)'],
    [0.5555555555555556, 'rgb(224,243,248)'],
    [0.6666666666666666, 'rgb(171,217,233)'],
    [0.7777777777777778, 'rgb(116,173,209)'],
    [0.8888888888888888, 'rgb(69,117,180)'],
    [1.0, 'rgb(49,54,149)'],
]

# Two-stop scale from blue (low) to white (high). Hex colours carry the leading
# '#' so they are valid CSS colour strings.
colorscale_perso1 = [[0.0, '#0066CC'], [1, '#FFFFFF']]

# Blue-to-grey scale. Previously a trailing comma wrapped this list in a
# 1-tuple and the first colour carried embedded double quotes; both are fixed
# so it is a plain list of [stop, colour] pairs like the scales above.
default_colorscale = [
    [0, 'rgb(5, 10, 172)'],
    [0.35, 'rgb(40, 60, 190)'],
    [0.5, 'rgb(70, 100, 245)'],
    [0.6, 'rgb(90, 120, 245)'],
    [0.7, 'rgb(106, 137, 247)'],
    [1, 'rgb(220, 220, 220)'],
]

AVG TONE of Target_CountryCode per month


In [24]:
# Pivot: one row per target country, one column per month, each cell the MEDIAN AvgTone
# of the matching events (the aggregation is np.median, not a mean).

df_tone_target = pd.pivot_table(df, values='AvgTone', index=['Target_CountryCode'], columns=['Year_Month'], aggfunc=np.median)

In [277]:
# Colour range: zmin is the largest of the per-month minima and zmax the smallest of the
# per-month maxima, so values beyond this narrower range fall outside [zmin, zmax].
zmin_tone_target = df_tone_target.min().max()
zmax_tone_target = df_tone_target.max().min()

zmin_tone_target, zmax_tone_target


Out[277]:
(-3.8285918479571897, 1.1235955056179798)

In [290]:
# World map of the monthly median AvgTone per target country; only countries with a
# value for every month are passed in (rows containing NaN are dropped first).
figure_tone_target = world_map_figure(
    df=df_tone_target.dropna().reset_index(),
    locations_col='Target_CountryCode',
    frames_title=dates,
    title='Average Tone Evolution',
    title_colorscale='Median<br>Average Tone',
    txt_fn=lambda code: pyc.countries.get(alpha_3=code).name,
    colorscale=colorscale_perso1,
    zmin=zmin_tone_target,
    zmax=zmax_tone_target,
)

iplot(figure_tone_target, validate=False)



In [526]:
#print(plotly.offline.plot(figure_tone_target, include_plotlyjs=False, output_type='div'))

GOLDSTEIN of Target_CountryCode per month


In [283]:
# Goldstein score multiplied by the number of mentions ("pondered" = weighted).
# NOTE(review): the pivot in the next cell aggregates the raw 'GoldsteinScale' column, not
# this one; no use of this column is visible in this notebook section.
df['GoldsteinScalePondered'] = df['GoldsteinScale'] * df['NumMentions']

In [358]:
# Pivot: one row per target country, one column per month, each cell the MEAN of the raw
# GoldsteinScale of the matching events.

df_gs_target = pd.pivot_table(df, values='GoldsteinScale', index=['Target_CountryCode'], columns=['Year_Month'], aggfunc=np.mean)

In [359]:
# Colour range: the median over months of the per-month minimum / maximum (unlike the
# tone map, which uses the max of the minima and the min of the maxima).
zmin_gs_target = df_gs_target.min().median()
zmax_gs_target = df_gs_target.max().median()

zmin_gs_target, zmax_gs_target


Out[359]:
(-2.001569300557773, 2.7349056603773563)

In [361]:
# World map of the monthly Goldstein scale per target country; rows with a missing
# month are dropped before plotting.
figure_gs_target = world_map_figure(title='Goldstein Scale Evolution',
                                    # df_gs_target is the np.mean of the raw GoldsteinScale, so the
                                    # colour bar is labelled as a mean (it used to claim
                                    # "Median Pondered", which matched neither the statistic nor the column).
                                    title_colorscale='Mean<br>Goldstein Scale',
                                    frames_title=dates,
                                    df=df_gs_target.dropna().reset_index(),
                                    locations_col='Target_CountryCode',
                                    txt_fn=lambda code: pyc.countries.get(alpha_3=code).name,
                                    zmin=zmin_gs_target,
                                    zmax=zmax_gs_target,
                                    colorscale=colorscale_perso1)

iplot(figure_gs_target, validate=False)



In [527]:
#print(plotly.offline.plot(figure_gs_target, include_plotlyjs=False, output_type='div'))

In [35]:
# NOTE(review): `%telepyth` is not a built-in IPython magic; presumably its extension is
# loaded outside this view. Confirm, or drop this notification cell.
%telepyth 'REALLY DONE 2'


Out[35]:
'REALLY DONE 2'

choropleth with only the US events (or another country, RUSSIA, UKRAINE, FRANCE) and group by source country


In [292]:
def tone_focus_on(df, code):
    '''Mean AvgTone of the events targeting `code`, per source country and month.

    Returns a frame indexed by Source_CountryCode with one column per Year_Month;
    source countries lacking a value for any month are dropped.
    '''
    events_about_code = select_events(df, 'Target_CountryCode', lambda target: target == code)
    tone_by_source = pd.pivot_table(
        events_about_code,
        index=['Source_CountryCode'],
        columns=['Year_Month'],
        values='AvgTone',
        aggfunc=np.mean,
    )
    return tone_by_source.dropna()

In [293]:
# Mean AvgTone per source country (rows) and month (columns) for events targeting the USA.
df_tone_usa = tone_focus_on(df, 'USA')

In [297]:
# Colour range: largest per-month minimum and smallest per-month maximum.
zmin_tone_usa = df_tone_usa.min().max()
zmax_tone_usa = df_tone_usa.max().min()

zmin_tone_usa, zmax_tone_usa


Out[297]:
(-3.4250309954824449, 0.46150679794867328)

In [298]:
# `world_map_figure` was imported by name, so reloading the module alone would
# leave the old function bound; rebind it from the freshly reloaded module.
world_map_figure = reload(viz).world_map_figure
viz


Out[298]:
<module 'viz' from 'C:\\Users\\Greg\\Programming\\Python\\ada2017\\project\\src\\viz.py'>

In [299]:
# World map, coloured by source country, of the tone used when reporting events targeting the USA.
figure_tone_us = world_map_figure(title='AvgTone toward/against USA - Evolution',
                                    # tone_focus_on aggregates with np.mean, so the colour bar is
                                    # labelled as a mean (it used to say "Median").
                                    title_colorscale='Mean<br>AvgTone',
                                    frames_title=dates,
                                    df=df_tone_usa.reset_index(),
                                    locations_col='Source_CountryCode',
                                    txt_fn=lambda code: pyc.countries.get(alpha_3=code).name,
                                    zmin=zmin_tone_usa,
                                    zmax=zmax_tone_usa,
                                    colorscale=colorscale_perso1)

iplot(figure_tone_us, validate=False)



In [528]:
#print(plotly.offline.plot(figure_tone_us, include_plotlyjs=False, output_type='div'))

insert plot with 3-4 countries (CHE, FRA, and others) with focus on US or RUSSIA


In [337]:
def approx(y, degree=6):
    '''Smooth `y` with a Lasso-regularised polynomial in the sample position.

    The regressors are the powers 0 .. degree-1 of the positions 0 .. len(y)-1
    (so `degree` is the number of columns; the highest power is degree-1). The
    features are not rescaled and Lasso runs with its default hyper-parameters.

    Returns (fitted values at every position, the `degree` fitted coefficients).
    '''
    positions = np.arange(len(y)).reshape(-1, 1)
    design = np.concatenate([positions ** power for power in range(degree)], axis=1)

    lasso = linear_model.Lasso()
    lasso.fit(design, y)

    fitted = lasso.predict(design)
    return fitted, lasso.coef_

In [323]:
def colors(c):
    '''Line colour for country name `c`: fixed colours for France and Switzerland, black otherwise.'''
    for country_name, rgb in (('France', 'rgb(22, 96, 167)'),
                              ('Switzerland', 'rgb(205, 12, 24)')):
        if c == country_name:
            return rgb
    return 'rgb(0, 0, 0)'

In [324]:
def build_trace(x, y, name, polynomial_approx, mode='lines'):
    '''Build one Scatter trace for the series `y` over `x`.

    When `polynomial_approx` is truthy, `y` is replaced by its `approx` smoothing
    and ' (approx)' is appended to the legend name. The line colour always comes
    from `colors(name)`, so a raw trace and its approximation share a colour.
    '''
    label = name
    if polynomial_approx:
        y = approx(y)[0]
        label = name + ' (approx)'

    return Scatter(x=x, y=y, mode=mode, name=label, line=dict(color=colors(name)))

In [493]:
def traces_to_fig(title, traces, xaxis='', yaxis=''):
    '''Wrap `traces` in a figure dict with a title, axis titles and transparent backgrounds.'''
    transparent = 'rgba(0, 0, 0, 0)'
    layout = dict(
        title=title,
        paper_bgcolor=transparent,
        plot_bgcolor=transparent,
        xaxis=dict(title=xaxis),
        yaxis=dict(title=yaxis),
    )
    return dict(data=traces, layout=layout)

In [500]:
def trends_to_fig(df, countries, title, xaxis, yaxis, centered=False, polynomial_approx=False):
    '''Line chart of the rows of `df` selected by `countries`.

    df                -- frame indexed by alpha-3 country code, one column per 'YYYY_MM' month.
    countries         -- alpha-3 codes of the rows to draw.
    title/xaxis/yaxis -- figure and axis titles.
    centered          -- subtract each column's mean over all rows before plotting.
    polynomial_approx -- also draw, for every country, its `approx` smoothing.

    Returns the figure dict built by `traces_to_fig`.
    '''
    y = df.copy()

    if centered:
        y -= y.mean()

    # The x labels come from the frame's own columns, so they always line up with
    # the plotted rows (previously read from the global `dates`).
    x_labels = [str(month).replace('_', '/') for month in y.columns]

    # Raw trace first, then the smoothed one when requested.
    variants = [False, True] if polynomial_approx else [False]

    # Iterate over the `countries` argument (the global `some_countries` was used
    # before, so the parameter was silently ignored).
    traces = [build_trace(x_labels, y.loc[c], pyc.countries.get(alpha_3=c).name, poly)
              for c in countries
              for poly in variants]

    fig = traces_to_fig(title, traces, xaxis, yaxis)

    return fig

In [501]:
# Countries to plot (alpha-3 codes) and the figure labels.
some_countries = ['CHE', 'FRA', 'MEX']
title = 'Trends in the Average Tone used to relate the events happening in the USA(common trend removed) (with approximation)'
xaxis = 'Date'
yaxis = 'Average Tone'
# centered=True subtracts each month's mean over all source countries; polynomial_approx=True adds the smoothed curves.
fig_trends = trends_to_fig(df_tone_usa, some_countries, title, xaxis, yaxis, centered=True, polynomial_approx=True)
iplot(fig_trends)


C:\Users\Greg\Anaconda3\envs\py36\lib\site-packages\sklearn\linear_model\coordinate_descent.py:491: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.


In [529]:
#print(plotly.offline.plot(fig_trends, include_plotlyjs=False, output_type='div'))

In [80]:
def df_to_weights(df, countries):
    '''Polynomial-fit coefficients describing each selected country's de-trended series.

    df        -- frame with one row per country code and one column per time step.
    countries -- container of codes; only rows whose index label is in it are processed.

    For each selected row: standardise it (zero mean, unit std), subtract the
    standardised column-mean profile of the whole frame (the common trend), and fit
    `approx(..., degree=4)`. Returns {code: coefficient array}.

    `df` is left untouched: the rows are transformed out of place. The previous
    in-place `-=` / `/=` wrote through `df.values` into the caller's frame, so calling
    this twice on the same frame gave different results.
    '''
    dict_ = {}

    # Common trend: the per-column mean over all rows, standardised.
    bias = df.mean()
    bias = (bias - bias.mean()) / bias.std()
    bias_values = np.asarray(bias)

    for code, row in zip(df.index, df.values):
        if code in countries:
            y = row - row.mean()
            y = y / y.std()
            y = y - bias_values
            _, weights = approx(y, degree=4)
            dict_[code] = weights
    return dict_

In [400]:
def many_df_to_weights(df_list):
    '''Concatenate, per country, the `df_to_weights` coefficients obtained from each frame.

    Only countries present in the index of EVERY frame are kept. Returns
    (codes, weights): an array of country codes and an array with one row per code
    holding that country's coefficients from each frame, in `df_list` order.
    An empty `df_list` yields two empty arrays.
    '''
    df_list = list(df_list)  # iterated twice below; also accepts a generator

    # Countries common to all frames. A plain intersection: the former
    # "len(all_countries) == 0 means first iteration" test reset the running
    # intersection whenever it became empty part-way through the list.
    if df_list:
        all_countries = set.intersection(*(set(df.index) for df in df_list))
    else:
        all_countries = set()

    all_weights = {}
    for df in df_list:
        # df_to_weights only returns codes that are in all_countries.
        for code, w in df_to_weights(df, all_countries).items():
            if code in all_weights:
                all_weights[code] = np.append(all_weights[code], w)
            else:
                all_weights[code] = w

    codes = np.array(list(all_weights.keys()))
    weights = np.array(list(all_weights.values()))

    return codes, weights

In [401]:
def many_df_to_labels(df_list, n_clusters=2, random_state=None):
    '''Spectral-cluster the countries on their `many_df_to_weights` coefficients.

    df_list      -- frames forwarded to `many_df_to_weights`.
    n_clusters   -- number of clusters (default 2, the value that was hard-coded).
    random_state -- forwarded to SpectralClustering; pass an int for reproducible
                    labels. The default None keeps the previous unseeded behaviour.

    Returns (codes, labels), aligned arrays of country codes and cluster indices.
    The label values are arbitrary identifiers: the same index in two separate
    runs does not denote the same group.
    '''
    codes, weights = many_df_to_weights(df_list)

    clusters = cluster.SpectralClustering(n_clusters=n_clusters, random_state=random_state)
    labels = clusters.fit(weights).labels_

    return codes, labels

In [403]:
# One clustering per focus country, each stored as a single column named after that
# country; the inner join keeps only the source countries labelled in every run.
per_focus_frames = []
for c in ['USA', 'CHN', 'FRA']:
    codes, cluster_idx = many_df_to_labels([tone_focus_on(df, c)])
    clusters_df = pd.DataFrame(cluster_idx, index=codes, columns=[c])
    per_focus_frames.append(clusters_df)

all_clusters_df = pd.concat(per_focus_frames, axis=1, join='inner')

In [404]:
all_clusters_df.shape


Out[404]:
(110, 3)

In [422]:
# `world_map_figure` was imported by name, so reloading the module alone would
# leave the old function bound; rebind it from the freshly reloaded module.
world_map_figure = reload(viz).world_map_figure
viz


Out[422]:
<module 'viz' from 'C:\\Users\\Greg\\Programming\\Python\\ada2017\\project\\src\\viz.py'>

In [522]:
# Two-colour step scale: values up to 0.1 are red, everything above is blue, so the
# cluster labels 0 and 1 map to red and blue respectively.
cs = [
        [0, 'rgb(255, 0, 0)'],
        [0.1, 'rgb(255, 0, 0)'],

        [0.1, 'rgb(0, 0, 255)'],
        [1.0, 'rgb(0, 0, 255)']
    ]

# NOTE(review): all_clusters_df's columns are built in the order USA, CHN, FRA, while
# frames_title lists CHN, FRA, USA. If world_map_figure pairs frames_title with columns
# by position the frames are mislabelled; confirm against viz.world_map_figure.
# NOTE(review): 'colobar' looks like a typo for 'colorbar'; showscale=False is passed, so
# it is presumably never displayed.
map_clusters = world_map_figure(title='Spectral Clustering',
                                title_colorscale='colobar',
                                frames_title=['CHN', 'FRA', 'USA'],
                                df=all_clusters_df.reset_index(),
                                locations_col='index',
                                txt_fn=lambda code: pyc.countries.get(alpha_3=code).name,
                                zmin=0,
                                zmax=1, 
                                colorscale=cs,
                                showscale=False,
                               )

iplot(map_clusters, validate=False)



In [530]:
#print(plotly.offline.plot(map_clusters, include_plotlyjs=False, output_type='div'))

In [504]:
# Single clustering on the coefficients of all four focus countries concatenated per source
# country; only source countries present in all four frames are labelled.
codes, cluster_idx = many_df_to_labels([tone_focus_on(df, c) for c in ['USA', 'RUS', 'FRA', 'UKR']])
# One column, named after the four focus countries.
clusters_df_grouped = pd.DataFrame(cluster_idx, index=codes, columns=['USA RUS FRA UKR'])

In [519]:
# Map of the aggregated clustering: a single frame whose title equals the only column
# of clusters_df_grouped. Reuses the red/blue step scale `cs`.
map_clusters_grouped = world_map_figure(title='Spectral Clustering<br>(aggregated with USA RUS FRA UKR)',
                                title_colorscale='colobar',
                                frames_title=['USA RUS FRA UKR'],
                                df=clusters_df_grouped.reset_index(),
                                locations_col='index',
                                txt_fn=lambda code: pyc.countries.get(alpha_3=code).name,
                                zmin=0,
                                zmax=1, 
                                colorscale=cs,
                                showscale=False,
                               )

iplot(map_clusters_grouped, validate=False)



In [531]:
#print(plotly.offline.plot(map_clusters_grouped, include_plotlyjs=False, output_type='div'))

In [92]:
def get_silhouette(X, i):
    '''Euclidean silhouette score of a spectral clustering of `X` into `i` clusters.'''
    assigned = cluster.SpectralClustering(n_clusters=i).fit_predict(X)
    return metrics.silhouette_score(X, assigned, metric='euclidean')

In [186]:
# Countries that occur both as a source and as a target. Sorted so the order (and hence
# the order of everything zipped with this list later) is the same on every kernel
# restart; iteration order of a set of strings is not.
source_and_target_countries = sorted(set(df['Source_CountryCode'].values) & set(df['Target_CountryCode'].values))

In [206]:
# For every country, the coefficient matrix (one row per source country) describing how
# the other countries' tone toward it evolves. Kept in the order of source_and_target_countries.
all_weights = []
for code in tqdm(source_and_target_countries):
    df_tone = tone_focus_on(df, code)
    weights = many_df_to_weights([df_tone])[1]
    all_weights.append(weights)



Exception in thread Thread-14:
Traceback (most recent call last):
  File "C:\Users\Greg\Anaconda3\envs\py36\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "C:\Users\Greg\Anaconda3\envs\py36\lib\site-packages\tqdm\_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "C:\Users\Greg\Anaconda3\envs\py36\lib\_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



In [418]:
# Silhouette scores for 2..5 clusters, computed only for countries whose weight matrix
# has more than 100 rows; stored as (country code, scores) pairs.
all_silhouettes = []
for w, c in tqdm(list(zip(all_weights, source_and_target_countries))):
    if len(w) <= 100:
        continue
    sil = np.array([get_silhouette(w, i) for i in range(2, 6)])
    all_silhouettes.append((c, sil))




In [517]:
# Silhouette score as a function of the number of clusters (2..8), for tone toward AUS.
x = list(range(2, 9))
df_tone = tone_focus_on(df, 'AUS')
# The weights do not depend on the cluster count: compute them once, so every score is
# measured on the same data instead of re-deriving them from df_tone for each value of i.
weights_aus = many_df_to_weights([df_tone])[1]
y = np.array([get_silhouette(weights_aus, i) for i in x])
del df_tone

In [518]:
# Plot the silhouette score (y) against the number of clusters (x).
traces = [Scatter(x=x, y=y)]
fig = traces_to_fig('Silhouette', traces, 'Number of clusters', 'Silhouette score')

iplot(fig)



In [532]:
#print(plotly.offline.plot(fig, include_plotlyjs=False, output_type='div'))