In [2]:
%matplotlib qt4
from __future__ import division

from collections import defaultdict

from models import tools, optimize, models, filters
from models.tests import PerformanceTest

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# Seaborn "ticks" theme, with a frame drawn around legends.
sns.set_style("ticks", {"legend.frameon": True})

# Default colour sequence for all subsequent plots.
# 'axes.color_cycle' was deprecated in Matplotlib 1.5 and removed in 2.0 in
# favour of 'axes.prop_cycle', so set whichever key this installation knows.
_plot_colors = ['#02A5F4', 'orange', 'green']
if 'axes.prop_cycle' in mpl.rcParams:
    mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=_plot_colors)
else:
    mpl.rcParams['axes.color_cycle'] = _plot_colors

In [5]:
# Smaller sample of answers, used for most place groups below.
# Presumably `offset` skips the first rows and `limit` caps how many are
# read -- confirm against tools.load_data.
data1 = tools.load_data(
    offset=1000000,
    limit=500000,
)


Loaded 469545 answers.

In [14]:
# Larger sample of answers, starting at the same offset as data1 but with a
# four times higher limit; used below for the Lakes, Islands and
# Czech Regions groups.
data2 = tools.load_data(
    offset=1000000,
    limit=2000000,
)


Loaded 1958367 answers.

In [6]:
# Maps a human-readable place-group label to the subset of answers that
# belongs to that group; filled in by the cells that follow.
data_by_place = dict()

In [10]:
# Register one subset of data1 per geographic group. Each entry pairs the
# label used as the dictionary key with the filter from `filters` that
# produces the selector applied to data1.
data_by_place.update(
    (label, data1[select(data1)])
    for label, select in [
        ('European Countries', filters.european_countries),
        ('African Countries', filters.african_countries),
        ('American Countries', filters.american_countries),
        ('Asian Countries', filters.asian_countries),
        ('USA States', filters.usa_states),
    ]
)


Asian Countries 57890
USA States 20505
African Countries 65073
American Countries 33507
European Countries 93497

In [23]:
# Register one subset per place type: (label, source frame, place-type name).
# NOTE(review): Rivers and Mountains are drawn from data1 while Lakes,
# Islands and Czech Regions come from the larger data2 -- presumably to get
# enough answers for the sparser types; confirm this mix is intentional.
data_by_place.update(
    (label, answers[filters.place_type(answers, kind)])
    for label, answers, kind in [
        ('Rivers', data1, 'river'),
        ('Lakes', data2, 'lake'),
        ('Mountains', data1, 'mountains'),
        ('Islands', data2, 'island'),
        ('Czech Regions', data2, 'region (cz)'),
    ]
)

In [135]:
# Report how many answers each place group contains.
for name, df in data_by_place.items():
    # A single pre-formatted argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('{} {}'.format(name, len(df)))


Mountains 34699
Lakes 5906
African Countries 65073
Asian Countries 57890
Czech Regions 98061
USA States 20505
European Countries 93497
Islands 48311
Rivers 24466
American Countries 33507

In [136]:
# Report how many distinct place_id values occur in each place group.
for name, df in data_by_place.items():
    # A single pre-formatted argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('{} {}'.format(name, len(set(df['place_id']))))


Mountains 71
Lakes 21
African Countries 50
Asian Countries 45
Czech Regions 22
USA States 48
European Countries 38
Islands 56
Rivers 57
American Countries 30

In [139]:
# Run the staircase gradient-descent search independently on every place
# group and keep each search result under the group's label.
# NOTE(review): random_factor / random_chance suggest the search is
# randomised, so reruns may not reproduce the recorded values -- confirm
# whether a seed should be fixed.
by_place_results = {}
for name, df in data_by_place.items():
    search = optimize.GradientDescent(df).search_staircase(
        init_learn_rate=0.015,
        number_of_iter=5,
        echo_iterations=False,
        random_factor=0.1,
        random_chance=500,
    )
    by_place_results[name] = search

    # `best` holds the selected parameters; delta is reported with its sign
    # flipped, matching how it is plotted later.
    optimum = search.best
    summary = 'g={}, d={}'.format(optimum['gamma'], -optimum['delta'])
    tools.echo(''.join([name, ' done! ', summary]), clear=False)


Mountains done! g=2.64569643715, d=0.014128079355
Lakes done! g=2.94496471974, d=-0.0630996853292
African Countries done! g=2.19307673711, d=-0.601926571971
Asian Countries done! g=2.21835300051, d=-0.480974698753
Czech Regions done! g=2.09855824467, d=-0.341026311846
USA States done! g=2.78393012742, d=-0.522292403817
European Countries done! g=2.03677208368, d=-0.227876327308
Islands done! g=2.72584015372, d=-0.862503779152
Rivers done! g=2.9697459919, d=-0.760195475488
American Countries done! g=2.92048352401, d=-0.366094748887

In [147]:
# Scatter plot of the best (delta, gamma) pair found for each place group.
# Marker area scales with the number of distinct places in the group, and
# each marker is annotated with the group label and that place count.
fig = plt.figure(num=None, figsize=(8, 6), dpi=120)

labels = []
X, Y, S = [], [], []

for name, df in data_by_place.items():
    best = by_place_results[name].best
    labels.append(name)
    S.append(len(set(df['place_id'])))   # distinct places in the group
    Y.append(best['gamma'])
    X.append(-best['delta'])             # delta is plotted with flipped sign

plt.scatter(X, Y, s=[s * 15 for s in S], c='#34b7f6', marker='o')
plt.xlim([-1.05, 0.1])
plt.xlabel(r'$\delta$', fontsize=15)
plt.ylabel(r'$\gamma$', fontsize=15)

for label, x, y, size in zip(labels, X, Y, S):
    point = (x, y)
    # Group label in a translucent box, offset up and to the left of the marker.
    plt.annotate(
        label, xy=point, xytext=(-18, 18),
        textcoords='offset points', ha='right',
        bbox=dict(boxstyle='round,pad=0.4', fc='white', alpha=0.5, linewidth=1))
    # Place count centred on the marker itself.
    plt.annotate(str(size), xy=point, ha='center', va='center')

# Adjust the layout before showing: with a blocking backend, anything
# executed after show() is too late to affect the displayed figure.
fig.tight_layout()
plt.show()

In [ ]: