In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

Read data


In [8]:
# The source file is ';'-separated and writes decimals with a comma, which is
# why both `sep` and `decimal` are passed explicitly.
filename = "../datasets/dades_oficials_ajBCN/ajuntament_bcn_formatted_data.csv"
df = pd.read_csv(
    filename,
    sep=';',
    decimal=',',
    encoding='utf-8',
)

Compute the evolution of the population for each neighbourhood


In [10]:
%matplotlib inline
# Python 2-only workaround: `reload` is a builtin there, and reloading `sys`
# brings back `setdefaultencoding`, which is normally removed at interpreter
# start-up. The call switches the implicit str<->unicode codec to UTF-8 for the
# whole kernel -- presumably so the accented names in the dataset can be
# formatted and written without a UnicodeEncodeError (TODO confirm).
# NOTE(review): this cannot run on Python 3 (no builtin `reload`, no
# `sys.setdefaultencoding`).
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
# Export the population series (2010 onwards) of every neighbourhood ("Barri")
# and save one PNG chart per neighbourhood.
#
# Layout of clustering_neighbourhoods.csv (';'-separated):
#   line 1  -> column names: neighbourhood, pop0..pop6, nat_pop0..nat_pop6
#   line 2  -> the year each pop#/nat_pop# column stands for
#   line 3+ -> one line per neighbourhood
# NOTE(review): both header lines are hard-coded for the seven years 2010-2016;
# this assumes exactly one data row per neighbourhood and year -- confirm
# against the dataset.
barris = np.unique(df["Barri"])

# `with` guarantees the file is flushed and closed even when one neighbourhood
# raises halfway through the loop; a bare open()/close() pair would leak the
# handle in that case.
with open("clustering_neighbourhoods.csv", 'w') as save:
    save.write("neighbourhood;")
    save.write("pop0;pop1;pop2;pop3;pop4;pop5;pop6;")
    save.write("nat_pop0;nat_pop1;nat_pop2;nat_pop3;nat_pop4;nat_pop5;nat_pop6\n")
    save.write("year;")
    save.write("2010;2011;2012;2013;2014;2015;2016;")
    save.write("2010;2011;2012;2013;2014;2015;2016\n")

    for barri in barris:
        # rows of this neighbourhood from 2010 on, in chronological order
        evo_pop_df = df[(df["Barri"] == barri) & (df["Any"] >= 2010)].sort_values(by="Any")

        # get population and year
        year = np.array(evo_pop_df["Any"])
        population = np.array(evo_pop_df[u'Població'])

        # Births and deaths as plain float arrays aligned with `year`. Reading
        # them once avoids re-filtering the frame on every step and avoids
        # assigning a one-element Series to an array slot, which recent NumPy
        # versions deprecate.
        births = evo_pop_df[u'Naixements Total'].astype(float).values
        deaths = evo_pop_df[u'Defuncions Total'].astype(float).values

        # Natural evolution of the population: start from the first recorded
        # population and, for each following year, add the previous year's
        # births and subtract the previous year's deaths.
        population_natural_variation = np.copy(population)
        for now in range(1, population_natural_variation.size):
            then = now - 1
            population_natural_variation[now] = (
                population_natural_variation[then] + births[then] - deaths[then]
            )

        # one chart per neighbourhood: real vs. natural evolution
        fig = plt.figure(figsize=(9,9))
        gs = gridspec.GridSpec(1, 1)
        gs.update(left=0.2, bottom=0.15)
        ax = fig.add_subplot(gs[0])
        ax.set_title(barri, fontsize=30)
        ax.set_ylabel(u"Població", fontsize=25)
        ax.set_xlabel(u"Any", fontsize=25)
        ax.tick_params(which='both', labelsize=15, pad=10, size=10)
        ax.plot(year, population, label=u'evolució real')
        ax.plot(year, population_natural_variation, label=u'evolució natural')
        ax.legend(numpoints=1, prop={'size':18}, frameon=False, loc=1)
        # dots in the name are replaced so they are not mistaken for an extension
        fig.savefig(u"{}_evolucio_poblacio.png".format(barri.replace(".","_").strip()))

        # one CSV line: name, real series, natural series
        save.write("{};".format(barri))
        save.write(";".join(population.astype(str)))
        save.write(";")
        save.write(";".join(population_natural_variation.astype(str)))
        save.write("\n")


Compute the evolution of the population for each district


In [7]:
%matplotlib inline
# Python 2-only workaround: reloading `sys` brings back `setdefaultencoding`,
# which is normally removed at interpreter start-up; the call then makes UTF-8
# the implicit str<->unicode codec for the whole kernel -- presumably needed for
# the accented labels used below (TODO confirm).
# NOTE(review): this cannot run on Python 3 (no builtin `reload`, no
# `sys.setdefaultencoding`).
import sys
reload(sys)
sys.setdefaultencoding('utf8')
# Export the population series (2010 onwards) of every district ("Dte.") and
# save one PNG chart per district.
#
# Layout of clustering_district.csv (';'-separated):
#   line 1  -> column names: district, pop0..pop6, nat_pop0..nat_pop6
#   line 2  -> the year each pop#/nat_pop# column stands for
#   line 3+ -> one line per district
# NOTE(review): both header lines are hard-coded for the seven years 2010-2016;
# this assumes every district has data for exactly those years -- confirm
# against the dataset.
dtes = np.unique(df["Dte."])

# `with` guarantees the file is flushed and closed even when one district
# raises halfway through the loop; a bare open()/close() pair would leak the
# handle in that case.
with open("clustering_district.csv", 'w') as save:
    save.write("district;")
    save.write("pop0;pop1;pop2;pop3;pop4;pop5;pop6;")
    save.write("nat_pop0;nat_pop1;nat_pop2;nat_pop3;nat_pop4;nat_pop5;nat_pop6\n")
    save.write("year;")
    save.write("2010;2011;2012;2013;2014;2015;2016;")
    save.write("2010;2011;2012;2013;2014;2015;2016\n")

    for dte in dtes:
        # Yearly totals for the district. Only the three columns used below are
        # summed, so unrelated or non-numeric columns never enter the
        # aggregation. groupby() already returns the years in ascending order,
        # so no separate sort is needed.
        evo_pop_df = (
            df[(df["Dte."] == dte) & (df["Any"] >= 2010)]
            .groupby("Any")[[u'Població', u'Naixements Total', u'Defuncions Total']]
            .sum()
        )

        # get population and year (the year is the group key, i.e. the index)
        year = np.array(evo_pop_df.index)
        population = np.array(evo_pop_df[u'Població'])

        # Births and deaths as plain float arrays aligned with `year`. Reading
        # them once avoids re-filtering the frame on every step and avoids
        # assigning a one-element Series to an array slot, which recent NumPy
        # versions deprecate.
        births = evo_pop_df[u'Naixements Total'].astype(float).values
        deaths = evo_pop_df[u'Defuncions Total'].astype(float).values

        # Natural evolution of the population: start from the first recorded
        # population and, for each following year, add the previous year's
        # births and subtract the previous year's deaths.
        population_natural_variation = np.copy(population)
        for now in range(1, population_natural_variation.size):
            then = now - 1
            population_natural_variation[now] = (
                population_natural_variation[then] + births[then] - deaths[then]
            )

        # one chart per district: real vs. natural evolution
        fig = plt.figure(figsize=(9,9))
        gs = gridspec.GridSpec(1, 1)
        gs.update(left=0.2, bottom=0.15)
        ax = fig.add_subplot(gs[0])
        ax.set_title("Dte. {}".format(dte), fontsize=30)
        ax.set_ylabel(u"Població", fontsize=25)
        ax.set_xlabel(u"Any", fontsize=25)
        ax.tick_params(which='both', labelsize=15, pad=10, size=10)
        ax.plot(year, population, label=u'evolució real')
        ax.plot(year, population_natural_variation, label=u'evolució natural')
        ax.legend(numpoints=1, prop={'size':18}, frameon=False, loc=1)
        fig.savefig(u"dte{}_evolucio_poblacio.png".format(dte))

        # one CSV line: district id, real series, natural series
        save.write("{};".format(dte))
        save.write(";".join(population.astype(str)))
        save.write(";")
        save.write(";".join(population_natural_variation.astype(str)))
        save.write("\n")


Cluster neighbourhoods according to the evolution of their population


In [3]:
import pandas as pd
from sklearn import cluster

# Load the exported neighbourhood series.
#   df_write -> untouched copy of the file (including the "year" row); the
#               cluster labels are attached to it before it is written back.
#   df       -> working copy without the "year" row, normalised below.
# The file is read once and `df` is an explicit .copy(), so the in-place
# normalisation neither touches df_write nor triggers pandas'
# SettingWithCopyWarning on a filtered slice.
# NOTE(review): this rebinds the global `df` that earlier cells use for the raw
# dataset; re-running those cells afterwards requires reloading the raw data.
df_write = pd.read_csv("clustering_neighbourhoods.csv", sep=';')
df = df_write[df_write["neighbourhood"] != "year"].copy()

# Normalise every pop# and nat_pop# value by the row's own pop0, so each series
# is expressed relative to its first year. pop0 is used as the divisor for all
# columns first and only then set to 1.
scaled_columns = ["pop{}".format(i) for i in range(1, 7)]
scaled_columns += ["nat_pop{}".format(i) for i in range(7)]
df[scaled_columns] = df[scaled_columns].div(df["pop0"], axis=0)
df["pop0"] = 1

# squared distance between two rows of the population table
def computeDistanceIndexes(i, j, df):
    """Return the sum of squared differences of pop0..pop6 between two rows.

    i and j are index labels of `df`; for each label the first matching row is
    used. An IndexError is raised when a label is not present.
    """
    first = df[df.index == i]
    second = df[df.index == j]

    squared_gaps = []
    for step in range(7):
        column = "pop{}".format(step)
        gap = first[column].values[0] - second[column].values[0]
        squared_gaps.append(gap ** 2)
    return np.sum(np.array(squared_gaps))
    
#distanceMatrix = np.array([computeDistance(i,j, df) for i in range(1, df.shape[0] + 1) for j in range (1, df.shape[0] + 1)]).reshape((df.shape[0],df.shape[0]))

def computeDistance(X):
    """Return the matrix of pairwise squared Euclidean distances between rows of X.

    Intended as a callable `affinity` for AgglomerativeClustering, which passes
    in the sample matrix (one row per sample, one column per feature) and
    expects an (n_samples, n_samples) array back.

    The distances are computed from X itself instead of looking rows up in the
    module-level `df` under the assumption that its index labels are 1..n. For
    X = df[["pop0", ..., "pop6"]] the values are the same as before; the
    function simply no longer depends on global state or on the index labels.
    """
    samples = np.asarray(X, dtype=float)
    # broadcast to an (n, n, n_features) array of row-by-row differences
    gaps = samples[:, np.newaxis, :] - samples[np.newaxis, :, :]
    return np.sum(gaps ** 2, axis=2)


# variant of the row-to-row distance that also involves the nat_pop# columns
def computeDistanceIndexes2(i, j, df):
    """Return a sum of seven squared terms built from pop# and nat_pop# of two rows.

    For every year index k the term is
        ((pop_k[i] - pop_k[j]) / nat_pop_k[i] - nat_pop_k[j]) ** 2
    where i and j are index labels of `df` (first matching row is used).

    NOTE(review): as written, only nat_pop_k[i] is a divisor and nat_pop_k[j]
    is subtracted afterwards, so the result is not symmetric in (i, j) and is
    non-zero for i == j. This may be a missing pair of parentheses -- confirm
    the intended formula before relying on this function.
    """
    first = df[df.index == i]
    second = df[df.index == j]

    terms = []
    for step in range(7):
        pop_column = "pop{}".format(step)
        nat_column = "nat_pop{}".format(step)
        pop_gap = first[pop_column].values[0] - second[pop_column].values[0]
        scaled_gap = pop_gap / first[nat_column].values[0]
        shifted = scaled_gap - second[nat_column].values[0]
        terms.append(shifted ** 2)
    return np.sum(np.array(terms))
    
#distanceMatrix = np.array([computeDistance(i,j, df) for i in range(1, df.shape[0] + 1) for j in range (1, df.shape[0] + 1)]).reshape((df.shape[0],df.shape[0]))

def computeDistance2(X):
    """Return the pairwise distance matrix used as `affinity` for the clustering.

    X is only used to check the number of samples; the values are read from the
    module-level `df`, looked up by index label through computeDistanceIndexes.
    The result is an (n, n) array ordered like the rows of `df`.

    Raises ValueError when X and `df` do not have the same number of rows.

    NOTE(review): this delegates to computeDistanceIndexes, exactly like
    computeDistance, so computeDistanceIndexes2 is never called. That looks
    like a copy/paste slip, but the call is left unchanged: switching would
    alter the clustering, and the formula in computeDistanceIndexes2 needs to
    be confirmed first.
    """
    # Iterate over the labels `df` actually has instead of assuming they are
    # 1..n (which only holds while the "year" row at position 0 was dropped).
    labels = list(df.index)
    size = len(labels)
    if size != X.shape[0]:
        raise ValueError(
            "X has {} rows but df has {}".format(X.shape[0], size)
        )
    distances = [computeDistanceIndexes(i, j, df) for i in labels for j in labels]
    return np.array(distances).reshape((size, size))


# Cluster the neighbourhoods on their normalised population series pop0..pop6.
names = ["pop{}".format(i) for i in range(7)]
nclusters = 10

# Alternatives that were tried before and are kept here for reference only:
#   cluster.KMeans(n_clusters=nclusters, random_state=0).fit(df[names])
#   cluster.AgglomerativeClustering(n_clusters=nclusters, affinity=computeDistance, linkage="average").fit(df[names])

# Average-linkage agglomerative clustering on the custom distance matrix.
# NOTE: despite its name, `kmeans` holds an AgglomerativeClustering model; the
# name is kept in case other cells refer to it.
kmeans = cluster.AgglomerativeClustering(n_clusters=nclusters, affinity=computeDistance2, linkage="average").fit(df[names])
labels = kmeans.labels_.astype(int)
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(labels)

# Attach the labels to the unnormalised table. Rows that were not clustered
# (the "year" row) keep -1. The labels are aligned through df's own index
# rather than through the position-based assumption that the data starts at
# row 1.
df_write["clustering_pop"] = -1
df_write.loc[df.index, "clustering_pop"] = labels

# NOTE: this overwrites the file the data was loaded from, adding the
# clustering_pop column.
df_write.to_csv("clustering_neighbourhoods.csv", sep=';', index=False)


[1 1 1 1 1 1 1 1 1 0 0 5 1 1 4 3 3 3 1 0 1 1 0 1 0 1 1 0 1 1 1 0 0 1 0 1 7
 0 1 3 2 3 1 0 0 0 1 1 0 6 0 1 1 3 2 1 1 1 1 1 1 1 9 1 4 8 1 0 3 1 1 1 1]

Cluster districts according to the evolution of the population


In [15]:
import pandas as pd
from sklearn import cluster

# Load the exported district series.
#   df_write -> untouched copy of the file (including the "year" row); the
#               cluster labels are attached to it before it is written back.
#   df       -> working copy without the "year" row, normalised below.
# The file is read once and `df` is an explicit .copy(), so the in-place
# normalisation neither touches df_write nor triggers pandas'
# SettingWithCopyWarning on a filtered slice.
# NOTE(review): this rebinds the global `df` used by other cells; re-running
# those cells afterwards requires reloading their data first.
df_write = pd.read_csv("clustering_district.csv", sep=';')
df = df_write[df_write["district"] != "year"].copy()

# Normalise every pop# and nat_pop# value by the row's own pop0, so each series
# is expressed relative to its first year. pop0 is used as the divisor for all
# columns first and only then set to 1.
scaled_columns = ["pop{}".format(i) for i in range(1, 7)]
scaled_columns += ["nat_pop{}".format(i) for i in range(7)]
df[scaled_columns] = df[scaled_columns].div(df["pop0"], axis=0)
df["pop0"] = 1

# squared distance between two rows of the population table
def computeDistanceIndexes(i, j, df):
    """Return the sum of squared differences of pop0..pop6 between two rows.

    i and j are index labels of `df`; for each label the first matching row is
    used. An IndexError is raised when a label is not present.
    """
    first = df[df.index == i]
    second = df[df.index == j]

    squared_gaps = []
    for step in range(7):
        column = "pop{}".format(step)
        gap = first[column].values[0] - second[column].values[0]
        squared_gaps.append(gap ** 2)
    return np.sum(np.array(squared_gaps))
    
#distanceMatrix = np.array([computeDistance(i,j, df) for i in range(1, df.shape[0] + 1) for j in range (1, df.shape[0] + 1)]).reshape((df.shape[0],df.shape[0]))

def computeDistance(X):
    """Return the matrix of pairwise squared Euclidean distances between rows of X.

    Intended as a callable `affinity` for AgglomerativeClustering, which passes
    in the sample matrix (one row per sample, one column per feature) and
    expects an (n_samples, n_samples) array back.

    The distances are computed from X itself instead of looking rows up in the
    module-level `df` under the assumption that its index labels are 1..n. For
    X = df[["pop0", ..., "pop6"]] the values are the same as before; the
    function simply no longer depends on global state or on the index labels.
    """
    samples = np.asarray(X, dtype=float)
    # broadcast to an (n, n, n_features) array of row-by-row differences
    gaps = samples[:, np.newaxis, :] - samples[np.newaxis, :, :]
    return np.sum(gaps ** 2, axis=2)


# variant of the row-to-row distance that also involves the nat_pop# columns
def computeDistanceIndexes2(i, j, df):
    """Return a sum of seven squared terms built from pop# and nat_pop# of two rows.

    For every year index k the term is
        ((pop_k[i] - pop_k[j]) / nat_pop_k[i] - nat_pop_k[j]) ** 2
    where i and j are index labels of `df` (first matching row is used).

    NOTE(review): as written, only nat_pop_k[i] is a divisor and nat_pop_k[j]
    is subtracted afterwards, so the result is not symmetric in (i, j) and is
    non-zero for i == j. This may be a missing pair of parentheses -- confirm
    the intended formula before relying on this function.
    """
    first = df[df.index == i]
    second = df[df.index == j]

    terms = []
    for step in range(7):
        pop_column = "pop{}".format(step)
        nat_column = "nat_pop{}".format(step)
        pop_gap = first[pop_column].values[0] - second[pop_column].values[0]
        scaled_gap = pop_gap / first[nat_column].values[0]
        shifted = scaled_gap - second[nat_column].values[0]
        terms.append(shifted ** 2)
    return np.sum(np.array(terms))
    
#distanceMatrix = np.array([computeDistance(i,j, df) for i in range(1, df.shape[0] + 1) for j in range (1, df.shape[0] + 1)]).reshape((df.shape[0],df.shape[0]))

def computeDistance2(X):
    """Return the pairwise distance matrix used as `affinity` for the clustering.

    X is only used to check the number of samples; the values are read from the
    module-level `df`, looked up by index label through computeDistanceIndexes.
    The result is an (n, n) array ordered like the rows of `df`.

    Raises ValueError when X and `df` do not have the same number of rows.

    NOTE(review): this delegates to computeDistanceIndexes, exactly like
    computeDistance, so computeDistanceIndexes2 is never called. That looks
    like a copy/paste slip, but the call is left unchanged: switching would
    alter the clustering, and the formula in computeDistanceIndexes2 needs to
    be confirmed first.
    """
    # Iterate over the labels `df` actually has instead of assuming they are
    # 1..n (which only holds while the "year" row at position 0 was dropped).
    labels = list(df.index)
    size = len(labels)
    if size != X.shape[0]:
        raise ValueError(
            "X has {} rows but df has {}".format(X.shape[0], size)
        )
    distances = [computeDistanceIndexes(i, j, df) for i in labels for j in labels]
    return np.array(distances).reshape((size, size))

# Cluster the districts on their normalised population series pop0..pop6.
names = ["pop{}".format(i) for i in range(7)]
nclusters = 4

# Alternatives that were tried before and are kept here for reference only:
#   cluster.KMeans(n_clusters=nclusters, random_state=0).fit(df[names])
#   cluster.AgglomerativeClustering(n_clusters=nclusters, affinity=computeDistance, linkage="average").fit(df[names])

# Average-linkage agglomerative clustering on the custom distance matrix.
# NOTE: despite its name, `kmeans` holds an AgglomerativeClustering model; the
# name is kept in case other cells refer to it.
kmeans = cluster.AgglomerativeClustering(n_clusters=nclusters, affinity=computeDistance2, linkage="average").fit(df[names])
labels = kmeans.labels_.astype(int)
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(labels)

# Attach the labels to the unnormalised table. Rows that were not clustered
# (the "year" row) keep -1. The labels are aligned through df's own index
# rather than through the position-based assumption that the data starts at
# row 1.
df_write["clustering_pop"] = -1
df_write.loc[df.index, "clustering_pop"] = labels

# NOTE: this overwrites the file the data was loaded from, adding the
# clustering_pop column.
df_write.to_csv("clustering_district.csv", sep=';', index=False)

In [ ]: