In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
In [8]:
# Source CSV: semicolon-separated, comma as the decimal mark, decoded as UTF-8.
filename = "../datasets/dades_oficials_ajBCN/ajuntament_bcn_formatted_data.csv"
df = pd.read_csv(
    filename,
    sep=';',
    decimal=',',
    encoding='utf-8',
)
In [10]:
%matplotlib inline
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
save = open("clustering_neighbourhoods.csv", 'w')
save.write("neighbourhood;")
save.write("pop0;pop1;pop2;pop3;pop4;pop5;pop6;")
save.write("nat_pop0;nat_pop1;nat_pop2;nat_pop3;nat_pop4;nat_pop5;nat_pop6\n")
save.write("year;")
save.write("2010;2011;2012;2013;2014;2015;2016;")
save.write("2010;2011;2012;2013;2014;2015;2016\n")
# print evolution of population for neighbourhood
barris = np.unique(df["Barri"])
for barri in barris:
evo_pop_df = df[(df["Barri"] == barri) & (df["Any"] >= 2010)].sort_values(by="Any")
# get population and year
year = np.array(evo_pop_df["Any"])
population = np.array(evo_pop_df[u'Població'])
# get natural evolution of the population
population_natural_variation = np.copy(population)
for now in range(1, population_natural_variation.size):
then = now - 1
add_people = evo_pop_df[evo_pop_df["Any"] == year[then]][u'Naixements Total'].astype(float)
sub_people = evo_pop_df[evo_pop_df["Any"] == year[then]][u'Defuncions Total'].astype(float)
population_natural_variation[now] = population_natural_variation[then] + add_people - sub_people
fig = plt.figure(figsize=(9,9))
gs = gridspec.GridSpec(1, 1)
gs.update(left=0.2, bottom=0.15)
ax = fig.add_subplot(gs[0])
ax.set_title(barri, fontsize=30)
ax.set_ylabel(u"Població", fontsize=25)
ax.set_xlabel(u"Any", fontsize=25)
ax.tick_params(which='both', labelsize=15, pad=10, size=10)
ax.plot(year, population, label=u'evolució real')
ax.plot(year, population_natural_variation, label=u'evolució natural')
ax.legend(numpoints=1, prop={'size':18}, frameon=False, loc=1)
fig.savefig(u"{}_evolucio_poblacio.png".format(barri.replace(".","_").strip()))
save.write("{};".format(barri))
save.write(";".join(population.astype(str)))
save.write(";")
save.write(";".join(population_natural_variation.astype(str)))
save.write("\n")
save.close()
In [7]:
%matplotlib inline
import sys
reload(sys)
sys.setdefaultencoding('utf8')
save = open("clustering_district.csv", 'w')
save.write("district;")
save.write("pop0;pop1;pop2;pop3;pop4;pop5;pop6;")
save.write("nat_pop0;nat_pop1;nat_pop2;nat_pop3;nat_pop4;nat_pop5;nat_pop6\n")
save.write("year;")
save.write("2010;2011;2012;2013;2014;2015;2016;")
save.write("2010;2011;2012;2013;2014;2015;2016\n")
# print evolution of population for districts
dtes = np.unique(df["Dte."])
for dte in dtes:
evo_pop_df = df[(df["Dte."] == dte) & (df["Any"] >= 2010)].sort_values(by="Any").groupby("Any").sum()
# get population and year
year = np.array(evo_pop_df.index)
population = np.array(evo_pop_df[u'Població'])
# get natural evolution of the population
population_natural_variation = np.copy(population)
for now in range(1, population_natural_variation.size):
then = now - 1
add_people = evo_pop_df[evo_pop_df.index == year[then]][u'Naixements Total'].astype(float)
sub_people = evo_pop_df[evo_pop_df.index == year[then]][u'Defuncions Total'].astype(float)
population_natural_variation[now] = population_natural_variation[then] + add_people - sub_people
fig = plt.figure(figsize=(9,9))
gs = gridspec.GridSpec(1, 1)
gs.update(left=0.2, bottom=0.15)
ax = fig.add_subplot(gs[0])
ax.set_title("Dte. {}".format(dte), fontsize=30)
ax.set_ylabel(u"Població", fontsize=25)
ax.set_xlabel(u"Any", fontsize=25)
ax.tick_params(which='both', labelsize=15, pad=10, size=10)
ax.plot(year, population, label=u'evolució real')
ax.plot(year, population_natural_variation, label=u'evolució natural')
ax.legend(numpoints=1, prop={'size':18}, frameon=False, loc=1)
fig.savefig(u"dte{}_evolucio_poblacio.png".format(dte))
save.write("{};".format(dte))
save.write(";".join(population.astype(str)))
save.write(";")
save.write(";".join(population_natural_variation.astype(str)))
save.write("\n")
save.close()
In [3]:
import numpy as np
import pandas as pd
from sklearn import cluster

# Load the per-neighbourhood series once. The "year" pseudo-header row stays in
# df_write (the frame written back to disk) and is dropped from the working
# frame; .copy() makes df independent so the normalisation below neither
# touches df_write nor triggers chained-assignment warnings.
df_write = pd.read_csv("clustering_neighbourhoods.csv", sep=';')
df = df_write[df_write["neighbourhood"] != "year"].copy()

# normalize populations dividing the pop# and nat_pop# fields by the value stored in pop0
# The divisor is cast to float so the result does not depend on the
# interpreter's integer-division semantics.
base_population = df["pop0"].astype(float)
for i in range(7):
    nat_col = "nat_pop{}".format(i)
    df[nat_col] = df[nat_col] / base_population
    if i > 0:
        pop_col = "pop{}".format(i)
        df[pop_col] = df[pop_col] / base_population
df["pop0"] = 1


def computeDistanceIndexes(i, j, df):
    """Return the squared Euclidean distance between two rows of ``df``.

    ``i`` and ``j`` are index labels of ``df``; the distance is taken over the
    ``pop0`` .. ``pop6`` columns.
    """
    cols = ["pop{}".format(index) for index in range(7)]
    rowi = df.loc[i, cols].values.astype(float)
    rowj = df.loc[j, cols].values.astype(float)
    return np.sum((rowi - rowj) ** 2)


def computeDistanceIndexes2(i, j, df):
    """Return the second per-pair measure between rows ``i`` and ``j`` of ``df``.

    For each of the seven years it evaluates
    ``((pop_i - pop_j) / nat_pop_i - nat_pop_j) ** 2`` and sums the results.
    """
    # NOTE(review): as written, only (pop_i - pop_j) is divided by nat_pop_i and
    # nat_pop_j is subtracted afterwards, so the value is neither symmetric nor
    # zero for i == j. The parenthesisation looks unintended; confirm the
    # formula before using this as a metric. Arithmetic kept as in the original.
    pop_cols = ["pop{}".format(index) for index in range(7)]
    nat_cols = ["nat_pop{}".format(index) for index in range(7)]
    pop_i = df.loc[i, pop_cols].values.astype(float)
    pop_j = df.loc[j, pop_cols].values.astype(float)
    nat_i = df.loc[i, nat_cols].values.astype(float)
    nat_j = df.loc[j, nat_cols].values.astype(float)
    return np.sum(((pop_i - pop_j) / nat_i - nat_j) ** 2)


def _pairwiseDistances(X, pair_distance):
    """Build the square matrix of ``pair_distance`` over every pair of rows of the global ``df``.

    ``X`` is only used for its row count; row labels are taken from
    ``df.index`` rather than assumed to be 1..n.
    """
    labels = list(df.index)
    size = X.shape[0]
    values = [pair_distance(i, j, df) for i in labels for j in labels]
    return np.array(values).reshape((size, size))


def computeDistance(X):
    """Callable affinity: pairwise squared Euclidean distances over pop0..pop6."""
    return _pairwiseDistances(X, computeDistanceIndexes)


def computeDistance2(X):
    """Callable affinity used for the clustering below."""
    # NOTE(review): this delegates to computeDistanceIndexes, not
    # computeDistanceIndexes2, so it currently returns the same matrix as
    # computeDistance. That looks like a copy-paste slip, but switching it would
    # change the clustering (and computeDistanceIndexes2 needs review first),
    # so the existing behaviour is preserved.
    return _pairwiseDistances(X, computeDistanceIndexes)


names = ["pop{}".format(i) for i in range(7)]
nclusters = 10

# Alternatives tried earlier on the same columns:
# cluster.KMeans(n_clusters=nclusters, random_state=0) and
# AgglomerativeClustering with affinity=computeDistance.

# cluster neighbourhoods using AgglomerativeClustering
# NOTE(review): newer scikit-learn releases replace the `affinity` argument with
# `metric` — confirm against the installed version.
kmeans = cluster.AgglomerativeClustering(n_clusters=nclusters, affinity=computeDistance2, linkage="average").fit(df[names])
print(kmeans.labels_.astype(int))

# Store the labels next to the original (un-normalised) data; the "year" row
# keeps the -1 placeholder. Rows are addressed through df.index so the labels
# line up with exactly the rows that were clustered.
df_write["clustering_pop"] = -1
df_write.loc[df.index, "clustering_pop"] = kmeans.labels_.astype(int)
df_write.to_csv("clustering_neighbourhoods.csv", sep=';', index=False)
In [15]:
import numpy as np
import pandas as pd
from sklearn import cluster

# Load the per-district series once. The "year" pseudo-header row stays in
# df_write (the frame written back to disk) and is dropped from the working
# frame; .copy() makes df independent so the normalisation below neither
# touches df_write nor triggers chained-assignment warnings.
df_write = pd.read_csv("clustering_district.csv", sep=';')
df = df_write[df_write["district"] != "year"].copy()

# normalize populations dividing the pop# and nat_pop# fields by the value stored in pop0
# The divisor is cast to float so the result does not depend on the
# interpreter's integer-division semantics.
base_population = df["pop0"].astype(float)
for i in range(7):
    nat_col = "nat_pop{}".format(i)
    df[nat_col] = df[nat_col] / base_population
    if i > 0:
        pop_col = "pop{}".format(i)
        df[pop_col] = df[pop_col] / base_population
df["pop0"] = 1


def computeDistanceIndexes(i, j, df):
    """Return the squared Euclidean distance between two rows of ``df``.

    ``i`` and ``j`` are index labels of ``df``; the distance is taken over the
    ``pop0`` .. ``pop6`` columns.
    """
    cols = ["pop{}".format(index) for index in range(7)]
    rowi = df.loc[i, cols].values.astype(float)
    rowj = df.loc[j, cols].values.astype(float)
    return np.sum((rowi - rowj) ** 2)


def computeDistanceIndexes2(i, j, df):
    """Return the second per-pair measure between rows ``i`` and ``j`` of ``df``.

    For each of the seven years it evaluates
    ``((pop_i - pop_j) / nat_pop_i - nat_pop_j) ** 2`` and sums the results.
    """
    # NOTE(review): as written, only (pop_i - pop_j) is divided by nat_pop_i and
    # nat_pop_j is subtracted afterwards, so the value is neither symmetric nor
    # zero for i == j. The parenthesisation looks unintended; confirm the
    # formula before using this as a metric. Arithmetic kept as in the original.
    pop_cols = ["pop{}".format(index) for index in range(7)]
    nat_cols = ["nat_pop{}".format(index) for index in range(7)]
    pop_i = df.loc[i, pop_cols].values.astype(float)
    pop_j = df.loc[j, pop_cols].values.astype(float)
    nat_i = df.loc[i, nat_cols].values.astype(float)
    nat_j = df.loc[j, nat_cols].values.astype(float)
    return np.sum(((pop_i - pop_j) / nat_i - nat_j) ** 2)


def _pairwiseDistances(X, pair_distance):
    """Build the square matrix of ``pair_distance`` over every pair of rows of the global ``df``.

    ``X`` is only used for its row count; row labels are taken from
    ``df.index`` rather than assumed to be 1..n.
    """
    labels = list(df.index)
    size = X.shape[0]
    values = [pair_distance(i, j, df) for i in labels for j in labels]
    return np.array(values).reshape((size, size))


def computeDistance(X):
    """Callable affinity: pairwise squared Euclidean distances over pop0..pop6."""
    return _pairwiseDistances(X, computeDistanceIndexes)


def computeDistance2(X):
    """Callable affinity used for the clustering below."""
    # NOTE(review): this delegates to computeDistanceIndexes, not
    # computeDistanceIndexes2, so it currently returns the same matrix as
    # computeDistance. That looks like a copy-paste slip, but switching it would
    # change the clustering (and computeDistanceIndexes2 needs review first),
    # so the existing behaviour is preserved.
    return _pairwiseDistances(X, computeDistanceIndexes)


names = ["pop{}".format(i) for i in range(7)]
nclusters = 4

# Alternatives tried earlier on the same columns:
# cluster.KMeans(n_clusters=nclusters, random_state=0) and
# AgglomerativeClustering with affinity=computeDistance.

# cluster districts using AgglomerativeClustering
# NOTE(review): newer scikit-learn releases replace the `affinity` argument with
# `metric` — confirm against the installed version.
kmeans = cluster.AgglomerativeClustering(n_clusters=nclusters, affinity=computeDistance2, linkage="average").fit(df[names])
print(kmeans.labels_.astype(int))

# Store the labels next to the original (un-normalised) data; the "year" row
# keeps the -1 placeholder. Rows are addressed through df.index so the labels
# line up with exactly the rows that were clustered.
df_write["clustering_pop"] = -1
df_write.loc[df.index, "clustering_pop"] = kmeans.labels_.astype(int)
df_write.to_csv("clustering_district.csv", sep=';', index=False)
In [ ]: