In [2]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Plot styling for the whole notebook.
# `set_style` only applies rc entries that belong to seaborn's style
# definition; the former 'axis.labelsize' entry was not a valid rc key and was
# silently dropped, so it has been removed. The label size is set explicitly
# through rcParams below.
sns.set_style(rc={'font.family': ['sans-serif']})
sns.set_context("notebook")
# Set after `set_context` so these explicit values are the ones in effect.
plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['axes.labelsize'] = 18
In this post I will attempt to combine two datasets I have worked on: first, the average driving speeds by country as estimated by Google's location API, and second, the GDP by country using data from the World Bank.
First we import the data as two separate data frames. This assumes the data exists as described in the other posts; the cleaning is also done here without further explanation.
In [7]:
# Load the World Bank GDP table; the first two rows of the file are skipped
# before the header row is read.
df_GDP = pd.read_csv("../data_sets/GDP_by_Country_WorldBank/ny.gdp.mktp.cd_Indicator_en_csv_v2.csv",
                     quotechar='"', skiprows=2)

# Discard the 3rd, 4th and the last two columns, selected by position.
# A plain list indexes the column Index directly, so numpy (which this
# notebook never imports) is not needed here.
colnames_to_drop = df_GDP.columns[[2, 3, -2, -1]]
# One non-mutating drop with an explicit `axis` keyword replaces the
# per-column in-place loop and its positional axis argument.
df_GDP = df_GDP.drop(colnames_to_drop, axis=1)

# Keep only the rows that actually carry a country code.
df_GDP = df_GDP[~df_GDP['Country Code'].isnull()]

# Average-speed table; skipinitialspace strips the blanks that follow each
# delimiter in the text file.
df_AS = pd.read_csv("AverageSpeedsByCountry.txt", skipinitialspace=True)
Unfortunately, the two data sets can't be coerced together easily. The country codes used in the average speed investigation were taken from the geonames website, which uses a two-letter code, while the country codes for the GDP data use the proper three-letter code. Thankfully, most of the codes can be matched by simply pairing the two-letter code with the first two letters of the three-letter code (e.g. "US" with "USA"). We first create all of these pairs:
In [20]:
# Pair every two-letter code from the speed data with the three-letter GDP
# code whose first two letters are identical. Unique matches are stored in
# `pairs` as [two_letter_code, three_letter_code]; ambiguous or missing
# matches are only reported.
gdp_codes = df_GDP['Country Code'].values  # loop-invariant, so fetched once
pairs = []
for Country in df_AS.Country:
    # Compare the prefix for equality: the earlier substring test
    # (`Country in CC[:2]`) would also have accepted empty or one-letter codes.
    matched_values = [CC for CC in gdp_codes if CC[:2] == Country]
    if len(matched_values) == 1:
        pairs.append([Country, matched_values[0]])
    elif len(matched_values) > 1:
        # Ambiguous prefix: list the candidates so the pair can be chosen by hand.
        # A single-argument print(...) emits the same text on Python 2 and 3.
        print("For {} I found these matches: {}".format(Country, " ".join(matched_values)))
    else:
        print("No matches found for {}".format(Country))
Some of the countries with multiple matches can easily be added by hand:
In [21]:
# Manually chosen [two_letter_code, three_letter_code] pairs for countries
# whose two-letter prefix matched more than one GDP country code.
pairs_by_hand = [['BR', 'BRA'],
                 ['CA', 'CAN'],
                 ['FR', 'FRA'],
                 ['AU', 'AUS'],
                 ['AR', 'ARG'],
                 ['IN', 'IND']]
for pair in pairs_by_hand:
    # Guard against duplicates so that re-running this cell leaves `pairs`
    # unchanged instead of appending the same entries again.
    if pair not in pairs:
        pairs.append(pair)
In [40]:
# TextPath is not used by the plot below; the import is kept so the name
# remains available in the notebook namespace.
from matplotlib.text import TextPath

# Plot average speed against 2013 GDP, one point per matched country, using
# the country's two-letter code (rendered as upright mathtext) as the marker.
ax = plt.subplot(111)
for AveSpeedCC, GDPCC in pairs:
    gdp_2013 = df_GDP.loc[df_GDP['Country Code'] == GDPCC, '2013']
    speeds = df_AS.loc[df_AS.Country == AveSpeedCC, 'Ave']
    code_marker = r"$\mathrm{{{}}}$".format(AveSpeedCC)
    # The first matching GDP value is paired with the speed value(s) found
    # for this country.
    ax.plot(speeds.values, gdp_2013.values[0], marker=code_marker, markersize=20)

# Axis setup is done once, after all points have been drawn.
ax.set_yscale("log")
ax.set_xlabel("Average Speed")
ax.set_ylabel("GDP")
plt.show()
In [ ]: