In [1]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Make sure the default encoding is utf-8.
# This is a Python 2-only workaround: reload() is a builtin there, and
# sys.setdefaultencoding() only becomes reachable again after reloading sys.
# Neither exists on Python 3, so the block is skipped on newer interpreters
# instead of raising NameError.
import sys
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2 only
    sys.setdefaultencoding("utf-8")
# Will be used to address character encoding later due to French names
try:
    from pandas.compat import u
except ImportError:
    # Newer pandas releases no longer provide pandas.compat.u. Text read on
    # Python 3 is already unicode, so an identity function is sufficient.
    def u(s):
        """Return ``s`` unchanged; stand-in for the missing ``pandas.compat.u``."""
        return s
In [2]:
%pdb off
In [3]:
# Load the raw table from the CSV export and preview the first rows.
flowData = pd.read_csv(filepath_or_buffer='../TableD_01110030-eng.csv')
flowData.head(n=5)
Out[3]:
In [4]:
# Run both place-name columns through u() so they hold unicode text
for place_col in ('GEO', 'GEODEST'):
    flowData[place_col] = flowData[place_col].map(u)
flowData.head(n=5)
Out[4]:
In [5]:
# Columns that are not needed for the rest of the analysis
dropCols = [
    'Geographical classification',
    'Geographical classification.1',
    'Coordinate',
    'Vector',
]
flowData = flowData.drop(labels=dropCols, axis=1)
flowData.head(n=10)
Out[5]:
In [6]:
# Give the two place columns more descriptive names
flowData = flowData.rename(
    columns=dict(GEO="Origin", GEODEST="Destination")
)
In [7]:
# Keep only the rows for reference year 2011 (the most recent data), then
# drop the now-constant Ref_Date column and renumber the rows from 0.
flowData2011 = (
    flowData.loc[flowData['Ref_Date'] == 2011]
    .drop('Ref_Date', axis=1)
    .reset_index(drop=True)
)
flowData2011.head(n=5)
Out[7]:
In [8]:
# Convert the Value column to a numeric data type.
# pd.to_numeric(errors='coerce') replaces the deprecated
# Series.convert_objects(convert_numeric=True): entries that cannot be parsed
# as numbers become NaN instead of raising.
flowData2011['Value'] = pd.to_numeric(flowData2011['Value'], errors='coerce')
In [9]:
# Remove all the non-census areas so we can geocode the cities that qualify as CMAs.
# A row is kept only when neither its Destination nor its Origin contains
# the text 'Non-census'.
flowData2011_cma = flowData2011[
    ~(
        flowData2011['Destination'].str.contains('Non-census')
        | flowData2011['Origin'].str.contains('Non-census')
    )
]
flowData2011_cma.head(n=5)
Out[9]:
In [10]:
# Select the "Out-migration" rows, then drop the now-constant MIGMOVE column
# and renumber the rows from 0.
outMig = (
    flowData2011_cma.loc[flowData2011_cma['MIGMOVE'] == "Out-migration"]
    .drop('MIGMOVE', axis=1)
    .reset_index(drop=True)
)
outMig.head(n=5)
Out[10]:
In [11]:
# Reshape the long table into a matrix: one row per Origin, one column per
# Destination, cells taken from Value. Keyword arguments are used because
# recent pandas releases no longer accept positional arguments to pivot().
outMigPiv = outMig.pivot(index='Origin', columns='Destination', values='Value')
outMigPiv.head()
Out[11]:
In [12]:
# Since there is such a range in values, let's put this on a log scale
def log_scale(x):
    """Return the base-10 logarithm of ``x`` (a scalar or an array-like)."""
    return np.log10(x)


# Apply log10 to the whole frame in one vectorised call instead of calling
# the function once per cell through applymap (deprecated in recent pandas).
# log10(0) evaluates to -inf; the divide-by-zero warning is silenced because
# those cells are deliberately replaced with 0 on the next line.
# NOTE(review): after the replacement a cell that held 0 is indistinguishable
# from a cell that held 1 (log10(1) == 0) -- confirm this is acceptable.
with np.errstate(divide='ignore'):
    outMigPivLog = log_scale(outMigPiv)
outMigPivLog = outMigPivLog.replace([np.inf, -np.inf], 0)
In [13]:
# Draw the log-scaled Origin x Destination matrix as a heatmap
sns.heatmap(data=outMigPivLog)
Out[13]:
Source: Geocoder.ca
In [45]:
# Get mapping of cities to centroids.
# header=None: the first line of the file is read as data, so the column
# names are supplied explicitly.
centroids = pd.read_csv(
    './canada_cities.csv',
    header=None,
    names=['Location', 'Province', 'Latitude', 'Longitude']
)
In [46]:
from titlecase import titlecase
def title_u(x):
    """Pass ``x`` through ``u`` and return the result of calling ``.title()`` on it."""
    return u(x).title()
In [47]:
# Title-case every place name, element by element, with title_u
centroids['Location'] = centroids['Location'].apply(title_u)
centroids.head(n=15)
Out[47]:
In [51]:
# Lookup from two-letter province/territory code to the longer name
provAbbr = dict(
    AB='Alberta',
    BC='British Columbia',
    MB='Manitoba',
    NB='New Brunswick',
    NL='Newfoundland',
    NS='Nova Scotia',
    NT='Northwest Territories',
    NU='Nunavut',
    ON='Ontario',
    PE='PEI',
    QC='Quebec',
    SK='Saskatchewan',
    YT='Yukon'
)
# Values that are not keys of provAbbr are left as they are by replace()
centroids['Province'] = centroids['Province'].replace(to_replace=provAbbr)
centroids.head(n=5)
Out[51]:
In [55]:
# Collapse rows that share the same (Location, Province) pair
centroids = centroids.drop_duplicates(
    subset=['Location', 'Province']
)
centroids.head(n=5)
Out[55]:
In [ ]: