In [21]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('input/data_en.csv',
                 parse_dates=['Submission Date'],
                 dtype={
                     'Price': np.float64,
                     'Quantity (grams)': np.float64
                 })

In [3]:
df.head()


Out[3]:
Submission Date Price Quantity (grams) Quality City (Census subdivision) Province Primary Reason Usage Average Monthly Consumption (grams)
0 2018-01-26 170.0 28.0 High conception bay south NL Medicate without a medical document Daily 28.0
1 2018-01-02 85.0 28.0 High portugal cove-st. philip's NL Use recreationally Daily 112.0
2 2018-01-02 20.0 3.5 High portugal cove-st. philip's NL Use recreationally Daily 0.0
3 2018-01-25 15.0 1.0 Medium torbay NL Medicate with a medical document A few times per week 28.0
4 2018-02-02 15.0 1.0 High torbay NL Use recreationally A few times per week 28.0

In [4]:
def price_per_gram(price, grams):
    if price == 0 or grams == 0:
        return np.nan
    
    return price / grams

df['Price Per Gram'] = df.apply(
    lambda row: price_per_gram(row['Price'], row['Quantity (grams)']), axis=1)

In [5]:
df.tail()


Out[5]:
Submission Date Price Quantity (grams) Quality City (Census subdivision) Province Primary Reason Usage Average Monthly Consumption (grams) Price Per Gram
15610 2018-01-31 150.0 14.0 High yellowknife NT Use recreationally A few times per week 0.5 10.714286
15611 2018-02-02 10.0 1.0 High yellowknife NT Medicate without a medical document NaN 0.0 10.000000
15612 2018-01-30 100.0 7.0 Medium iqaluit NU Use recreationally Daily 28.0 14.285714
15613 2018-01-30 320.0 28.0 Medium iqaluit NU Use recreationally Daily 14.0 11.428571
15614 2018-01-25 35.0 7.0 Medium kugluktuk NU Use recreationally NaN 0.0 5.000000

In [6]:
df_groupby_province = df.groupby(['Province'])['Price Per Gram'].mean()

df_groupby_province


Out[6]:
Province
AB     7.258903
BC     6.955510
MB     7.235913
NB     6.381788
NL     7.896240
NS     7.117300
NT    11.886364
NU    10.238095
ON     7.324025
PE     7.029095
QC     5.877720
SK     7.440028
YT     8.054545
Name: Price Per Gram, dtype: float64

In [7]:
df_groupby_city = df.groupby(['Province', 'City (Census subdivision)'])['Price Per Gram'].mean()
df_groupby_city.sort_values()


Out[7]:
Province  City (Census subdivision)
ON        brant                         2.000000
QC        sainte-anne-des-plaines       2.500000
BC        salmo                         2.857143
QC        kazabazua                     2.857143
NB        richibucto                    2.857143
QC        saint-michel-du-squatec       2.857143
          east angus                    2.857143
          rivi�re-rouge                 2.857143
          sainte-victoire-de-sorel      2.857143
          saint-andr�-d'argenteuil      2.857143
          havelock                      2.857143
NB        hillsborough                  2.966667
QC        val-alain                     3.000000
          saint-mathieu-de-beloeil      3.214286
BC        nakusp                        3.392857
ON        strathroy-caradoc             3.455357
QC        val-des-bois                  3.570000
          pontiac                       3.571429
BC        new hazelton                  3.571429
QC        saint-�phrem-de-beauce        3.571429
          cap-saint-ignace              3.571429
          saint-barth�lemy              3.571429
ON        huron-kinloss                 3.571429
          brighton                      3.571429
BC        princeton                     3.571429
AB        thorhild county               3.571429
QC        richmond                      3.571429
          saint-r�mi                    3.571429
          nicolet                       3.571429
          val-morin                     3.571429
                                         ...    
          petit-saguenay               12.500000
ON        oneida 41                    12.571429
SK        lac la ronge 156             12.589286
NU        iqaluit                      12.857143
ON        dryden                       12.857143
          renfrew                      13.571429
QC        saints-martyrs-canadiens     13.571429
NB        new bandon                   13.571429
NS        inverness, subd. a           14.000000
BC        alert bay                    14.000000
SK        one arrow 95-1a              14.000000
QC        les escoumins                14.285714
BC        boston bar 1a                14.285714
QC        price                        15.000000
AB        valleyview                   15.000000
SK        martensville                 15.000000
NT        hay river                    15.595238
          fort simpson                 17.142857
YT        dawson                       18.035714
NL        st. alban's                  20.000000
QC        natashquan                   20.000000
ON        south river                  20.000000
NL        fogo island                  20.000000
AB        chipewyan 201a               20.000000
BC        skeena-queen charlotte a     20.000000
QC        kawawachikamach              20.000000
ON        moose factory 68             20.000000
QC        manawan                      20.000000
ON        moosonee                     20.000000
QC        obedjiwan                    20.000000
Name: Price Per Gram, Length: 735, dtype: float64

In [8]:
df_groupby_city.to_csv('output/data-group-by-city.csv')

In [9]:
from mapbox import Geocoder

from keys import MAPBOX_ACCESS_TOKEN

geocoder = Geocoder(access_token=MAPBOX_ACCESS_TOKEN)

In [10]:
r = geocoder.forward('moosonee, QC', limit=1, country=['ca'])

data = r.json()

In [11]:
lat, lng = data['features'][0]['geometry']['coordinates']

In [13]:
cache = {}

In [17]:
def city_to_coordinates(city, province):
    if cache.get('{0}, {1}'.format(city, province)):
        return cache.get('{0}, {1}'.format(city, province))
    
    resp = geocoder.forward('{0}, {1}'.format(city, province), limit=1, country=['ca'])
    
    if not resp.ok:
        return np.nan
    
    try:
        cache['{0}, {1}'.format(city, province)] = ['features'][0]['geometry']['coordinates']
        return ['features'][0]['geometry']['coordinates']
    except:
        return np.nan
    

df['Coordinates'] = df.apply(
    lambda row: city_to_coordinates(
            row['City (Census subdivision)'], row['Province']), axis=1)

In [18]:
df.head()


Out[18]:
Submission Date Price Quantity (grams) Quality City (Census subdivision) Province Primary Reason Usage Average Monthly Consumption (grams) Price Per Gram Coordinates
0 2018-01-26 170.0 28.0 High conception bay south NL Medicate without a medical document Daily 28.0 6.071429 NaN
1 2018-01-02 85.0 28.0 High portugal cove-st. philip's NL Use recreationally Daily 112.0 3.035714 NaN
2 2018-01-02 20.0 3.5 High portugal cove-st. philip's NL Use recreationally Daily 0.0 5.714286 NaN
3 2018-01-25 15.0 1.0 Medium torbay NL Medicate with a medical document A few times per week 28.0 15.000000 NaN
4 2018-02-02 15.0 1.0 High torbay NL Use recreationally A few times per week 28.0 15.000000 NaN

In [24]:
df[['Latitude', 'Longitude']] = df.apply(lambda row: row['Coordinates'])


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

TypeError: an integer is required

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-24-5c725cbc8724> in <module>()
----> 1 df[['Latitude', 'Longitude']] = df.apply(lambda row: row['Coordinates'])

/usr/local/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
   4875                         f, axis,
   4876                         reduce=reduce,
-> 4877                         ignore_failures=ignore_failures)
   4878             else:
   4879                 return self._apply_broadcast(f, axis)

/usr/local/lib/python3.6/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
   4971             try:
   4972                 for i, v in enumerate(series_gen):
-> 4973                     results[i] = func(v)
   4974                     keys.append(v.name)
   4975             except Exception as e:

<ipython-input-24-5c725cbc8724> in <lambda>(row)
----> 1 df[['Latitude', 'Longitude']] = df.apply(lambda row: row['Coordinates'])

/usr/local/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
    621         key = com._apply_if_callable(key, self)
    622         try:
--> 623             result = self.index.get_value(self, key)
    624 
    625             if not is_scalar(result):

/usr/local/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
   2558         try:
   2559             return self._engine.get_value(s, k,
-> 2560                                           tz=getattr(series.dtype, 'tz', None))
   2561         except KeyError as e1:
   2562             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

KeyError: ('Coordinates', 'occurred at index Submission Date')

In [ ]:
df.plot(kind='scatter', x='Longitude', y='Latitude', alpha=0.4)
plt.show()