In [1]:
from src.algorithms.slp import SLP
from src.utils.geo import haversine
import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (12.0, 10.0)

In [3]:
# Configuration for the SLP run:
#   num_iters — number of label-propagation iterations to perform
#   hold_out  — users whose id_str ends in any of these characters are
#               excluded from training and held out for testing
#   json_path — local path to the JSON tweet-format description
options = dict(
    num_iters=5,
    hold_out={'9'},
    json_path='/local/path/to/format/twitter_format.json',
)
slp = SLP(sc, sqlCtx, options)

In [4]:
# Train Spatial Label Propagation on tweets under the HDFS glob
# (year 2015; the wildcards presumably select month/day/hour partitions — confirm).
# [Note: per-iteration times suffer from lazy evaluation — work is deferred
#  until an action forces it, so the first iteration's timing is not representative]
slp.train('hdfs:///datasets/twitter/2015/*/*/*/*')


Building edge list
Finding known user locations
Filtering out user locations that end in: 9
Building a filtered edge list
Beginning iterations
Completed iteration:  0  in  0.0883619785309
Completed iteration:  1  in  0.306536912918
Completed iteration:  2  in  0.250301122665
Completed iteration:  3  in  0.29673409462
Completed iteration:  4  in  0.313722133636

In [5]:
# Persist the trained model so future test runs can skip retraining.
model_path = '/local/path/to/save/slp.test.pkl'
slp.save(model_path)

In [7]:
slp.test('hdfs:///datasets/twitter/2015/*/*/*/*')


Number of Found Locations:  2076
('Median Error', 34.522314717610115)
('Mean Error: ', 994.39026311571831)
Out[7]:
{'coverage': 0.1081193687828759,
 'data_path': 'hdfs:///datasets/twitter/2015/*/*/*/*',
 'mean': 994.39026311571831,
 'median': 34.522314717610115,
 'options': {'hold_out': {'9'},
  'json_path': '/path/to/format/twitter_format.json',
  'num_iters': 5}}