In [1]:
from src.algorithms.slp import SLP
from src.utils.geo import haversine
import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Notebook-wide default figure size, given to matplotlib as (width, height).
plt.rcParams.update({'figure.figsize': (12.0, 10.0)})
In [3]:
# SLP configuration:
#   num_iters - number of iterations the SLP model performs
#   hold_out  - trailing character(s) of user ids [user.id_str]; users whose id
#               ends with one of these are left out of training and used for test
#   json_path - path to the json format
options = dict(
    num_iters=5,
    hold_out={'9'},
    json_path='/local/path/to/format/twitter_format.json',
)
# sc / sqlCtx are not defined in this notebook; presumably the Spark context and
# SQL context injected by the PySpark session -- confirm before a fresh run.
slp = SLP(sc, sqlCtx, options)
In [4]:
# Train Spatial Label Propagation.
# Note: reported iteration times suffer from lazy evaluation.
train_glob = 'hdfs:///datasets/twitter/2015/*/*/*/*'
slp.train(train_glob)
In [5]:
# Persist the trained model so future testing does not require retraining.
model_path = '/local/path/to/save/slp.test.pkl'
slp.save(model_path)
In [7]:
# Evaluate against the same HDFS glob that was used for training.
# NOTE(review): presumably only users matched by options['hold_out'] are scored
# here -- confirm against SLP.test.
test_glob = 'hdfs:///datasets/twitter/2015/*/*/*/*'
slp.test(test_glob)
Out[7]: