In [1]:
from src.algorithms.gmm import GMM
from src.utils.geo import haversine
import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
import math
# Make our plots default to a larger size
plt.rcParams['figure.figsize'] = (12.0, 10.0)
In [2]:
# Load test data
temp_table_name = 'tweets'
test_data_path = 'hdfs:///path/to/test/data/*'
In [3]:
g = GMM(sc, sqlCtx, {'fields':set(['user.location', 'text']), 'json_path':'/local/path/to/twitter_format.json'})
# Train Model [This takes a while so make sure to save it]
#g.train('hdfs:///path/to/training/data')
#g.save('/local/path/to/pretrained_model.pkl')
# Load pretrained model
g.load('/local/path/to/pretrained_model.pkl')
In [29]:
g.test(test_data_path)
Out[29]:
In [ ]:
# Create a temporary table in this context, which allows us to explore the data interactively
all_tweets = sqlCtx.parquetFile(test_data_path)
all_tweets.cache()
all_tweets.registerTempTable(temp_table_name)
In [28]:
# NOTE: This where clause keeps only tweets that have geo coordinates and a user-specified location
#where_clause = "lang = 'en' and geo.coordinates is not null and user.location is not null"
where_clause = "geo.coordinates is not null and user.location is not null"
limit = 10000
sql_stmt = 'select * from %s where %s limit %d'%(temp_table_name, where_clause, limit)
print 'Query Executed:', sql_stmt
In [29]:
# Perform query and pull results into the local context
# TODO: Consider switching to sampling
local_tweets = sqlCtx.sql(sql_stmt).collect()
print 'Pulled %d tweets into local context'%len(local_tweets)
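As a possible follow-up to the TODO above, the LIMIT could be replaced with sampling so the local pull is not biased toward whichever rows happen to come first. This is only a sketch; the 0.01 fraction and fixed seed are arbitrary assumptions.
# Hypothetical sampled pull instead of LIMIT (fraction and seed chosen arbitrarily)
sampled_stmt = 'select * from %s where %s'%(temp_table_name, where_clause)
local_tweets_sampled = sqlCtx.sql(sampled_stmt).sample(False, 0.01, 42).collect()
print 'Pulled %d sampled tweets into local context'%len(local_tweets_sampled)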
In [6]:
def print_tweet(tweet):
    print
    print 'Text:', tweet.text
    print 'User Specified Location:', tweet.user.location
    print 'Location:', tweet.geo.coordinates
In [7]:
# Temporary block of code until the new GMM models are run
#### TODO: remove this when the models are re-run!
from sklearn import mixture

def combine_gmms(gmms):
    """Takes a list of (gmm, error) pairs and produces a single GMM that is the weighted sum of the models."""
    n_components = sum([g[0].n_components for g in gmms])
    covariance_type = gmms[0][0].covariance_type
    new_gmm = mixture.GMM(n_components=n_components, covariance_type=covariance_type)
    new_gmm.means_ = np.concatenate([g[0].means_ for g in gmms])
    new_gmm.covars_ = np.concatenate([g[0].covars_ for g in gmms])
    # Weight each component by the inverse fourth power of its model's training error
    weights = np.concatenate([g[0].weights_ * ((1.0 / g[1]) ** 4) for g in gmms])
    new_gmm.weights_ = weights / np.sum(weights)  # Normalize weights
    new_gmm.converged_ = True
    return new_gmm
In [8]:
def get_gmm_info(inputRow):
    (location, tokens) = GMM.tokenize(inputRow, fields=g.options['fields'])
    true_lat, true_lon = location
    # Collect the per-token (gmm, error) models that exist in the trained model
    models = []
    for token in tokens:
        if token in g.model:
            models.append(g.model[token])
    errors = [m[1] for m in models]
    if len(errors) > 0:
        min_error = min(errors)
    else:
        min_error = None
    if len(models) > 1:
        combined_gmm = combine_gmms(models)
    elif len(models) == 1:
        combined_gmm = models[0][0]
    else:
        # No tokens matched the trained model; signal failure to the caller
        return (None, np.nan, np.nan, None)
    (best_lat, best_lon) = combined_gmm.means_[np.argmax(combined_gmm.weights_)]
    distance = haversine(best_lon, best_lat, true_lon, true_lat)
    return ((best_lat, best_lon), min_error, distance, combined_gmm)
In [9]:
def plot_gmm(gmm_model, true_ll=None, percent=None):
    plt.figure()
    # Build a 1-degree lat/lon grid covering the globe
    y = np.linspace(90, -90, num=180)
    x = np.linspace(-180, 180, num=360)
    X, Y = np.meshgrid(y, x)
    XX = np.array([X.ravel(), Y.ravel()]).T
    # Default: plot the negative log-likelihood surface
    #Z = np.log(-gmm_model.score_samples(XX)[0]+1)
    Z = -gmm_model.score_samples(XX)[0]
    if percent:
        # Keep only the smallest region containing `percent` of the probability mass
        Z = np.exp(gmm_model.score_samples(XX)[0])  # score_samples returns natural-log probabilities
        target = np.sum(Z) * percent
        z_sorted = sorted(Z, reverse=True)
        i = 0
        cumulative = 0
        while cumulative < target:
            cumulative += z_sorted[i]
            i += 1
        print 'Percent of the world:', float(XX[Z > z_sorted[i]].shape[0])/XX.shape[0]*100
        Z[Z < z_sorted[i]] = 0
        Z = -np.log(Z)
    Z = Z.reshape(X.shape)
    m = Basemap(projection='mill', lon_0=0)
    m.drawcountries()
    m.drawcoastlines()
    m.drawparallels(np.arange(-90., 120., 30.), labels=[1, 0, 0, 0])  # draw parallels
    m.drawmeridians(np.arange(0., 420., 60.), labels=[0, 0, 0, 1])    # draw meridians
    #cmap = mpl.cm.RdYlBu
    cmap = mpl.cm.pink
    norm = mpl.colors.Normalize(vmin=0, vmax=5)
    X, Y = m(Y, X)  # project (lon, lat) into map coordinates
    CS = m.contourf(X, Y, Z, 25, linewidths=1.5, cmap=cmap)  #, norm=norm)
    CB = plt.colorbar(CS, shrink=0.8, extend='both')
    # Plot the estimated location (mean of the highest-weighted component)
    (best_lat, best_lon) = gmm_model.means_[np.argmax(gmm_model.weights_)]
    best_x, best_y = m(best_lon, best_lat)
    plt.plot(best_x, best_y, '*g')
    # Plot the true location, if provided
    if true_ll:
        true_x, true_y = m(true_ll[1], true_ll[0])
        plt.plot(true_x, true_y, '*b')
    # Optionally mark every component mean (currently disabled)
    #for i in range(0, gmm_model.n_components):
    #    lat, lon = gmm_model.means_[i]
    #    weight = gmm_model.weights_[i]
    #    x, y = m(lon, lat)
    #    plt.plot(x, y, 'or')  #, markersize=6*weight)
    plt.show()
In [10]:
def plot_gmm_w_percentage(gmm_model, percent):
    # Build a 1-degree lat/lon grid covering the globe
    y = np.linspace(90, -90, num=180)
    x = np.linspace(-180, 180, num=360)
    X, Y = np.meshgrid(y, x)
    XX = np.array([X.ravel(), Y.ravel()]).T
    Z = np.exp(gmm_model.score_samples(XX)[0])
    # Keep only the smallest region containing `percent` of the probability mass
    target = np.sum(Z) * percent
    z_sorted = sorted(Z, reverse=True)
    i = 0
    cumulative = 0
    while cumulative < target:
        cumulative += z_sorted[i]
        i += 1
    print 'Percent of the world:', float(XX[Z > z_sorted[i]].shape[0])/XX.shape[0]*100
    Z[Z < z_sorted[i]] = 0
    Z = -np.log(Z)
    Z = Z.reshape(X.shape)
    m = Basemap(projection='mill', lon_0=0)
    m.drawcountries()
    m.drawcoastlines()
    m.drawparallels(np.arange(-90., 120., 30.), labels=[1, 0, 0, 0])  # draw parallels
    m.drawmeridians(np.arange(0., 420., 60.), labels=[0, 0, 0, 1])    # draw meridians
    #cmap = mpl.cm.RdYlBu
    cmap = mpl.cm.pink
    norm = mpl.colors.Normalize(vmin=0, vmax=5)
    X, Y = m(Y, X)  # project (lon, lat) into map coordinates
    CS = m.contourf(X, Y, Z, 25, linewidths=1.5, cmap=cmap)  #, norm=norm)
    CB = plt.colorbar(CS, shrink=0.8, extend='both')
    plt.show()
In [ ]:
# Create a combined GMM for a tweet, score its performance, and draw a contour plot
def plot_row(inputRow):
    (est_location, min_error, error_distance, combined_gmm) = get_gmm_info(inputRow)
    print_tweet(inputRow)
    print 'Estimated Location:', est_location
    print 'Error (km):', error_distance
    plot_gmm(combined_gmm, true_ll=inputRow.geo.coordinates)
In [15]:
plot_row(local_tweets[0])
In [56]:
plot_row(local_tweets[6])
In [ ]:
# Print all the tweets we've pulled into the local context
for i, t in enumerate(local_tweets):
    print i, t.text
In [37]:
# Compute local arrays of actual error and minimum training error
min_errors = []
actual_errors = []
skipped = 0
for i in range(len(local_tweets)):
    try:
        (est_location, min_error, error_distance, combined_gmm) = get_gmm_info(local_tweets[i])
        if min_error != 0 and error_distance != 0:
            min_errors.append(min_error)
            actual_errors.append(error_distance)
    except:
        skipped += 1
print 'Skipped %d of %d'%(skipped, len(local_tweets))
X-axis [km]: the error of the word with the minimum error in the training set. Error in the training set is defined as the median distance between the most likely point and every occurrence of that word in the training data.
Y-axis [km]: the distance between the most likely point and the actual point.
The second plot is a log-log plot with the same axes.
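As a point of reference, here is a minimal sketch of how the per-word training error described above could be computed; the helper name and the shape of the occurrences list are illustrative assumptions, not part of the trained model's API.
def word_training_error(most_likely_point, occurrences):
    # Hypothetical helper: median haversine distance (km) from the most likely
    # point to every (lat, lon) occurrence of the word in the training data
    best_lat, best_lon = most_likely_point
    distances = [haversine(best_lon, best_lat, lon, lat) for (lat, lon) in occurrences]
    return np.median(distances)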
In [49]:
plt.figure()
#plt.plot(np.log(min_errors), np.log(actual_errors), '.')
plt.plot(min_errors, actual_errors, '.')
plt.axis([0,3000,0,3000])
#plt.axis([0,np.log(3000),0,np.log(3000)])
#print min(actual_errors)
from scipy.stats import pearsonr
print pearsonr(min_errors, actual_errors)
print pearsonr(np.log(min_errors), np.log(actual_errors))
plt.figure()
plt.plot(np.log(min_errors), np.log(actual_errors), '.')
plt.axis([0,np.log(3000),0,np.log(3000)])
Out[49]:
In [173]:
percent_of_mass_to_include = 0.8
tweet = local_tweets[85]
(est_location, min_error, error_distance, combined_gmm) = get_gmm_info(tweet)
print_tweet(tweet)
print 'Estimated Location:', est_location
print 'Error (km):', error_distance
plot_gmm(combined_gmm, true_ll=tweet.geo.coordinates, percent=percent_of_mass_to_include)
In [ ]:
(est_location, min_error, error_distance, gmm_model) = get_gmm_info(local_tweets[5])
In [66]:
y = np.linspace(90,-90, num=180*4)
x = np.linspace(-180,180, num=360*4)
X, Y = np.meshgrid(y,x)
XX = np.array([X.ravel(), Y.ravel()]).T
Z = np.exp(model_the[0].score_samples(XX)[0])
In [67]:
print Z.shape
print np.sum(Z)*.25*.25  # Riemann sum over the ~0.25-degree grid; should be close to 1
In [61]:
model_the = g.model['nyc']  # note: despite the name, this holds the model for the token 'nyc'
model_the[0].weights_
Out[61]:
In [51]:
np.sum(model_the[0].weights_)
Out[51]:
In [45]:
len(model_the[0].weights_)
Out[45]:
In [46]:
# score_samples returns (log-probabilities, per-component responsibilities)
b = model_the[0].score_samples(XX)
In [47]:
b[1].shape
Out[47]:
In [48]:
b[1][1]
Out[48]:
In [49]:
print max(b[1][1])
print sum(b[1][1])
In [14]:
import statsmodels.sandbox.distributions.extras as ext
#documented at http://statsmodels.sourceforge.net/devel/_modules/statsmodels/sandbox/distributions/extras.html#mvnormcdf
In [15]:
def prob_mass(gmm_model, upper_bound, lower_bound):
    total_prob = 0
    for i in range(0, len(gmm_model.weights_)):
        val = ext.mvnormcdf(upper_bound, gmm_model.means_[i], gmm_model.covars_[i], lower_bound, maxpts=2000)
        # Guard against NaN: on rare occasions a Gaussian's CDF evaluates to NaN
        # (likely a component with exceedingly low probability)
        if not math.isnan(val):
            weighted_val = val * gmm_model.weights_[i]
            total_prob += weighted_val
    return total_prob
In [18]:
# A good sanity check: set the upper and lower bounds to the extent of the globe
# and verify that the total probability mass is approximately 1 (bounds are ordered [lat, lon])
upper = [90,180]
lower = [-90,-180]
print prob_mass(g.model['nyc'][0], upper, lower)
print prob_mass(g.model['the'][0], upper, lower)
In [19]:
# Probability mass within ~10 km of the most likely point (0.1 degrees is 11.132 km N/S)
gmm_model = g.model['nyc'][0]
distance = 10
distance_deg = distance/111.32
best_point = gmm_model.means_[np.argmax(gmm_model.weights_)]
print best_point, distance_deg
upper = [best_point[0]+distance_deg,best_point[1]+distance_deg]
lower = [best_point[0]-distance_deg,best_point[1]-distance_deg]
print gmm_model, upper, lower
print prob_mass(gmm_model, upper, lower)
In [17]:
# Compute local arrays of actual error and probability mass over ~100 km (about 62 miles)
probs = []
actual_errors = []
skipped = 0
distance = 100
distance_deg = distance/111.32
act_within = 0
cnt = 0
for i in range(0, len(local_tweets)):
    try:
        (est_location, min_error, error_distance, combined_gmm) = get_gmm_info(local_tweets[i])
        if min_error != 0 and error_distance != 0:
            best_point = combined_gmm.means_[np.argmax(combined_gmm.weights_)]
            upper = [best_point[0]+distance_deg, best_point[1]+distance_deg]
            lower = [best_point[0]-distance_deg, best_point[1]-distance_deg]
            mass = prob_mass(combined_gmm, upper, lower)
            if math.isnan(mass):
                skipped += 1
            else:
                actual_loc = local_tweets[i].geo.coordinates
                probs.append(mass)
                actual_errors.append(error_distance)
                cnt += 1
                # Check whether the actual location falls within the same box
                if (best_point[0]-distance_deg < actual_loc[0] < best_point[0]+distance_deg and
                        best_point[1]-distance_deg < actual_loc[1] < best_point[1]+distance_deg):
                    act_within += 1
    except:
        skipped += 1
correct_ratio = float(act_within)/float(cnt)*100
average_prob = np.average(probs)*100
print 'Skipped %d of %d'%(skipped, len(local_tweets))
print '%.1f pct of tweets fell within the radius; average probability mass: %.1f pct'%(correct_ratio, average_prob)
In [ ]:
# Sweep a range of radii; for each, compute the fraction of tweets whose true location
# falls in the box around the most likely point and the average probability mass in that box
radius = [25, 50, 75, 100, 125, 150, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, 1000]
correct = []
average = []
for r in radius:
    probs = []
    actual_errors = []
    skipped = 0
    distance = r
    distance_deg = distance/111.32
    act_within = 0
    cnt = 0
    for i in range(0, len(local_tweets)):
        try:
            (est_location, min_error, error_distance, combined_gmm) = get_gmm_info(local_tweets[i])
            if min_error != 0 and error_distance != 0:
                best_point = combined_gmm.means_[np.argmax(combined_gmm.weights_)]
                upper = [best_point[0]+distance_deg, best_point[1]+distance_deg]
                lower = [best_point[0]-distance_deg, best_point[1]-distance_deg]
                mass = prob_mass(combined_gmm, upper, lower)
                if math.isnan(mass):
                    skipped += 1
                else:
                    actual_loc = local_tweets[i].geo.coordinates
                    probs.append(mass)
                    actual_errors.append(error_distance)
                    cnt += 1
                    # Check whether the actual location falls within the same box
                    if (best_point[0]-distance_deg < actual_loc[0] < best_point[0]+distance_deg and
                            best_point[1]-distance_deg < actual_loc[1] < best_point[1]+distance_deg):
                        act_within += 1
        except:
            skipped += 1
    correct_ratio = float(act_within)/float(cnt)*100
    average_prob = np.average(probs)*100
    correct.append(correct_ratio)
    average.append(average_prob)
    #print 'Skipped %d of %d'%(skipped, len(local_tweets))
    print '%.1f pct of tweets fell within a %d km radius; average probability mass: %.1f pct'%(correct_ratio, r, average_prob)
In [27]:
plt.plot(radius, correct)
plt.plot(radius, average, 'g')
Out[27]:
In [24]:
correct
Out[24]:
In [28]:
print probs
print np.average(probs)
print actual_errors
In [192]:
plt.figure()
#plt.plot(np.log(min_errors), np.log(actual_errors), '.')
plt.plot(probs, actual_errors, '.')
plt.axis([0,1,0,3000])
#plt.axis([0,np.log(3000),0,np.log(3000)])
#print min(actual_errors)
from scipy.stats import pearsonr
print pearsonr(probs, actual_errors)
print pearsonr(probs, np.log(actual_errors))
plt.figure()
plt.plot(np.log(probs), np.log(actual_errors), '.')
#plt.axis([0,np.log(.75),0,np.log(3000)])
Out[192]:
In [193]:
print min(probs), max(probs), np.average(probs)
In [194]:
(est_location, min_error, error_distance, combined_gmm) = get_gmm_info(local_tweets[85])
In [204]:
best_point = combined_gmm.means_[np.argmax(combined_gmm.weights_)]
upper = [best_point[0]+distance_deg,best_point[1]+distance_deg]
lower = [best_point[0]-distance_deg,best_point[1]-distance_deg]
mass = prob_mass(combined_gmm, upper, lower)
print mass
In [202]:
total_prob = 0
for i in range(0, len(combined_gmm.weights_)):
    val = ext.mvnormcdf(upper, combined_gmm.means_[i], combined_gmm.covars_[i], lower, maxpts=2000)
    weighted_val = val * combined_gmm.weights_[i]
    total_prob += weighted_val
    # Report any components whose CDF evaluates to NaN
    if math.isnan(val):
        print i, val, total_prob
In [133]:
plt.hist(probs)
Out[133]: