In [15]:
import os
import json
import datetime
import pprint
import math
API_KEY = "0f60efd9302d38f88380c6a4608bd9be"
dir_weather = "../dataset/weather"
file_business = "../dataset/yelp_academic_dataset_business.json"
file_users = "../dataset/yelp_academic_dataset_user.json"
file_reviews = "../dataset/yelp_academic_dataset_review.json"
file_users_centroid = "../dataset/user-centroid.json"
file_users_reviews = "../dataset/user-location.json"
file_feature_name = "feature-climate.data"
_business = {}
_userLocations = {}
_userReviews = {}
_weather = {}
In [16]:
def loadWeather():
f = os.listdir(dir_weather)
c = 0
for l in f:
if (l == '.DS_Store'):
continue
folder = dir_weather + '/' + l
files = os.listdir(folder)
for f in files:
if (f == '.DS_Store'):
continue
fo = open(folder + '/' + f)
for i in fo:
try:
j = json.loads(i)
except Exception, e:
print f
raise
data = j['daily']['data'][0]
d2 = datetime.datetime.utcfromtimestamp(int(data['time']))
w = l + '-' + d2.strftime('%Y-%m-%d')
try:
_weather[w.lower()] = '%s temperatureMin:%f temperatureMax:%f' % (data['icon'], data['temperatureMin'], data['temperatureMax'])
except Exception, e:
print 'Local: %s - Data: %s' % (l, d2.strftime('%d/%m/%Y %H:%M:%S %Z'))
# if (datetime.datetime.now() >= d2):
# print json.dumps(j)
# raise
In [17]:
def loadUserReviews():
global _userReviews
_userReviews = {}
f = open(file_users_reviews)
_userReviews = json.load(f)
xr = {}
c = 0
for k in _userReviews.keys():
c += 1
u = _userReviews[k]
b = []
for r in u['reviews']:
bid = r['business_id']
b.append(bid)
qtd = len(list(set(b)))
q = 0
if (qtd in xr):
q = xr[qtd] + 1
else:
q = 1
xr[qtd] = q
pprint.pprint(xr)
print 'total: %d' % c
In [18]:
def getUserLocations():
global _userLocations
_userLocations = {}
f = open(file_users_centroid)
for line in f:
j = json.loads(line)
u = {}
u['lat'] = j['lat']
u['lng'] = j['lng']
_userLocations[j['user_id']] = u
def getUserAttributes():
f = open(file_users)
xr = {}
for line in f:
j = json.loads(line)
#xr[j['user_id']] = 'user_id__%s average_stars:%s review_counts:%s' % (j['user_id'], str(j['average_stars']), str(j['review_count']))
xr[j['user_id']] = 'average_stars:%s review_counts:%s' % (str(j['average_stars']), str(j['review_count']))
f.close()
return xr
In [19]:
#Item attributes
def getBusinessAttributes():
global _business
_business = {}
f = open(file_business)
xr = {}
for line in f:
j = json.loads(line)
_business[j['business_id']] = j
cat = ' '.join(getCategories(j['categories']))
attr = ' '.join(getAttributes(j['attributes'], None))
#xr[j['business_id']] = 'business_id__%s open:%d stars:%d %s %s' % (j['business_id'], int(j['open']), j['stars'], cat.lower(), attr.lower())
xr[j['business_id'] + '-attr'] = 'stars:%d %s' % (j['stars'], attr.lower())
xr[j['business_id'] + '-cat'] = cat.lower()
f.close()
return xr
In [20]:
#Item categories
def getCategories(attr):
result = []
for k in attr:
result.append(k.replace(" ", "-"))
return result
#Item attributes
def getAttributes(attrs, attr):
result = []
for a in attrs:
key = a
value = attrs[a]
if (type(value) is dict):
result.extend(getAttributes(value, key))
else:
if (attr is not None):
key = attr + '_' + key
v = value
if (type(value) is unicode):
value = key + '__' + v
elif(type(value) is bool):
v = str(int(attrs[a]))
value = key + ':' + v
else:
value = key + ':' + str(v)
value = value.replace(" ", "-")
result.append(value)
return result
In [21]:
#Context attributes
def getContextAttributes(review):
#temporal
temporal = getTemporalAttributes(review)
#weather
weather = getWeatherAttributes(review)
#distance
distance = getDistanceAttributes(review['business_id'], review['user_id'])
attributes = '%s %s %s' % (temporal, distance, weather)
return attributes.lower()
def getTemporalAttributes(review):
date = datetime.datetime.strptime(review['date'], '%Y-%m-%d')
dayofweek = date.strftime('%A').lower()
weekday = int((date.isoweekday() < 6))
weekend = int((date.isoweekday() >= 6))
month = date.strftime('%B').lower()
season = getSeason(date, 'north')
context = '%s weekday:%d weekend:%d %s %s %s' % (dayofweek, weekday, weekend, month, season, review['date'])
return context
In [22]:
_city = []
def getWeatherAttributes(review):
b = _business[review['business_id']]
name = b['city']
name = name.strip()
weatherName = name + '-' + review['date']
weatherName = weatherName.replace('"', '').lower()
v = 'nothing'
try:
v = _weather[weatherName]
except Exception, e:
if ('las vegas' in weatherName):
weatherName = 'las vegas' + '-' + review['date']
v = _weather[weatherName]
else:
print 'error'
v = 'not-found'
_city.append(weatherName)
finally:
return v
In [23]:
def getSeason(date, hemisphere):
md = date.month * 100 + date.day
if ((md > 320) and (md < 621)):
s = 0 #spring
elif ((md > 620) and (md < 923)):
s = 1 #summer
elif ((md > 922) and (md < 1223)):
s = 2 #fall
else:
s = 3 #winter
if not hemisphere == 'north':
s = (s + 2) % 3
if (s == 0):
return 'spring'
elif (s == 1):
return 'summer'
elif (s == 2):
return 'fall'
else:
return 'winter'
In [24]:
def getDistanceAttributes(business, user):
lat1 = _userLocations[user]['lat']
lng1 = _userLocations[user]['lng']
lat2 = _business[business]['latitude']
lng2 = _business[business]['longitude']
distance = distance_on_unit_sphere(lat1, lng1, lat2, lng2)
km = distance / 1000
if (km <= 5):
return 'near'
elif (km > 5 and km <= 20):
return 'medium'
else:
return 'far'
In [25]:
def feq(a,b):
if abs(a-b)<0.00000001:
return 1
else:
return 0
def distance_on_unit_sphere(lat1, long1, lat2, long2):
try:
if (feq(lat1, lat2) and feq(long1, long2)):
return 0
# Convert latitude and longitude to
# spherical coordinates in radians.
degrees_to_radians = math.pi/180.0
# phi = 90 - latitude
phi1 = (90.0 - lat1)*degrees_to_radians
phi2 = (90.0 - lat2)*degrees_to_radians
# theta = longitude
theta1 = long1*degrees_to_radians
theta2 = long2*degrees_to_radians
# Compute spherical distance from spherical coordinates.
# For two locations in spherical coordinates
# (1, theta, phi) and (1, theta, phi)
# cosine( arc length ) =
# sin phi sin phi' cos(theta-theta') + cos phi cos phi'
# distance = rho * arc length
cos = (math.sin(phi1)*math.sin(phi2)*math.cos(theta1 - theta2) +
math.cos(phi1)*math.cos(phi2))
arc = math.acos( cos )
# Remember to multiply arc by the radius of the earth
# in your favorite set of units to get length.
distance = math.degrees(arc) # in degrees
distance = distance * 60 # 60 nautical miles / lat degree
distance = distance * 1852 # conversion to meters
distance = round(distance)
return distance
except:
print 'lat1: %f lng1: %f | lat2: %f lng2: %f' % (lat1, long1, lat2, long2)
raise
In [217]:
print 'reading users'
users = getUserAttributes()
getUserLocations()
items = getBusinessAttributes()
loadWeather()
In [250]:
user_tags = {}
f = open(file_reviews)
i = 0
for line in f:
item = json.loads(line)
date = datetime.datetime.strptime(item['date'], '%Y-%m-%d')
if date.year >= 2013:
user_id = item['user_id']
item_id = item['business_id']
review_id = item['review_id']
if user_id not in user_tags:
user_tags[user_id] = {}
user_tags[user_id]['categories'] = {}
user_tags[user_id]['attributes'] = {}
user_tags[user_id]['distances'] = {}
user_tags[user_id]['weather'] = {}
user_tags[user_id]['count'] = 1
else:
user_tags[user_id]['count'] += 1
#distances
distance = getDistanceAttributes(item_id, user_id)
distante = distance.lower()
if distance not in user_tags[user_id]['distances']:
user_tags[user_id]['distances'][distance] = 1
else:
user_tags[user_id]['distances'][distance] += 1
#weather
weather = getWeatherAttributes(item)
weather = weather.lower()
if " " in weather:
weather = weather[:weather.index(" ")].strip()
if weather not in user_tags[user_id]['weather']:
user_tags[user_id]['weather'][weather] = 1
else:
user_tags[user_id]['weather'][weather] += 1
#attributes
nItemAttr = items[item_id + '-attr']
nItemAttr = nItemAttr[8:].split(' ')
for attr in nItemAttr:
attr_split = attr.split(':')
if attr_split[0] == 'price-range':
attr_split[0] = attr_split[0] + '_' + attr_split[1]
if len(attr_split) > 1 and int(attr_split[1]) >= 1:
if attr_split[0] not in user_tags[user_id]['attributes']:
user_tags[user_id]['attributes'][attr_split[0]] = 1
else:
user_tags[user_id]['attributes'][attr_split[0]] += 1
#categories
nItemCat = items[item_id + '-cat']
nItemCat = nItemCat.split(' ')
for cat in nItemCat:
if cat not in user_tags[user_id]['categories']:
user_tags[user_id]['categories'][cat] = 1
else:
user_tags[user_id]['categories'][cat] += 1
print 'user_tags carregado'
In [251]:
fw = open('user-tags.data', 'w')
for ut in user_tags:
if user_tags[ut]['count'] >= 4:
fw.write('%s %s\n' % (ut, json.dumps(user_tags[ut])))
fw.close()
print 'arquivo gerado'
In [252]:
import json
f = open('user-tags.data')
max_tags = 0
reviews_count = 0
max_tags_id = ''
line = ''
attrs = []
for u in f:
uid = u[:u.index(' {')].strip()
attr = u[u.index('{'):]
item = json.loads(attr)
tags = len(item['attributes'].keys()) + len(item['categories'].keys())
fw = open('user-tags/' + uid + '.txt', 'w')
for a in item['attributes']:
attr = a.replace('_', ' ').replace('-', ' ')
fw.write('{text: "%s", weight: %d}\n' % (attr, item['attributes'][a]))
for d in item['distances']:
fw.write('{text: "distance-%s", weight: %d}\n' % (d, item['distances'][d]))
for w in item['weather']:
fw.write('{text: "%s", weight: %d}\n' % (w, item['weather'][w]))
for c in item['categories']:
cat = c.replace('_', ' ').replace('-', ' ')
fw.write('{text: "%s", weight: %d}\n' % (cat, 2*item['categories'][c]))
fw.close()
f.close()
print 'files generated by user'