In [1]:
%matplotlib inline
import time
import pylab
import numpy as np
import pandas as pd
import pycupid.locations
In [5]:
# Load the scraped random-profile dump into a DataFrame.
people = pd.read_json('/Users/ajmendez/data/okcupid/random.json')
n_people = len(people)
print('Scraping archive found {:,d} random people'.format(n_people))
In [271]:
# Coerce locations to unicode strings (Python 2 `unicode`); the
# commented-out alternative would have blanked whitespace-only values.
locations = people['location'].astype(unicode)#.replace(r'\s+', np.nan, regex=True)
# A location is "good" when it is non-empty and does not contain the
# horizontal-ellipsis character (u'\u2026'), which marks a value that was
# truncated during scraping.
# NOTE(review): this relies on old pandas returning a Series from
# `.str.extract` with a single capture group — confirm on upgrade.
isgood = (locations.str.extract((u'(\u2026)')).isnull()) & (locations.str.len() > 0)
noriginal = len(locations.unique())  # unique location strings, good or bad
unique_locations = locations[isgood].unique()  # unique non-truncated locations
nlocations = len(unique_locations)
print('There are a total of {:,d} unique locations and {:,d} good ones'.format(noriginal, nlocations))
print(' > missing locations: {:0.1f}%'.format((noriginal-nlocations)*100.0/noriginal))
print(' > missing people: {:0.1f}%'.format((len(locations)-len(np.where(isgood)[0]))*100.0/len(locations)))
In [210]:
# The old DB's lat/lon notation is not picked up automatically, so the
# column names are assigned by hand after loading the cache.
cache_path = '/Users/ajmendez/data/okcupid/location_map.json'
location_map = pd.read_json(cache_path, orient='index')
location_map.columns = ['lat', 'lon']
print('Location cache contains {:,d} locations'.format(len(location_map)))
In [ ]:
# load v2:
# Replaces the v1 cache loaded above with the v2 cache (saved further
# below) — run this cell instead of the previous one when
# locations_v2.json already exists.
location_map = pd.read_json('/Users/ajmendez/data/okcupid/locations_v2.json', orient='index')
In [272]:
geonames = pycupid.locations.getGN()
inew = 0
for i, location in enumerate(unique_locations):
if location in location_map.index:
continue
print u'Getting location: {}'.format(location)
try:
loc, (lat, lon) = geonames.geocode(location.encode('utf8'))
except Exception as e:
print u' > Failed: {}'.format(location)
# raise e
# too many loc* names!
location_map.loc[location] = [lat,lon]
inew += 1
# give the API a bit of a break
time.sleep(0.2)
if inew > 1000:
break
print len(location_map)
In [269]:
location_map.to_json('/Users/ajmendez/data/okcupid/locations_v2.json', orient='index')
In [259]:
finished = []
for i, location in enumerate(location_map.index):
if location in finished:
continue
tmp = location_map.loc[location]
isloc = (locations == location)
people.loc[isloc, 'lat'] = tmp['lat']
people.loc[isloc, 'lon'] = tmp['lon']
people.loc[isloc, 'nloc'] = isloc.sum()
finished.append(location)
if (i%1000 == 0):
print i,
In [279]:
# better plots later, this is just a test
# Quick sanity-check scatter of the geocoded coordinates — should
# roughly resemble a world map if the geocoding worked.
people.plot('lon', 'lat', kind='scatter', s=2, lw=0, alpha=0.1)
Out[279]:
In [262]:
people.to_csv('/Users/ajmendez/data/okcupid/random_v2.csv', encoding='utf-8')
In [6]:
people = pd.read_csv('/Users/ajmendez/data/okcupid/random_v2.csv')
In [31]:
# First run of digits in each username (NaN when the username has none).
# NOTE(review): relies on old pandas returning a Series from
# `.str.extract` with a single capture group — confirm on upgrade.
tmp = people['username'].str.extract((u'(\d+)'))
# Numeric value of the digit run; NaN for usernames without digits.
people['username_number'] = tmp.apply(lambda x: int(x) if isinstance(x, (str, unicode)) else np.nan)
# Length of the digit run (0 when absent) — distinguishes e.g. '7' from '007'.
people['username_nlength'] = tmp.apply(lambda x: len(x) if isinstance(x, (str,unicode)) else 0)
In [32]:
people.to_csv('/Users/ajmendez/data/okcupid/random_v3.csv', encoding='utf-8')
In [29]:
# Substrings that flag a dinosaur-themed username.
names = ['dinosaur', 'saur','saurus', 'dino','jurassic', 'rex', 'sarus',
         'pterodactyl', 'archaeopter', 'pteranod', 'pterodact']
# Single alternation pattern: a username "has dino" when any substring
# matches its lowercased form (extract returns NaN on no match).
dino_pattern = u'({})'.format('|'.join(names))
people['hasdino'] = people['username'].str.lower().str.extract(dino_pattern).notnull()
In [30]:
people.to_csv('/Users/ajmendez/data/okcupid/random_v4.csv', encoding='utf-8')
In [5]:
# Round-trip the v2 checkpoint from CSV to JSON (orient='index' keyed by row).
people = pd.read_csv('/Users/ajmendez/data/okcupid/random_v2.csv')
people.to_json('/Users/ajmendez/data/okcupid/random_v2.json', orient='index')
In [ ]: