In [ ]:
import os
import pandas
import tarfile
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers[0].setFormatter(logging.Formatter('%(asctime)s: %(message)s'))
locations = {
'gfs': '/l/cnets/datasets/Telecom_BDC_2015',
'diskstation': '/media/diskstation/Datasets/Telecom Big Data Challenge 2015',
'data': os.path.expanduser('~/data/tbdc15'),
'hdd': '/media/giovanni/Multimedia/Datasets/Telecom Big Data Challenge 2015',
'repo': os.path.expanduser('~/repos/tbdc15')
}
def getpaths(city, loc='gfs', boxesloc=None, storeloc=None):
root = locations[loc]
city_codes = {'RO': 'RM'}
code = city[:2].upper()
if code in city_codes:
code = city_codes[code]
paths = {
'trips': '{root}/infoblu/{city}.tar.gz'.format(root=root, city=city),
'accidents': '{root}/unipol/BDC2015_UnipolsaiClaims2014_{city}.csv'.format(root=root, city=code),
'boxes': '{root}/city_boxes.csv'.format(root=root if boxesloc is None else locations[boxesloc]),
'store': '{root}/trip_accidents_store.hdf'.format(root=root if storeloc is None else locations[storeloc])
}
return paths
def getbox(path, city):
city_code = city[0].lower()
df_box = pandas.read_csv(path, index_col='city')
df_box.ix[city_code]
box = df_box.ix[city_code].to_dict()
return box
def read_trips(path, box, scale=1000.0, break_at=None):
index_columns = ['i', 'j', 'weekday', 'hour']
trips = pandas.DataFrame(columns=index_columns + ['trips', 'trips_start']).set_index(index_columns)
# set break_at to an integer and it will stop exactly after that number of iterations
i = 0
with tarfile.open(path, mode='r:gz') as tf:
# open tar file in random access mode with on-the-fly gzip decompression
for member in tf:
if break_at is not None and i == break_at:
break
i += 1
# read contents of TAR archive. Each file in the archive contains
# the data of a different day.
logger.info(member.name)
f = tf.extractfile(member)
# do not use the "type" and "speed" columns, since we don't need them. This saves memory.
df = pandas.read_csv(f,
names=['trip', 'timestamp', 'lat', 'lon', 'type', 'speed'],
usecols=['trip', 'timestamp', 'lat', 'lon'],
sep=';',
parse_dates=['timestamp'])
# compute the cell, weekday, and hour
df['i'] = ((df['lat'] - box['lat_min']) * scale).round()
df['j'] = ((df['lon'] - box['lon_min']) * scale).round()
df['weekday'] = df['timestamp'].map(pandas.Timestamp.weekday)
df['hour'] = df['timestamp'].map(lambda k: k.hour)
# count how many trips in each cell, weekday, hour and append.
# Note that the first group-by returns a series object,
# and we wrap this into a DataFrame.
s1 = df.filter(index_columns).groupby(index_columns).apply(len)
# do the same but only considering the first frame of each trip.
df_ff = df.groupby('trip', as_index=False).head(1)
s2 = df_ff.filter(index_columns).groupby(index_columns).apply(len)
df = pandas.DataFrame({'trips': s1, 'trips_start': s2})
trips = trips.append(df)
return trips
def read_accidents(path, box, scale=1000.0):
index_columns = ['i', 'j', 'weekday', 'hour']
df = pandas.read_csv(path)
df.rename(columns={'day_type': 'weekday', 'time_range': 'hour'}, inplace=True)
df['i'] = ((df['latitude'] - box['lat_min']) * scale).round()
df['j'] = ((df['longitude'] - box['lon_min']) * scale).round()
s = df.groupby(index_columns).apply(len)
accidents = pandas.DataFrame({'accidents': s})
return accidents
def make_city_frame(city,
loc='frosty',
boxesloc='frosty',
storeloc='frosty',
scale=1000.0,
break_at=None):
"""
Reads data of trips and accidents and store data frame into HDF format
"""
paths = getpaths(city, loc=location, boxesloc=boxesloc, storeloc=storeloc)
box = getbox(paths['boxes'], city)
logger.info("Reading trips...")
trips = read_trips(paths['trips'], box, scale=scale, break_at=break_at)
logger.info("Reading accidents...")
accidents = read_accidents(paths['accidents'], box, scale=scale)
logger.info("Storing data...")
joined_df = trips.join(accidents).fillna(0).reset_index()
joined_df.to_hdf(paths['store'], city, complib='blosc', complevel=6)
logger.info("Data saved to HDF:".format(paths['store']))
In [ ]:
cities = ['bari', 'milano', 'napoli', 'palermo', 'roma', 'torino', 'venezia']
location = 'gfs'
boxes_location = 'data'
store_location = 'data'
scale = 1000.0
Use the following to remove an existing store file, if needed. (Use C-M y
to make the cell runnable).
rm -f /u/gciampag/data/tbdc15/trip_accidents_store.hdf
In [ ]:
ll -h ~/data/tbdc15/
In [ ]:
for city in cities:
logging.info("City: {}".format(city))
make_city_frame(city,
loc=location,
scale=scale,
boxesloc=boxes_location,
storeloc=store_location,
break_at=None)
In [ ]:
%matplotlib inline
import numpy
from pylab import *
# scatter plot
def scatter_trips_accidents(df, city, use_trips_starts=False):
fig = figure()
if use_trips_starts:
xcol = 'trips_start'
else:
xcol = 'trips'
df.plot(x=xcol, y='accidents', kind='scatter', marker='x', alpha=.2, color='k', fig=fig)
# trend line
emin = numpy.log10(df[xcol].min())
emax = numpy.log10(df[xcol].max())
bins = numpy.logspace(max(emin, 0), emax, 20)
print bins
df.groupby(numpy.digitize(df[xcol], bins=bins)).mean().plot(x=xcol, y='accidents',
color='r', linestyle='solid',
marker='o', ax=gca(), alpha=.5,
linewidth=2, fig=fig)
grid('off')
title(city)
if use_trips_starts:
xlabel('Traffic (start of trip)')
else:
xlabel('Traffic')
ylabel('Accidents')
xscale('log')
xlim(1, xlim()[1])
tight_layout()
legend()
savefig('trips_accidents_scatter_{}.pdf'.format(city))
savefig('trips_accidents_scatter_{}.png'.format(city))
show()
def hist_accidents(df, city):
fig = figure()
ax = gca()
ax.hist(df['accidents'].values, log=True, bins=60, color='white')
ylim(.1, ylim()[1])
xlabel('Accidents')
ylabel('Frequency')
title(city)
tight_layout()
legend()
savefig('accidents_histogram_{}.pdf'.format(city))
savefig('accidents_histogram_{}.png'.format(city))
show()
def plot_all(city):
paths = getpaths(city,
loc=location,
boxesloc=boxes_location,
storeloc=store_location)
df = pandas.read_hdf(paths['store'], city)
df = df.groupby(['i', 'j']).sum().filter(['trips', 'trips_start', 'accidents'])
scatter_trips_accidents(df, city)
scatter_trips_accidents(df, city, use_trips_starts=True)
hist_accidents(df, city)
In [ ]:
plot_all('bari')
In [ ]:
plot_all('milano')
In [ ]:
plot_all('napoli')
In [ ]:
plot_all('palermo')
In [ ]:
plot_all('roma')
In [ ]:
plot_all('torino')
In [ ]:
plot_all('venezia')
In [ ]:
city = 'palermo'
df = pandas.read_hdf('/u/gciampag/data/tbdc15/trip_accidents_store.hdf', city)
df = df.groupby(['i', 'j']).sum().filter(['trips', 'trips_start', 'accidents'])
In [ ]:
bin_size = 9 # lower bound on bin size
max_accidents = df['accidents'].max()
start = 1
stop = 1 + ceil((max_accidents - 1) / bin_size) * bin_size
num = (stop - start) / bin_size + 1
bins = numpy.linspace(start, stop, num, endpoint=True)
bins = numpy.hstack([[0,], bins])
nh, __, ___ = hist(df['accidents'].values, bins=bins, color='lightgray', log=True, normed=True, histtype='bar')
xlim(-5, stop)
ylim(1e-7, 1)
xlabel('Accidents')
ylabel('Frequency')
title(city.title())
tick_params(axis='both', direction='out', which='both')
tick_params(axis='x', which='minor', bottom='on', top='off')
tick_params(axis='y', which='both', right='off')
tick_params(axis='x', which='major', top='off')
tick_params(axis='x', which='minor', bottom='on')
In [ ]:
from scipy.stats import expon, geom, poisson
group_size = 9
df_nza = df.query('accidents > 0')
a = df_nza.groupby(ceil(df_nza['accidents'] / group_size)).count()['accidents']
x = a.index.values
p = a / a.sum()
vlines(x, 0, p, color='LightGray')
plot(x, p, 'wo ', label='Data', ms=8)
# expected number of accidents (computed as a weighted average of the frequencies)
exp_accidents = np.sum(p.values * a.index.values)
#x = np.hstack([[0]])
rv = expon(loc=0, scale=exp_accidents ** -1)
plot(x, rv.cdf(x + 1) - rv.cdf(x), 'x ', color='k', mew=2, label='Exponential')
rv = geom(exp_accidents ** -1, loc=0)
plot(x, rv.pmf(x), '+ ', color='gray', mew=1.5, ms=10, label='Geometric')
rv = poisson(exp_accidents ** -1, loc=0)
plot(x, rv.pmf(x), marker=(6, 2, 0), ls=' ', color='LightGray', mew=1, ms=10, label='Poisson')
xlim(0, xlim()[1] + 1)
xlabel(r'$\left\lceil\rm{Accidents} \,/\, %d\right\rceil$' % group_size, fontsize='large')
ylabel('Probability')
yscale('log')
ylim(ylim()[0], 2)
title(city.title())
legend(loc='best', frameon=False)
tick_params(axis='both', direction='out', which='both')
tick_params(axis='x', which='minor', bottom='on', top='off')
tick_params(axis='y', which='both', right='off')
tick_params(axis='x', which='major', top='off')
tick_params(axis='x', which='minor', bottom='on')
savefig("accidents_grouped_fit_{}.png".format(city))
In [ ]:
import numpy
from scipy.stats import rv_discrete
class zigeom_gen(rv_discrete):
def _pmf(self, k, pi, p):
s = numpy.sign(k)
return (1 - s) * (pi + (1.0 - pi) * p) + s * (1.0 - pi) * (1.0 - p) ** k * p
zigeom = zigeom_gen()
In [ ]:
from scipy.optimize import minimize
def fit(data):
def _llk(args):
return - zigeom(*args).logpmf(data).sum()
N = float(len(data))
pi0 = (data == 0).sum() / N
x0 = (pi0, .5)
ret = minimize(_llk, x0, method='Nelder-Mead')
if ret['success']:
return ret['x']
else:
raise RuntimeError(ret['message'])
pi = .2
p = .3
data = zigeom(pi, p).rvs(size=1000)
print fit(data)
In [ ]:
p = 0.1
pi = 0.5
data = zigeom(pi, p).rvs(size=2000)
data_max = data.max()
hist(data, bins=data_max, color='white', log=True, normed=True)
pih, ph = fit(data)
x = np.arange(data_max)
px = zigeom(pih, ph).pmf(x)
plot(x + .5, px, 'r-')
title('$\pi = {}, p = {}$'.format(pi, p))
In [ ]:
import numpy
from truthy_measure.plotting import plot_pdf_log2, plot_cdf
trips_all = df['trips'].values
trips_zac = df.query('accidents == 0')['trips'].values
num_points = 20
bins = numpy.logspace(0, numpy.log2(trips_all.max()), num=num_points, base=2)
hist_all, __ = numpy.histogram(trips_all, bins=bins, normed=True)
hist_zac, __ = numpy.histogram(trips_zac, bins=bins, normed=True)
plot(bins[1:], numpy.log(hist_zac) - numpy.log(hist_all), 'ko:', mfc='LightGray')
axhline(0, ls='--', color='gray', lw=2)
xscale('log')
xlabel('Trips $t$')
ylabel('$\log\Pr\{T = t | A = 0\} - \log\Pr\{T\}$')
yb = max(numpy.abs(ylim()))
ylim(-yb, yb)
title(city.title())
tight_layout()
savefig("logratio_{}.png".format(city))
In [ ]: