In [ ]:
import os
import pandas
import tarfile
import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers[0].setFormatter(logging.Formatter('%(asctime)s: %(message)s'))

locations = {
    'gfs': '/l/cnets/datasets/Telecom_BDC_2015',
    'diskstation': '/media/diskstation/Datasets/Telecom Big Data Challenge 2015',
    'data': os.path.expanduser('~/data/tbdc15'),
    'hdd': '/media/giovanni/Multimedia/Datasets/Telecom Big Data Challenge 2015',
    'repo': os.path.expanduser('~/repos/tbdc15')
}


def getpaths(city, loc='gfs', boxesloc=None, storeloc=None):
    root = locations[loc]
    city_codes = {'RO': 'RM'}
    code = city[:2].upper()
    if code in city_codes:
        code = city_codes[code]
    paths = {
        'trips': '{root}/infoblu/{city}.tar.gz'.format(root=root, city=city),
        'accidents': '{root}/unipol/BDC2015_UnipolsaiClaims2014_{city}.csv'.format(root=root, city=code),
        'boxes': '{root}/city_boxes.csv'.format(root=root if boxesloc is None else locations[boxesloc]),
        'store': '{root}/trip_accidents_store.hdf'.format(root=root if storeloc is None else locations[storeloc])
    }
    return paths


def getbox(path, city):
    city_code = city[0].lower()
    df_box = pandas.read_csv(path, index_col='city')
    df_box.ix[city_code]
    box = df_box.ix[city_code].to_dict()
    return box


def read_trips(path, box, scale=1000.0, break_at=None):
    index_columns = ['i', 'j', 'weekday', 'hour']
    trips = pandas.DataFrame(columns=index_columns + ['trips', 'trips_start']).set_index(index_columns)

    # set break_at to an integer and it will stop exactly after that number of iterations
    i = 0

    with tarfile.open(path, mode='r:gz') as tf:
        # open tar file in random access mode with on-the-fly gzip decompression
        for member in tf:
            if break_at is not None and i == break_at:
                break
            i += 1

            # read contents of TAR archive. Each file in the archive contains 
            # the data of a different day.
            logger.info(member.name)
            f = tf.extractfile(member)

            # do not use the "type" and "speed" columns, since we don't need them. This saves memory.
            df = pandas.read_csv(f, 
                                 names=['trip', 'timestamp', 'lat', 'lon', 'type', 'speed'],
                                 usecols=['trip', 'timestamp', 'lat', 'lon'],
                                 sep=';', 
                                 parse_dates=['timestamp'])

            # compute the cell, weekday, and hour
            df['i'] = ((df['lat'] - box['lat_min']) * scale).round()
            df['j'] = ((df['lon'] - box['lon_min']) * scale).round()
            df['weekday'] = df['timestamp'].map(pandas.Timestamp.weekday)
            df['hour'] = df['timestamp'].map(lambda k: k.hour)

            # count how many trips in each cell, weekday, hour and append. 
            # Note that the first group-by returns a series object, 
            # and we wrap this into a DataFrame.        
            s1 = df.filter(index_columns).groupby(index_columns).apply(len)

            # do the same but only considering the first frame of each trip.
            df_ff = df.groupby('trip', as_index=False).head(1)
            s2 = df_ff.filter(index_columns).groupby(index_columns).apply(len)

            df = pandas.DataFrame({'trips': s1, 'trips_start': s2})
            
            trips = trips.append(df)

    return trips


def read_accidents(path, box, scale=1000.0):
    index_columns = ['i', 'j', 'weekday', 'hour']    
    df = pandas.read_csv(path)
    df.rename(columns={'day_type': 'weekday', 'time_range': 'hour'}, inplace=True)
    df['i'] = ((df['latitude'] - box['lat_min']) * scale).round()
    df['j'] = ((df['longitude'] - box['lon_min']) * scale).round()
    s = df.groupby(index_columns).apply(len)
    accidents = pandas.DataFrame({'accidents': s})
    return accidents


def make_city_frame(city, 
                    loc='frosty', 
                    boxesloc='frosty',
                    storeloc='frosty',
                    scale=1000.0, 
                    break_at=None):
    """
    Reads data of trips and accidents and store data frame into HDF format
    """
    paths = getpaths(city, loc=location, boxesloc=boxesloc, storeloc=storeloc)
    box = getbox(paths['boxes'], city)
    logger.info("Reading trips...")
    trips = read_trips(paths['trips'], box, scale=scale, break_at=break_at)
    logger.info("Reading accidents...")
    accidents = read_accidents(paths['accidents'], box, scale=scale)
    logger.info("Storing data...")
    joined_df = trips.join(accidents).fillna(0).reset_index()
    joined_df.to_hdf(paths['store'], city, complib='blosc', complevel=6)
    logger.info("Data saved to HDF:".format(paths['store']))

Create dataset

Select city


In [ ]:
cities = ['bari', 'milano', 'napoli', 'palermo', 'roma', 'torino', 'venezia']
location = 'gfs'
boxes_location = 'data'
store_location = 'data'
scale = 1000.0

Read data

Use the following to remove an existing store file, if needed. (Use C-M y to make the cell runnable).

rm -f /u/gciampag/data/tbdc15/trip_accidents_store.hdf


In [ ]:
ll -h ~/data/tbdc15/

In [ ]:
for city in cities:
    logging.info("City: {}".format(city))
    make_city_frame(city, 
                    loc=location, 
                    scale=scale, 
                    boxesloc=boxes_location,
                    storeloc=store_location, 
                    break_at=None)

Plot the data


In [ ]:
%matplotlib inline
import numpy
from pylab import *

# scatter plot
def scatter_trips_accidents(df, city, use_trips_starts=False):
    fig = figure()
    if use_trips_starts:
        xcol = 'trips_start'
    else:
        xcol = 'trips'
            
    df.plot(x=xcol, y='accidents', kind='scatter', marker='x', alpha=.2, color='k', fig=fig)

    # trend line
    emin = numpy.log10(df[xcol].min())
    emax = numpy.log10(df[xcol].max())
    bins = numpy.logspace(max(emin, 0), emax, 20)
    print bins
    df.groupby(numpy.digitize(df[xcol], bins=bins)).mean().plot(x=xcol, y='accidents',
                                                                color='r', linestyle='solid',
                                                                marker='o', ax=gca(), alpha=.5, 
                                                                linewidth=2, fig=fig)
    grid('off')
    title(city)
    if use_trips_starts:
        xlabel('Traffic (start of trip)')
    else:
        xlabel('Traffic')
    ylabel('Accidents')
    xscale('log')
    xlim(1, xlim()[1])
    tight_layout()
    legend()
    savefig('trips_accidents_scatter_{}.pdf'.format(city))
    savefig('trips_accidents_scatter_{}.png'.format(city))
    show()


def hist_accidents(df, city):
    fig = figure()
    ax = gca()
    ax.hist(df['accidents'].values, log=True, bins=60, color='white')
    ylim(.1, ylim()[1])
    xlabel('Accidents')
    ylabel('Frequency')
    title(city)
    tight_layout()
    legend()
    savefig('accidents_histogram_{}.pdf'.format(city))
    savefig('accidents_histogram_{}.png'.format(city))
    show()
    
    
def plot_all(city):
    paths = getpaths(city, 
                     loc=location, 
                     boxesloc=boxes_location,
                     storeloc=store_location)
    df = pandas.read_hdf(paths['store'], city)
    df = df.groupby(['i', 'j']).sum().filter(['trips', 'trips_start', 'accidents'])
    scatter_trips_accidents(df, city)
    scatter_trips_accidents(df, city, use_trips_starts=True)
    hist_accidents(df, city)

In [ ]:
plot_all('bari')

In [ ]:
plot_all('milano')

In [ ]:
plot_all('napoli')

In [ ]:
plot_all('palermo')

In [ ]:
plot_all('roma')

In [ ]:
plot_all('torino')

In [ ]:
plot_all('venezia')

Scatter plot of trips vs trip starts

df.plot(x='trips_start', y='trips', kind='scatter', alpha=.2, marker='.') xscale('log') yscale('log') xlim(5e-1, 1e5) xlabel('Trip starts') ylabel('Trips') title(city) savefig("trips_trips_starts_scatter_{}_{}.pdf".format(city, scale)) savefig("trips_trips_starts_scatter_{}_{}.png".format(city, scale))

Distributions of accidents

Load the data


In [ ]:
city = 'palermo'
df = pandas.read_hdf('/u/gciampag/data/tbdc15/trip_accidents_store.hdf', city)
df = df.groupby(['i', 'j']).sum().filter(['trips', 'trips_start', 'accidents'])

Histogram grouping data $>0$ in bins of size $9$


In [ ]:
bin_size = 9  # lower bound on bin size
max_accidents = df['accidents'].max()
start = 1
stop = 1 + ceil((max_accidents - 1) / bin_size) * bin_size
num = (stop - start) / bin_size + 1
bins = numpy.linspace(start, stop, num, endpoint=True)
bins = numpy.hstack([[0,], bins])
nh, __, ___ = hist(df['accidents'].values, bins=bins, color='lightgray', log=True, normed=True, histtype='bar')
xlim(-5, stop)
ylim(1e-7, 1)
xlabel('Accidents')
ylabel('Frequency')
title(city.title())
tick_params(axis='both', direction='out', which='both')
tick_params(axis='x', which='minor', bottom='on', top='off')
tick_params(axis='y', which='both', right='off')
tick_params(axis='x', which='major', top='off')
tick_params(axis='x', which='minor', bottom='on')

Fit the $>0$ data with an exponential law (with binning), and geometric distribution


In [ ]:
from scipy.stats import expon, geom, poisson

group_size = 9

df_nza = df.query('accidents > 0')
a = df_nza.groupby(ceil(df_nza['accidents'] / group_size)).count()['accidents']

x = a.index.values

p = a / a.sum()
vlines(x, 0, p, color='LightGray')
plot(x, p, 'wo ', label='Data', ms=8)

# expected number of accidents (computed as a weighted average of the frequencies)
exp_accidents = np.sum(p.values * a.index.values)

#x = np.hstack([[0]])

rv = expon(loc=0, scale=exp_accidents ** -1)
plot(x, rv.cdf(x + 1) - rv.cdf(x), 'x ', color='k', mew=2, label='Exponential')

rv = geom(exp_accidents ** -1, loc=0)
plot(x, rv.pmf(x), '+ ', color='gray', mew=1.5, ms=10, label='Geometric')

rv = poisson(exp_accidents ** -1, loc=0)
plot(x, rv.pmf(x), marker=(6, 2, 0), ls=' ', color='LightGray', mew=1, ms=10, label='Poisson')


xlim(0, xlim()[1] + 1)
xlabel(r'$\left\lceil\rm{Accidents} \,/\, %d\right\rceil$' % group_size, fontsize='large')
ylabel('Probability')
yscale('log')
ylim(ylim()[0], 2)
title(city.title())
legend(loc='best', frameon=False)
tick_params(axis='both', direction='out', which='both')
tick_params(axis='x', which='minor', bottom='on', top='off')
tick_params(axis='y', which='both', right='off')
tick_params(axis='x', which='major', top='off')
tick_params(axis='x', which='minor', bottom='on')
savefig("accidents_grouped_fit_{}.png".format(city))

Zero-inflated geometric distribution


In [ ]:
import numpy
from scipy.stats import rv_discrete

class zigeom_gen(rv_discrete):
    def _pmf(self, k, pi, p):
        s = numpy.sign(k)
        return (1 - s) * (pi + (1.0 - pi) * p) + s * (1.0 - pi) * (1.0 - p) ** k * p
            
zigeom = zigeom_gen()

Simulate from a zero-inflated geometric


In [ ]:
from scipy.optimize import minimize

def fit(data):
    def _llk(args):
        return - zigeom(*args).logpmf(data).sum()
    N = float(len(data))
    pi0 = (data == 0).sum() / N
    x0 = (pi0, .5)
    ret = minimize(_llk, x0, method='Nelder-Mead')
    if ret['success']:
        return ret['x']
    else:
        raise RuntimeError(ret['message'])

pi = .2
p = .3
data = zigeom(pi, p).rvs(size=1000)
print fit(data)

In [ ]:
p = 0.1
pi = 0.5
data = zigeom(pi, p).rvs(size=2000)
data_max = data.max()
hist(data, bins=data_max, color='white', log=True, normed=True)
pih, ph = fit(data)
x = np.arange(data_max)
px = zigeom(pih, ph).pmf(x)
plot(x + .5, px, 'r-')
title('$\pi = {}, p = {}$'.format(pi, p))

Compare the trip distribution for cells with $0$ accidents with a normal cell (with $\ge 0$ accidents)


In [ ]:
import numpy
from truthy_measure.plotting import plot_pdf_log2, plot_cdf

trips_all = df['trips'].values
trips_zac = df.query('accidents == 0')['trips'].values
num_points = 20
bins = numpy.logspace(0, numpy.log2(trips_all.max()), num=num_points, base=2)
hist_all, __ = numpy.histogram(trips_all, bins=bins, normed=True)
hist_zac, __ = numpy.histogram(trips_zac, bins=bins, normed=True)
plot(bins[1:], numpy.log(hist_zac) - numpy.log(hist_all), 'ko:', mfc='LightGray')
axhline(0, ls='--', color='gray', lw=2)
xscale('log')
xlabel('Trips $t$')
ylabel('$\log\Pr\{T = t | A = 0\} - \log\Pr\{T\}$')
yb = max(numpy.abs(ylim()))
ylim(-yb, yb)
title(city.title())
tight_layout()
savefig("logratio_{}.png".format(city))

In [ ]: