In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

In [3]:
# Load the raw training set; the path is relative to the notebook's directory.
train = pd.read_csv('../data/raw/train.csv')

# Function-call form of print: valid on Python 3 and prints the same tuple
# text on Python 2, unlike the Python 2-only `print train.shape` statement.
print(train.shape)


(29118021, 6)

In [4]:
# Summarise the index range, column dtypes and total memory footprint.
# memory_usage='deep' asks pandas for a real memory measurement instead of
# a quick estimate.
train.info(memory_usage='deep')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29118021 entries, 0 to 29118020
Data columns (total 6 columns):
row_id      int64
x           float64
y           float64
accuracy    int64
time        int64
place_id    int64
dtypes: float64(2), int64(4)
memory usage: 1.3 GB

In [5]:
# Reorder the rows by the `time` column, smallest value first
# (ascending order is sort_values' default, so it is not spelled out).
train = train.sort_values('time')

In [6]:
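# TODO: derive a datetime column from `time`.
# NOTE(review): the draft below is disabled. It refers to `dataset`, which is
# not defined in the visible cells (the loaded frame is `train`), and
# unit='ms' is unverified -- confirm the unit of `time` before enabling it.
#dataset['InstallDate'] = pd.to_datetime(dataset['time'], unit='ms')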

In [7]:
# Explicit positional slice: the rows at positions 1 through 9.
# NOTE(review): position 0 is skipped -- use train.head(10) if the first
# row should be shown as well.
train.iloc[1:10]


Out[7]:
row_id x y accuracy time place_id
23020268 23020268 5.3673 5.6998 25 1 7449653826
3977976 3977976 3.1148 3.6161 8 1 8090429732
27743788 27743788 3.7725 6.4504 65 2 4115006237
3836334 3836334 3.5102 3.6296 8 3 8090429732
5714375 5714375 6.5795 6.9724 6 3 9359956624
15989877 15989877 9.6108 0.3647 50 3 1159242730
23744019 23744019 9.3776 9.7209 68 3 3382046185
23256065 23256065 0.2581 8.4232 169 4 9975915421
27036356 27036356 9.5923 4.1751 9 4 9277108143

In [7]:
# Display the last five rows of the frame (five is tail's default).
train.tail()


Out[7]:
row_id x y accuracy time place_id
726802 726802 0.1422 6.1894 63 786239 7811482171
22118299 22118299 5.6199 1.8609 165 786239 5164549189
13764839 13764839 5.2384 9.6714 85 786239 1291702704
19388897 19388897 5.4300 1.9423 35 786239 5796916425
11015770 11015770 5.6777 1.1298 60 786239 7588817182

In [8]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
train.describe()


Out[8]:
row_id x y accuracy time place_id
count 2.911802e+07 2.911802e+07 2.911802e+07 2.911802e+07 2.911802e+07 2.911802e+07
mean 1.455901e+07 4.999770e+00 5.001814e+00 8.284912e+01 4.170104e+05 5.493787e+09
std 8.405649e+06 2.857601e+00 2.887505e+00 1.147518e+02 2.311761e+05 2.611088e+09
min 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00 1.000016e+09
25% 7.279505e+06 2.534700e+00 2.496700e+00 2.700000e+01 2.030570e+05 3.222911e+09
50% 1.455901e+07 5.009100e+00 4.988300e+00 6.200000e+01 4.339220e+05 5.518573e+09
75% 2.183852e+07 7.461400e+00 7.510300e+00 7.500000e+01 6.204910e+05 7.764307e+09
max 2.911802e+07 1.000000e+01 1.000000e+01 1.033000e+03 7.862390e+05 9.999932e+09

In [ ]:
# Draw each column's distribution on its own figure. Consecutive distplot
# calls in one cell otherwise share a single implicit axes, which would put
# columns spanning 0-10 (x, y) and ~1e9-1e10 (place_id) on the same scale.
# Each column appears once (the original cell plotted 'accuracy' twice).
# NOTE(review): the frame has ~29M rows, so the default KDE fit may be slow --
# consider kde=False or plotting a sample if this cell takes too long.
for column in ['x', 'y', 'accuracy', 'time', 'place_id']:
    fig, ax = plt.subplots()
    sb.distplot(train[column], ax=ax)
    ax.set_title('Distribution of ' + column)

In [9]:
# Hex-binned joint plot of the `x` and `y` columns, with marginal
# distributions, rendered under seaborn's "white" axes style.
with sb.axes_style('white'):
    sb.jointplot(x='x', y='y', data=train, kind='hex', color='k');



In [10]:
# Hex-binned joint plot of `accuracy` (horizontal) against `time` (vertical),
# rendered under seaborn's "white" axes style.
with sb.axes_style('white'):
    sb.jointplot(x='accuracy', y='time', data=train, kind='hex', color='k');



In [ ]:
# Hex-binned joint plot of `place_id` (horizontal) against `accuracy`
# (vertical), rendered under seaborn's "white" axes style.
with sb.axes_style('white'):
    sb.jointplot(x='place_id', y='accuracy', data=train, kind='hex', color='k');

In [ ]: