In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

In [2]:
# Load the raw training data; the relative path is resolved against the
# kernel's current working directory.
train = pd.read_csv('../data/raw/train.csv')

# Parenthesised call form: valid as a Python 2 print statement and as the
# Python 3 print function, and prints the same (rows, columns) tuple in both.
print(train.shape)


(29118021, 6)

In [3]:
# Print the column dtypes and the frame's total memory footprint;
# memory_usage='deep' asks pandas for the deep (introspected) measurement.
train.info(memory_usage='deep')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29118021 entries, 0 to 29118020
Data columns (total 6 columns):
row_id      int64
x           float64
y           float64
accuracy    int64
time        int64
place_id    int64
dtypes: float64(2), int64(4)
memory usage: 1.3 GB

In [4]:
# Order the rows chronologically (ascending is the default). The original row
# labels are kept, so the index is no longer monotonic after this step.
train = train.sort_values('time')

In [5]:
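# TODO: derive a datetime column from `time`. The draft below is left disabled:
# it refers to `dataset`, which is not defined in the cells shown here (the
# frame loaded above is `train`), and the unit of `time` still needs confirming.
#dataset['InstallDate'] = pd.to_datetime(dataset['time'], unit='ms')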

In [6]:
# Show the first rows of the time-sorted frame, i.e. the earliest records.
train.head()


Out[6]:
row_id x y accuracy time place_id
15280869 15280869 2.9374 1.0010 433 1 7186804346
23020268 23020268 5.3673 5.6998 25 1 7449653826
3977976 3977976 3.1148 3.6161 8 1 8090429732
27743788 27743788 3.7725 6.4504 65 2 4115006237
3836334 3836334 3.5102 3.6296 8 3 8090429732

In [7]:
# Show the last rows of the time-sorted frame, i.e. the latest records.
train.tail()


Out[7]:
row_id x y accuracy time place_id
726802 726802 0.1422 6.1894 63 786239 7811482171
22118299 22118299 5.6199 1.8609 165 786239 5164549189
13764839 13764839 5.2384 9.6714 85 786239 1291702704
19388897 19388897 5.4300 1.9423 35 786239 5796916425
11015770 11015770 5.6777 1.1298 60 786239 7588817182

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric
# column. Note that row_id and place_id are summarised numerically as well.
train.describe()


Out[8]:
row_id x y accuracy time place_id
count 2.911802e+07 2.911802e+07 2.911802e+07 2.911802e+07 2.911802e+07 2.911802e+07
mean 1.455901e+07 4.999770e+00 5.001814e+00 8.284912e+01 4.170104e+05 5.493787e+09
std 8.405649e+06 2.857601e+00 2.887505e+00 1.147518e+02 2.311761e+05 2.611088e+09
min 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00 1.000016e+09
25% 7.279505e+06 2.534700e+00 2.496700e+00 2.700000e+01 2.030570e+05 3.222911e+09
50% 1.455901e+07 5.009100e+00 4.988300e+00 6.200000e+01 4.339220e+05 5.518573e+09
75% 2.183852e+07 7.461400e+00 7.510300e+00 7.500000e+01 6.204910e+05 7.764307e+09
max 2.911802e+07 1.000000e+01 1.000000e+01 1.033000e+03 7.862390e+05 9.999932e+09

In [ ]:
# Draw each column's distribution on its own axes. The columns span very
# different ranges (x and y stop at 10, place_id reaches ~1e10), so stacking
# every distplot on one shared axes would make the figure unreadable.
# 'accuracy' is listed once; plotting it twice added nothing.
dist_columns = ['accuracy', 'x', 'y', 'time', 'place_id']

fig, axes = plt.subplots(
    nrows=len(dist_columns), ncols=1, figsize=(10, 4 * len(dist_columns))
)
for ax, column in zip(axes, dist_columns):
    sb.distplot(train[column], ax=ax)
    ax.set_title('Distribution of ' + column)

# Keep titles and axis labels of neighbouring panels from overlapping.
fig.tight_layout()

In [ ]:
# Fitting a bivariate KDE to all ~29 million rows is very expensive, so the
# joint density is estimated from a reproducible random subset instead.
KDE_SAMPLE_SIZE = 50000  # rows used for the density estimate
KDE_SEED = 0             # fixed seed so the figure is reproducible

# min() guards against frames smaller than the requested sample size, for
# which DataFrame.sample would raise when sampling without replacement.
kde_sample = train.sample(n=min(len(train), KDE_SAMPLE_SIZE), random_state=KDE_SEED)

# Trailing semicolon suppresses the JointGrid repr; the figure still renders.
sb.jointplot(x="x", y="y", data=kde_sample, kind="kde");

In [ ]: