In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
In [5]:
%config InlineBackend.figure_format = 'retina'
In [6]:
import sys, platform
platform.python_version()
Out[6]:
In [7]:
import numpy as np
In [8]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)  # -1 = no truncation (deprecated; use None in pandas >= 1.0)
pd.set_option('display.max_columns', None)
In [9]:
from pyspark import SparkContext, SQLContext, SparkConf
conf = SparkConf().setMaster("local").set("spark.driver.memory", "1g").set("spark.executor.memory", "1g")
sc = SparkContext(conf = conf)
sqlContext = SQLContext(sc)
## in Spark 2.0+ the 'spark' SparkSession object is the entry point (the handle to a Spark session)
sc
sqlContext
Out[9]:
Out[9]:
In [10]:
from pyspark.sql import SparkSession
spark = SparkSession(sc)
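The builder pattern is the usual Spark 2.x way to obtain this entry point; a minimal equivalent sketch, assuming the same local master and 1g memory settings used above:
In [ ]:
from pyspark.sql import SparkSession
# build (or reuse) a session directly; the SparkContext is then available from it
spark = SparkSession.builder \
    .master("local") \
    .config("spark.driver.memory", "1g") \
    .config("spark.executor.memory", "1g") \
    .getOrCreate()
sc = spark.sparkContext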
In [11]:
spark.version
Out[11]:
In [12]:
spark.conf.get("spark.sql.shuffle.partitions")
spark.conf.set("spark.sql.shuffle.partitions",3)
spark.conf.get("spark.sql.shuffle.partitions")
Out[12]:
Out[12]:
In [13]:
data_path = 'Data/zubie_trips_anonymous/'
In [14]:
hadoop = spark._jvm.org.apache.hadoop
fs = hadoop.fs.FileSystem
conf = hadoop.conf.Configuration()
path = hadoop.fs.Path(data_path)
for f in fs.get(conf).listStatus(path):
    name = str(f.getPath())
    if name.endswith('.parquet'):
        print("{}, rows: {}".format(name.split('/')[-1], sqlContext.read.parquet(name).count()))
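Since data_path points at a local directory here, the same listing could also be done with Python's glob instead of reaching into the JVM gateway; a minimal sketch, assuming the files live on the local filesystem rather than on HDFS:
In [ ]:
import glob, os
# list local parquet part files and count the rows in each one
for name in sorted(glob.glob(os.path.join(data_path, '*.parquet'))):
    print("{}, rows: {}".format(os.path.basename(name), spark.read.parquet(name).count()))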
In [15]:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, IntegerType, LongType, FloatType, DoubleType
In [16]:
df = spark.read.parquet(data_path)
print ( "total num of rows: {}".format( df.count() ) )
In [17]:
print ( "Number of partitions: {}".format(df.rdd.getNumPartitions()) )
In [18]:
print ( " Data Frame Schema: " )
df.printSchema()
In [19]:
df.limit(10).toPandas()
Out[19]:
In [20]:
cols = ['user','start_point_place','end_point_place','mpg_combined','fuel_consumed','fuel_cost']
for col in cols:
    print("'{}' non NULL entries: {}".format(col, df.select(col).dropna().count()))
print("'tags' non NULL entries: {}".format(df.select(F.explode('tags')).dropna().count()))
cols.append('tags')
from functools import reduce
from pyspark.sql import DataFrame
df = reduce(DataFrame.drop, cols, df)
print ("\n Columns dropped ")
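Note that in Spark 2.x and later, DataFrame.drop accepts several column names at once, so the functools.reduce fold above could be written as a single call; an equivalent sketch with the same cols list:
In [ ]:
# drop all listed columns in one call (drop silently ignores names that no longer exist)
df = df.drop(*cols)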
In [21]:
df.select('static_map_url').limit(5).toPandas() #show(5,truncate=False)
import urllib, requests  # check the first 10 urls
for url in [c['static_map_url'] for c in df.limit(10).rdd.collect()]:
    print(url)
    print(' status code {}\n'.format(requests.get(url).status_code))
df = df.drop('static_map_url')
Out[21]:
In [22]:
pdf = df.toPandas()
df0 = df.dropna(how='any')
pdf = df0.toPandas()
pdf.shape
df0 = df0.dropDuplicates()
pdf.shape
pdf.drop_duplicates(inplace=True)
pdf.shape
df0.count()
df0.select('key','vehicle_nickname').groupBy('vehicle_nickname','key').count().count()
Out[22]:
Out[22]:
Out[22]:
Out[22]:
Out[22]:
In [23]:
df1 = df.dropDuplicates()
df2 = df.dropna(how='any')
df.count()
df1.count()
df2.count()
df1.dropna(how='any').count()
df2.dropDuplicates().count()
Out[23]:
Out[23]:
Out[23]:
Out[23]:
Out[23]:
In [24]:
dfa = df.dropna(how='any')
dfa.count()
Out[24]:
In [25]:
df = df.dropDuplicates()
df.count()
Out[25]:
In [26]:
df.limit(5).toPandas()
Out[26]:
In [27]:
numOfRows_all = df.count()
for c in df.columns:
    n = 'dummy'  # placeholder to skip the expensive per-column check: df.where(F.col(c).isNull()).count()
    print("'{}' -- null counts: {}".format(c, n))
df = df.dropna(how='any')
numOfRows = df.count()
print ("\n Counts before row drop: {}\n".format(numOfRows_all))
print (" Counts after \"any null\" row drop: {0:.0f}, fraction: {1:.2f}\n".format(numOfRows,numOfRows*1./numOfRows_all))
In [28]:
df2 = df
df2.count()
Out[28]:
In [29]:
df.select('distance_um').distinct().show()
df.select('speed_um').distinct().show()
df.select('fuel_consumed_um').distinct().show()
df.select('fuel_cost_currency_code','fuel_cost_currency_symbol','fuel_ppg').distinct().show()
df = df.drop('fuel_cost_currency_code','fuel_cost_currency_symbol','fuel_ppg','fuel_consumed_um')
In [30]:
df.select('top_speed','top_speed_mph').describe().show()
df.select('obd_distance','obd_miles').describe().show()
df.select('gps_distance','gps_miles').describe().show()
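## optional spot-check (a minimal sketch; the 1e-3 mismatch tolerance is an assumption):
## count rows where a paired column disagrees with its twin before dropping it
print("obd_distance vs obd_miles mismatches: {}".format(
    df.where(F.abs(df['obd_distance'] - df['obd_miles']) > 1e-3).count()))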
df = df.drop('top_speed_mph','obd_miles','gps_miles')
## could check equality with np.allclose, but not for large arrays (requires collecting to the driver)
In [31]:
df.select('trip_segments').distinct().show()
df.select('fuel_type').distinct().show()
In [32]:
df = df.drop('trip_segments','fuel_type')
In [33]:
df.select('end_point_address_country').distinct().show()
df.select('start_point_address_country').distinct().show()
df.select('start_point_address_state').distinct().show() #Cymru is the Welsh name for Wales
df.select('end_point_address_state').distinct().show()
df.select('start_point_address_state','end_point_address_state').distinct().show()
df = df.drop('start_point_address_country','end_point_address_country')
In [34]:
print (" Num of records with with state adress = Cymru: {} ".format( df.filter(df['start_point_address_state']=='Cymru').count() ))
In [35]:
df.filter(df['start_point_address_state']=='Cymru').toPandas().head()
Out[35]:
In [36]:
df = df.replace('Cymru','Wales')
In [37]:
df.limit(5).toPandas().head()
Out[37]:
In [38]:
df.groupBy('start_point_timestamp_tz').pivot('end_point_timestamp_tz').count().show()
df.groupBy('start_point_timestamp_tz').pivot('start_point_daylight_saving_time_flag').count().show()
df.groupBy('end_point_timestamp_tz').pivot('end_point_daylight_saving_time_flag').count().show()
df = df.drop('start_point_daylight_saving_time_flag','end_point_daylight_saving_time_flag')\
.drop('end_point_timestamp_tz')
df = df.withColumnRenamed('start_point_timestamp_tz','trip_tz')
In [39]:
from pyspark.sql.types import TimestampType
def colsToTimestamp(df, colnames, fmt):
    # cast each named string column to TimestampType using the given timestamp format
    for name in colnames:
        df = df.withColumn(name, F.unix_timestamp(df[name], fmt).cast(TimestampType()))
    return df
timestamp_format = 'yyyy-MM-dd HH:mm:ss'
timestamp_cols = ['start_point_timestamp','start_point_timestamp_utc','end_point_timestamp','end_point_timestamp_utc']
df = colsToTimestamp(df,timestamp_cols,timestamp_format)
print (" check types schema: ")
print ([(c.name,c.dataType) for c in df.schema if c.name in timestamp_cols])
In [40]:
df.limit(5).toPandas().head()
Out[40]:
In [41]:
col = 'start_point_timestamp'
df.select(col,'trip_tz' ) \
.groupBy(F.year(col),F.month(col),F.dayofmonth(col),'trip_tz')\
.count()\
.orderBy(F.year(col),F.month(col),F.dayofmonth(col)) \
.toPandas() #.show(n=365,truncate=False)
In [42]:
timing_cols = [n for n in df.columns if 'timestamp' in n]
timing_cols.append('trip_tz')
timing_cols.append('duration_seconds')
df.select( [c for c in timing_cols] ) \
.withColumn('mydt',F.unix_timestamp(df['end_point_timestamp_utc'])-F.unix_timestamp(df['start_point_timestamp_utc'])) \
.limit(5).toPandas().head() #.show(5,truncate=False)
df.select( [c for c in timing_cols] ) \
.withColumn('mydt',F.unix_timestamp(df['end_point_timestamp_utc'])-F.unix_timestamp(df['start_point_timestamp_utc'])) \
.describe('duration_seconds','mydt') \
.show()
df = df.drop('start_point_timestamp_utc','end_point_timestamp_utc')
Out[42]:
In [43]:
df = df.withColumnRenamed('start_point_latitude','start_point_lat') \
.withColumnRenamed('start_point_longitude','start_point_long') \
.withColumnRenamed('end_point_latitude','end_point_lat') \
.withColumnRenamed('end_point_longitude','end_point_long')
In [44]:
latlong_cols = ['start_point_lat','start_point_long','end_point_lat','end_point_long']
timestamp_cols = [n for n in df.columns if 'timestamp' in n]
distance_cols = ['gps_distance','obd_distance']
duration_cols = ['duration_seconds','idle_seconds']
In [ ]:
cols = timestamp_cols + duration_cols + latlong_cols + ['gps_distance']
print (" number of rows where idle time >= trip duration: {}"\
.format( df.where(df['idle_seconds'] >= df['duration_seconds']).count() )
)
print (" number of rows where idle time < 0: {}" \
.format( df.where(df['idle_seconds'] < 0).count() )
)
df.select(cols)\
.where( df['idle_seconds'] >= df['duration_seconds'] )\
.toPandas() #.show(truncate=False)
df.select(cols)\
.where( df['idle_seconds'] < 0 )\
.toPandas() #.show(truncate=False)
df = df.where( (df['duration_seconds'] > df['idle_seconds']) & (df['idle_seconds'] >= 0) )
print ("rows after selection: {}".format(df.count()))
Out[ ]:
Out[ ]:
In [ ]:
df2 = df
df2.count()
In [ ]:
df.select(latlong_cols)\
.describe()\
.show()
In [ ]:
df.select('top_speed')\
.describe()\
.toPandas() #.show()
df.select( [c for c in df.columns if c.startswith('speeding')] )\
.describe()\
.show()
df.select( [c for c in df.columns if 'brake' in c ] )\
.describe()\
.show()
df.select( [c for c in df.columns if 'accel' in c] )\
.describe()\
.show()
df.select( [c for c in df.columns if c.startswith('points')] )\
.describe()\
.show()
In [ ]:
schema_cols = [(c.name,c.dataType) for c in df.schema]
for c in schema_cols:
    name, dtype = c
    if dtype == DoubleType():
        df = df.withColumn(name, df[name].cast(FloatType()))
    elif dtype == LongType():
        df = df.withColumn(name, df[name].cast(IntegerType()))
df = df.withColumn('duration_seconds',df['duration_seconds'].cast(IntegerType()))
df.printSchema()
In [ ]:
df.limit(5).toPandas().head()
In [ ]:
print (" key entries: {}"\
.format( df.select('key').distinct().count() )
)
df = df.drop('key')
In [ ]:
print (" Number of device_key: {}"\
.format( df.select('device_key').distinct().count() )
)
In [ ]:
print ("Number of unique vehicle_key, vehicle_nickname: ",\
df.select('vehicle_key','vehicle_nickname').groupBy('vehicle_key','vehicle_nickname').count().count()
)
df.select('vehicle_key','vehicle_nickname').groupBy('vehicle_key','vehicle_nickname').count().orderBy('vehicle_nickname').show(n=100,truncate=False)
In [ ]:
print ("Number of unique device_key, vehicle_nickname: ",\
df.select('device_key','vehicle_nickname').groupBy('device_key','vehicle_nickname').count().count()
)
df.select('device_key','vehicle_nickname').groupBy('device_key','vehicle_nickname').count().orderBy('vehicle_nickname').show(n=100,truncate=False)
In [ ]:
device_keys = df.select('device_key').distinct().orderBy('device_key').rdd.flatMap(lambda x:x).collect()
getID_udf = F.udf(lambda k: device_keys.index(k), IntegerType())
df = df.withColumn('device_id',getID_udf(df['device_key']))
In [ ]:
vehicle_names = df.select('vehicle_nickname').distinct().orderBy('vehicle_nickname').rdd.flatMap(lambda x:x).collect()
getID_udf = F.udf(lambda k: vehicle_names.index(k), IntegerType())
df = df.withColumn('vehicle_id',getID_udf(df['vehicle_nickname']))
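The index-lookup UDFs above require collecting every distinct key to the driver; pyspark.ml's StringIndexer is an alternative sketch (the stringOrderType option assumes Spark 2.3+, the indexer emits a double column, hence the cast, and 'vehicle_id_ix' is just an illustrative name):
In [ ]:
from pyspark.ml.feature import StringIndexer
# alphabetical ascending ids, matching the sorted-list .index() lookup used above
indexer = StringIndexer(inputCol='vehicle_nickname', outputCol='vehicle_id_ix',
                        stringOrderType='alphabetAsc')
df_ix = indexer.fit(df).transform(df)
df_ix = df_ix.withColumn('vehicle_id_ix', df_ix['vehicle_id_ix'].cast(IntegerType()))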
In [ ]:
df = df.drop('device_key')
df = df.drop('vehicle_key')
In [ ]:
df.printSchema()
In [ ]:
for c in [c for c in df.columns if 'address' in c]:
    df = df.withColumnRenamed(c, c.replace('address_',''))
df.columns
In [ ]:
cols = ['device_id','vehicle_id','vehicle_nickname'] \
+ [c for c in df.columns if c.startswith('start')] \
+ [c for c in df.columns if c.startswith('end')] \
+ ['trip_tz'] \
+ ['duration_seconds','idle_seconds'] \
+ ['gps_distance','obd_distance'] \
+ ['top_speed'] \
+ ['hard_accel_count','hard_brake_count'] \
+ ['points_city_count','points_hwy_count'] \
+ [c for c in df.columns if c.startswith('speeding')] \
+ ['speed_um','distance_um']
len(cols)
len(df.columns)
In [ ]:
df = df.select(cols)
df = df.orderBy('start_point_timestamp','vehicle_id')
df.count()
In [ ]:
df.limit(50).toPandas()
In [ ]:
df.select('vehicle_nickname').distinct().toPandas()['vehicle_nickname'].values.tolist()
In [ ]:
cols = ['device_id','vehicle_id']\
+ ['start_point_lat','start_point_long'] \
+ ['end_point_lat','end_point_long'] \
+ ['duration_seconds','idle_seconds'] \
+ ['gps_distance','obd_distance'] \
+ ['top_speed'] \
+ ['hard_accel_count','hard_brake_count'] \
+ ['points_city_count','points_hwy_count'] \
+ ['speeding_city_major_count','speeding_city_minor_count']\
+ ['speeding_hwy_major_count','speeding_hwy_minor_count'] \
+ ['start_point_timestamp','end_point_timestamp']
df_num = df.select(cols)\
.orderBy('start_point_timestamp','vehicle_id')\
.withColumn('start_ts',F.unix_timestamp('start_point_timestamp'))\
.withColumn('start_y',F.year('start_point_timestamp'))\
.withColumn('start_m',F.month('start_point_timestamp'))\
.withColumn('start_day',F.dayofmonth('start_point_timestamp'))\
.withColumn('start_day2',F.dayofyear('start_point_timestamp'))\
.withColumn('start_hh',F.hour('start_point_timestamp'))\
.withColumn('start_mm',F.minute('start_point_timestamp'))\
.withColumn('start_ss',F.second('start_point_timestamp'))\
.withColumn('end_ts',F.unix_timestamp('end_point_timestamp'))\
.withColumn('end_y',F.year('end_point_timestamp'))\
.withColumn('end_m',F.month('end_point_timestamp'))\
.withColumn('end_day',F.dayofmonth('end_point_timestamp'))\
.withColumn('end_day2',F.dayofyear('end_point_timestamp'))\
.withColumn('end_hh',F.hour('end_point_timestamp'))\
.withColumn('end_mm',F.minute('end_point_timestamp'))\
.withColumn('end_ss',F.second('end_point_timestamp'))
df_num = df_num.drop('start_point_timestamp','end_point_timestamp')
In [ ]:
df_num.limit(50).toPandas()
In [ ]:
df.limit(50).toPandas()
In [ ]:
pdf = df.toPandas()
pdf.to_pickle('zubie.pkl')
In [ ]:
pdf0 = pd.read_pickle('zubie.pkl')
pdf0.dtypes
In [ ]:
! rm -rf ./output
df.repartition(1).write.parquet('./output')
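A quick read-back check on the written output (a sketch; './output' is the path used just above):
In [ ]:
# verify the single-partition parquet output round-trips with the expected row count
df_check = spark.read.parquet('./output')
print("rows written: {}".format(df_check.count()))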