Title

done on spark DF to do as if large data set mainly DataFrame API, then SQL API Pandas DataFrame (eager transformation) Check when to do cache()


In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [5]:
%config InlineBackend.figure_format = 'retina'

In [6]:
import sys, platform
platform.python_version()


Out[6]:
'3.5.2'

In [7]:
import numpy as np

In [8]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)

In [9]:
from pyspark import SparkContext, SQLContext, SparkConf
conf = SparkConf().setMaster("local").set("spark.driver.memory", "1g").set("spark.executor.memory", "1g")
sc = SparkContext(conf = conf)
sqlContext = SQLContext(sc)

## spark #spark object entry point Spark 2.0, handle to a spark session
sc
sqlContext


Out[9]:

SparkContext

Spark UI

Version
v2.2.0
Master
local
AppName
pyspark-shell
Out[9]:
<pyspark.sql.context.SQLContext at 0x112e72208>

In [10]:
from pyspark.sql import SparkSession
spark = SparkSession(sc)

In [11]:
spark.version


Out[11]:
'2.2.0'

In [ ]:


In [12]:
spark.conf.get("spark.sql.shuffle.partitions")
spark.conf.set("spark.sql.shuffle.partitions",3)
spark.conf.get("spark.sql.shuffle.partitions")


Out[12]:
'200'
Out[12]:
'3'

In [13]:
data_path = 'Data/zubie_trips_anonymous/'

In [14]:
hadoop = spark._jvm.org.apache.hadoop
fs   = hadoop.fs.FileSystem
conf = hadoop.conf.Configuration() 
path = hadoop.fs.Path(data_path)

for f in fs.get(conf).listStatus(path):
    name =  str(f.getPath())
    if name.endswith('.parquet'):
        print ("{}, rows: {} ".format( name.split('/')[-1], sqlContext.read.parquet(name).count() ))


part-00000-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 8530 
part-00001-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 4922 
part-00002-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 3115 
part-00003-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 2202 
part-00004-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 1701 
part-00005-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 2508 
part-00006-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 1803 
part-00007-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 1357 
part-00008-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 1102 
part-00009-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 856 
part-00010-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 534 
part-00011-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 281 
part-00012-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 41 
part-00013-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00014-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00015-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00016-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00017-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00018-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00019-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00020-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00021-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00022-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00023-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00024-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00025-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00026-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00027-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00028-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00029-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00030-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00031-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00032-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00033-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00034-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00035-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00036-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 
part-00037-3927ef29-1d8d-4f92-bb39-ac1f52d70583.snappy.parquet, rows: 0 

In [15]:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, IntegerType, LongType, FloatType, DoubleType

In [16]:
df = spark.read.parquet(data_path)

print ( "total num of rows: {}".format( df.count() ) )


total num of rows: 28952

In [17]:
print ( "Number of partitions: {}".format(df.rdd.getNumPartitions()) )


Number of partitions: 2

In [18]:
print ( " Data Frame Schema: " )
df.printSchema()


 Data Frame Schema: 
root
 |-- device_key: string (nullable = true)
 |-- distance_um: string (nullable = true)
 |-- duration_seconds: double (nullable = true)
 |-- end_point_address_city: string (nullable = true)
 |-- end_point_address_country: string (nullable = true)
 |-- end_point_address_state: string (nullable = true)
 |-- end_point_address_zipcode: string (nullable = true)
 |-- end_point_daylight_saving_time_flag: boolean (nullable = true)
 |-- end_point_latitude: double (nullable = true)
 |-- end_point_longitude: double (nullable = true)
 |-- end_point_place: string (nullable = true)
 |-- end_point_timestamp: string (nullable = true)
 |-- end_point_timestamp_tz: string (nullable = true)
 |-- end_point_timestamp_utc: string (nullable = true)
 |-- fuel_consumed: string (nullable = true)
 |-- fuel_consumed_um: string (nullable = true)
 |-- fuel_cost: string (nullable = true)
 |-- fuel_cost_currency_code: string (nullable = true)
 |-- fuel_cost_currency_symbol: string (nullable = true)
 |-- fuel_ppg: string (nullable = true)
 |-- fuel_type: string (nullable = true)
 |-- gps_distance: double (nullable = true)
 |-- gps_miles: double (nullable = true)
 |-- hard_accel_count: long (nullable = true)
 |-- hard_brake_count: long (nullable = true)
 |-- idle_seconds: long (nullable = true)
 |-- key: string (nullable = true)
 |-- mpg_combined: string (nullable = true)
 |-- obd_distance: double (nullable = true)
 |-- obd_miles: double (nullable = true)
 |-- points_city_count: long (nullable = true)
 |-- points_hwy_count: long (nullable = true)
 |-- speed_um: string (nullable = true)
 |-- speeding_city_major_count: long (nullable = true)
 |-- speeding_city_minor_count: long (nullable = true)
 |-- speeding_hwy_major_count: long (nullable = true)
 |-- speeding_hwy_minor_count: long (nullable = true)
 |-- start_point_address_city: string (nullable = true)
 |-- start_point_address_country: string (nullable = true)
 |-- start_point_address_state: string (nullable = true)
 |-- start_point_address_zipcode: string (nullable = true)
 |-- start_point_daylight_saving_time_flag: boolean (nullable = true)
 |-- start_point_latitude: double (nullable = true)
 |-- start_point_longitude: double (nullable = true)
 |-- start_point_place: string (nullable = true)
 |-- start_point_timestamp: string (nullable = true)
 |-- start_point_timestamp_tz: string (nullable = true)
 |-- start_point_timestamp_utc: string (nullable = true)
 |-- static_map_url: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- top_speed: double (nullable = true)
 |-- top_speed_mph: double (nullable = true)
 |-- trip_segments: long (nullable = true)
 |-- user: string (nullable = true)
 |-- vehicle_key: string (nullable = true)
 |-- vehicle_nickname: string (nullable = true)


In [19]:
df.limit(10).toPandas()


Out[19]:
device_key distance_um duration_seconds end_point_address_city end_point_address_country end_point_address_state end_point_address_zipcode end_point_daylight_saving_time_flag end_point_latitude end_point_longitude end_point_place end_point_timestamp end_point_timestamp_tz end_point_timestamp_utc fuel_consumed fuel_consumed_um fuel_cost fuel_cost_currency_code fuel_cost_currency_symbol fuel_ppg fuel_type gps_distance gps_miles hard_accel_count hard_brake_count idle_seconds key mpg_combined obd_distance obd_miles points_city_count points_hwy_count speed_um speeding_city_major_count speeding_city_minor_count speeding_hwy_major_count speeding_hwy_minor_count start_point_address_city start_point_address_country start_point_address_state start_point_address_zipcode start_point_daylight_saving_time_flag start_point_latitude start_point_longitude start_point_place start_point_timestamp start_point_timestamp_tz start_point_timestamp_utc static_map_url tags top_speed top_speed_mph trip_segments user vehicle_key vehicle_nickname
0 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 425.0 Cleckheaton GB England BD19 3TY False 53.732082 -1.717613 None 2016-11-24 12:33:01 GMT 2016-11-24 12:33:01 None gal None EUR 2.39 regular 0.747816 0.747816 0 0 238 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWUmVUaEpUNlFaRmJhQ1IyREtTTmFSNww None 0.781064 0.781064 12 0 mph 0 0 0 0 Cleckheaton GB England BD19 3BE False 53.724965 -1.715545 None 2016-11-24 12:25:56 GMT 2016-11-24 12:25:56 https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWUmVUaEpUNlFaRmJhQ1IyREtTTmFSNww/staticmap?t=QTd3VNpBsUxZ7dW2GPUkOycYN48 [] 32.3 32.3 1 None agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
1 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 235.0 Cleckheaton GB England BD19 3BJ False 53.725047 -1.715698 None 2016-11-24 12:12:05 GMT 2016-11-24 12:12:05 None gal None EUR 2.39 regular 0.874673 0.874673 0 0 25 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWazNmMmdEb05oRHJWY3NQSEh5aGdlVgw None 0.884833 0.884833 12 0 mph 0 0 0 0 Cleckheaton GB England BD19 3TY False 53.732043 -1.717593 None 2016-11-24 12:08:10 GMT 2016-11-24 12:08:10 https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWazNmMmdEb05oRHJWY3NQSEh5aGdlVgw/staticmap?t=PIamI1nMri8X7alsbJIJYBfyExs [] 27.3 27.3 1 None agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
2 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 2439.0 Cleckheaton GB England BD19 3TY False 53.732043 -1.717585 None 2016-11-24 08:28:24 GMT 2016-11-24 08:28:24 None gal None EUR 2.39 regular 11.283541 11.283541 3 0 541 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWNFB6V1N5YVhKZFhuOGdlcDNtVVoyVww None 10.958502 10.958502 83 0 mph 2 1 0 0 Luddenden Foot GB England HX2 False 53.723620 -1.945880 None 2016-11-24 07:47:45 GMT 2016-11-24 07:47:45 https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWNFB6V1N5YVhKZFhuOGdlcDNtVVoyVww/staticmap?t=FEn4Y1GG7dmU6Mad56rY5IU9Nho [] 52.8 52.8 1 None agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
3 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 738.0 Luddenden Foot GB England HX2 False 53.723613 -1.946213 None 2016-11-23 17:55:30 GMT 2016-11-23 17:55:30 None gal None EUR 2.39 regular 3.563889 3.563889 0 0 117 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWOEViRzVwb0FDc21vSzdabkpQTTZraAw None 3.465387 3.465387 26 0 mph 0 0 0 0 Halifax GB England HX1 3BA False 53.714492 -1.878072 None 2016-11-23 17:43:12 GMT 2016-11-23 17:43:12 https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWOEViRzVwb0FDc21vSzdabkpQTTZraAw/staticmap?t=MWYD3chkvl1AqUxUwsrviEnxn0U [] 37.9 37.9 1 None agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
4 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 1808.0 Halifax GB England HX1 3BA False 53.714430 -1.878085 None 2016-11-23 17:32:16 GMT 2016-11-23 17:32:16 None gal None EUR 2.39 regular 8.284672 8.284672 0 0 478 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWZlNyRFpVR1ZBbXBmVXNlR3pqcVl3Sgw None 8.056078 8.056078 58 6 mph 0 1 0 0 Cleckheaton GB England BD19 3TY False 53.731958 -1.717408 None 2016-11-23 17:02:08 GMT 2016-11-23 17:02:08 https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWZlNyRFpVR1ZBbXBmVXNlR3pqcVl3Sgw/staticmap?t=aesK_aPLG_aBH_B-3RXHFUFiYnE [] 46.0 46.0 1 None agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
5 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 241.0 Cleckheaton GB England BD19 3TY False 53.732083 -1.717565 None 2016-11-23 13:50:00 GMT 2016-11-23 13:50:00 None gal None EUR 2.39 regular 0.914472 0.914472 0 0 62 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWRE14bjNIYUtIUlVrRXlTVHRjeW9mZww None 0.917144 0.917144 11 0 mph 0 0 0 0 Cleckheaton GB England BD19 3BE False 53.725010 -1.715388 None 2016-11-23 13:45:59 GMT 2016-11-23 13:45:59 https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWRE14bjNIYUtIUlVrRXlTVHRjeW9mZww/staticmap?t=y0KEAxjFAOE5zvZr7dlueq_gCFo [] 36.0 36.0 1 None agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
6 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 252.0 Cleckheaton GB England BD19 3HU False 53.725067 -1.715293 None 2016-11-23 13:35:05 GMT 2016-11-23 13:35:05 None gal None EUR 2.39 regular 0.918643 0.918643 0 0 85 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWNGd1UmpBYmtEcGVVeFU1Q1pFUll4VQw None 0.907202 0.907202 10 0 mph 0 0 0 0 Cleckheaton GB England BD19 False 53.732218 -1.717352 None 2016-11-23 13:30:53 GMT 2016-11-23 13:30:53 https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWNGd1UmpBYmtEcGVVeFU1Q1pFUll4VQw/staticmap?t=d7G_lFT1s4HDZDck3HKofRbsU3U [] 35.4 35.4 1 None agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
7 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 2366.0 Cleckheaton GB England BD19 3TY False 53.732067 -1.717568 None 2016-11-23 08:27:53 GMT 2016-11-23 08:27:53 None gal None EUR 2.39 regular 11.331648 11.331648 0 1 381 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWR3BTemRLdXc3bjJKR1Z1ZHhYQ3pRQww None 10.952910 10.952910 80 0 mph 1 1 0 0 Luddenden Foot GB England HX2 False 53.723613 -1.946087 None 2016-11-23 07:48:27 GMT 2016-11-23 07:48:27 https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWR3BTemRLdXc3bjJKR1Z1ZHhYQ3pRQww/staticmap?t=26FKe4-nnlFmtgSLe7BwwqbByWY [] 50.3 50.3 1 None agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
8 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 526.0 Luddenden Foot GB England HX2 6AH False 53.723627 -1.946223 None 2016-11-22 21:39:30 GMT 2016-11-22 21:39:30 None gal None EUR 2.39 regular 3.576469 3.576469 0 0 61 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWdGNtVmh4ejhSbmg5NGNScXY3WXR4eQw None 3.438047 3.438047 22 0 mph 2 1 0 0 Halifax GB England HX1 3BA False 53.714567 -1.878020 None 2016-11-22 21:30:44 GMT 2016-11-22 21:30:44 https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWdGNtVmh4ejhSbmg5NGNScXY3WXR4eQw/staticmap?t=Dk0uV27dMBIVyQDR05r_bayFUFg [] 44.1 44.1 1 None agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
9 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 532.0 Halifax GB England HX1 3BA False 53.714572 -1.878007 None 2016-11-22 21:25:20 GMT 2016-11-22 21:25:20 None gal None EUR 2.39 regular 1.537632 1.537632 0 0 48 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWaVM3QkFEV1FYNGJINDI4d05lV1ptaQw None 1.559642 1.559642 20 0 mph 1 0 0 0 Halifax GB England HX1 4TA False 53.724657 -1.887858 None 2016-11-22 21:16:28 GMT 2016-11-22 21:16:28 https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWaVM3QkFEV1FYNGJINDI4d05lV1ptaQw/staticmap?t=4GOnC6A4FNpOm4pd_UHYX3FMQDU [] 32.3 32.3 1 None agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN

In [20]:
cols = ['user','start_point_place','end_point_place','mpg_combined','fuel_consumed','fuel_cost']

for col in cols:
  print ( "\'{}\' non NULL entries: {}".format(col,df.select(col).dropna().count()) )

print ( "\'tags\' non NULL entries: {}".format(df.select(F.explode('tags')).dropna().count())  )

  
cols.append('tags')

from functools import reduce
from pyspark.sql import DataFrame

df = reduce(DataFrame.drop, cols, df)

print ("\n Columns dropped ")


'user' non NULL entries: 0
'start_point_place' non NULL entries: 0
'end_point_place' non NULL entries: 0
'mpg_combined' non NULL entries: 0
'fuel_consumed' non NULL entries: 0
'fuel_cost' non NULL entries: 0
'tags' non NULL entries: 0

 Columns dropped 

In [21]:
df.select('static_map_url').limit(5).toPandas() #show(5,truncate=False)

import urllib, requests #check first 10 urls
for url in [ c['static_map_url'] for c in df.limit(10).rdd.collect() ]:
  print (url)
  print (' satus code {}\n'.format( requests.get(url).status_code ))
  
df = df.drop('static_map_url')


Out[21]:
static_map_url
0 https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWUmVUaEpUNlFaRmJhQ1IyREtTTmFSNww/staticmap?t=QTd3VNpBsUxZ7dW2GPUkOycYN48
1 https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWazNmMmdEb05oRHJWY3NQSEh5aGdlVgw/staticmap?t=PIamI1nMri8X7alsbJIJYBfyExs
2 https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWNFB6V1N5YVhKZFhuOGdlcDNtVVoyVww/staticmap?t=FEn4Y1GG7dmU6Mad56rY5IU9Nho
3 https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWOEViRzVwb0FDc21vSzdabkpQTTZraAw/staticmap?t=MWYD3chkvl1AqUxUwsrviEnxn0U
4 https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWZlNyRFpVR1ZBbXBmVXNlR3pqcVl3Sgw/staticmap?t=aesK_aPLG_aBH_B-3RXHFUFiYnE
https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWUmVUaEpUNlFaRmJhQ1IyREtTTmFSNww/staticmap?t=QTd3VNpBsUxZ7dW2GPUkOycYN48
 satus code 401

https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWazNmMmdEb05oRHJWY3NQSEh5aGdlVgw/staticmap?t=PIamI1nMri8X7alsbJIJYBfyExs
 satus code 401

https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWNFB6V1N5YVhKZFhuOGdlcDNtVVoyVww/staticmap?t=FEn4Y1GG7dmU6Mad56rY5IU9Nho
 satus code 401

https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWOEViRzVwb0FDc21vSzdabkpQTTZraAw/staticmap?t=MWYD3chkvl1AqUxUwsrviEnxn0U
 satus code 401

https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWZlNyRFpVR1ZBbXBmVXNlR3pqcVl3Sgw/staticmap?t=aesK_aPLG_aBH_B-3RXHFUFiYnE
 satus code 401

https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWRE14bjNIYUtIUlVrRXlTVHRjeW9mZww/staticmap?t=y0KEAxjFAOE5zvZr7dlueq_gCFo
 satus code 401

https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWNGd1UmpBYmtEcGVVeFU1Q1pFUll4VQw/staticmap?t=d7G_lFT1s4HDZDck3HKofRbsU3U
 satus code 401

https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWR3BTemRLdXc3bjJKR1Z1ZHhYQ3pRQww/staticmap?t=26FKe4-nnlFmtgSLe7BwwqbByWY
 satus code 401

https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWdGNtVmh4ejhSbmg5NGNScXY3WXR4eQw/staticmap?t=Dk0uV27dMBIVyQDR05r_bayFUFg
 satus code 401

https://zubie-eu.appspot.com/api/v1/trips/agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWaVM3QkFEV1FYNGJINDI4d05lV1ptaQw/staticmap?t=4GOnC6A4FNpOm4pd_UHYX3FMQDU
 satus code 401


In [22]:
pdf = df.toPandas()
df0 = df.dropna(how='any')

pdf = df0.toPandas()
pdf.shape
df0 = df0.dropDuplicates()
pdf.shape
pdf.drop_duplicates(inplace=True)
pdf.shape
df0.count()

df0.select('key','vehicle_nickname').groupBy('vehicle_nickname','key').count().count()


Out[22]:
(21352, 48)
Out[22]:
(21352, 48)
Out[22]:
(3903, 48)
Out[22]:
3903
Out[22]:
3903

In [23]:
df1 = df.dropDuplicates()
df2 = df.dropna(how='any')
df.count()
df1.count()
df2.count()
df1.dropna(how='any').count()
df2.dropDuplicates().count()


Out[23]:
28952
Out[23]:
5295
Out[23]:
21352
Out[23]:
3903
Out[23]:
3903

In [24]:
dfa = df.dropna(how='any')
dfa.count()


Out[24]:
21352

In [25]:
df = df.dropDuplicates()
df.count()


Out[25]:
5295

In [26]:
df.limit(5).toPandas()


Out[26]:
device_key distance_um duration_seconds end_point_address_city end_point_address_country end_point_address_state end_point_address_zipcode end_point_daylight_saving_time_flag end_point_latitude end_point_longitude end_point_timestamp end_point_timestamp_tz end_point_timestamp_utc fuel_consumed_um fuel_cost_currency_code fuel_cost_currency_symbol fuel_ppg fuel_type gps_distance gps_miles hard_accel_count hard_brake_count idle_seconds key obd_distance obd_miles points_city_count points_hwy_count speed_um speeding_city_major_count speeding_city_minor_count speeding_hwy_major_count speeding_hwy_minor_count start_point_address_city start_point_address_country start_point_address_state start_point_address_zipcode start_point_daylight_saving_time_flag start_point_latitude start_point_longitude start_point_timestamp start_point_timestamp_tz start_point_timestamp_utc top_speed top_speed_mph trip_segments vehicle_key vehicle_nickname
0 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 425.0 Cleckheaton GB England BD19 3TY False 53.732082 -1.717613 2016-11-24 12:33:01 GMT 2016-11-24 12:33:01 gal EUR 2.39 regular 0.747816 0.747816 0 0 238 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWUmVUaEpUNlFaRmJhQ1IyREtTTmFSNww 0.781064 0.781064 12 0 mph 0 0 0 0 Cleckheaton GB England BD19 3BE False 53.724965 -1.715545 2016-11-24 12:25:56 GMT 2016-11-24 12:25:56 32.3 32.3 1 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
1 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 235.0 Cleckheaton GB England BD19 3BJ False 53.725047 -1.715698 2016-11-24 12:12:05 GMT 2016-11-24 12:12:05 gal EUR 2.39 regular 0.874673 0.874673 0 0 25 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWazNmMmdEb05oRHJWY3NQSEh5aGdlVgw 0.884833 0.884833 12 0 mph 0 0 0 0 Cleckheaton GB England BD19 3TY False 53.732043 -1.717593 2016-11-24 12:08:10 GMT 2016-11-24 12:08:10 27.3 27.3 1 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
2 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 2439.0 Cleckheaton GB England BD19 3TY False 53.732043 -1.717585 2016-11-24 08:28:24 GMT 2016-11-24 08:28:24 gal EUR 2.39 regular 11.283541 11.283541 3 0 541 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWNFB6V1N5YVhKZFhuOGdlcDNtVVoyVww 10.958502 10.958502 83 0 mph 2 1 0 0 Luddenden Foot GB England HX2 False 53.723620 -1.945880 2016-11-24 07:47:45 GMT 2016-11-24 07:47:45 52.8 52.8 1 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
3 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 526.0 Luddenden Foot GB England HX2 6AH False 53.723627 -1.946223 2016-11-22 21:39:30 GMT 2016-11-22 21:39:30 gal EUR 2.39 regular 3.576469 3.576469 0 0 61 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWdGNtVmh4ejhSbmg5NGNScXY3WXR4eQw 3.438047 3.438047 22 0 mph 2 1 0 0 Halifax GB England HX1 3BA False 53.714567 -1.878020 2016-11-22 21:30:44 GMT 2016-11-22 21:30:44 44.1 44.1 1 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
4 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 593.0 Halifax GB England HX1 4TA False 53.724555 -1.887878 2016-11-22 19:52:22 GMT 2016-11-22 19:52:22 gal EUR 2.39 regular 4.350397 4.350397 0 0 35 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWTUhBdUFXZkE3bnA0NmlWdmVqeUxtUgw 4.279383 4.279383 30 0 mph 1 1 0 0 Luddenden Foot GB England HX2 6AH False 53.723640 -1.946238 2016-11-22 19:42:29 GMT 2016-11-22 19:42:29 46.0 46.0 1 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN

check that all missing value cut almost 25% so go check if there is a particular columns contributing more


In [27]:
numOfRows_all = df.count()

for c in df.columns:
  n = 'dummy' #df.where( F.col(c).isNull() ).count()
  print ("\'{}\' -- null counts: {}".format(c,n))

df = df.dropna(how='any')

numOfRows = df.count()

print ("\n Counts before row drop: {}\n".format(numOfRows_all))
print (" Counts after \"any null\" row drop: {0:.0f}, fraction: {1:.2f}\n".format(numOfRows,numOfRows*1./numOfRows_all))


'device_key' -- null counts: dummy
'distance_um' -- null counts: dummy
'duration_seconds' -- null counts: dummy
'end_point_address_city' -- null counts: dummy
'end_point_address_country' -- null counts: dummy
'end_point_address_state' -- null counts: dummy
'end_point_address_zipcode' -- null counts: dummy
'end_point_daylight_saving_time_flag' -- null counts: dummy
'end_point_latitude' -- null counts: dummy
'end_point_longitude' -- null counts: dummy
'end_point_timestamp' -- null counts: dummy
'end_point_timestamp_tz' -- null counts: dummy
'end_point_timestamp_utc' -- null counts: dummy
'fuel_consumed_um' -- null counts: dummy
'fuel_cost_currency_code' -- null counts: dummy
'fuel_cost_currency_symbol' -- null counts: dummy
'fuel_ppg' -- null counts: dummy
'fuel_type' -- null counts: dummy
'gps_distance' -- null counts: dummy
'gps_miles' -- null counts: dummy
'hard_accel_count' -- null counts: dummy
'hard_brake_count' -- null counts: dummy
'idle_seconds' -- null counts: dummy
'key' -- null counts: dummy
'obd_distance' -- null counts: dummy
'obd_miles' -- null counts: dummy
'points_city_count' -- null counts: dummy
'points_hwy_count' -- null counts: dummy
'speed_um' -- null counts: dummy
'speeding_city_major_count' -- null counts: dummy
'speeding_city_minor_count' -- null counts: dummy
'speeding_hwy_major_count' -- null counts: dummy
'speeding_hwy_minor_count' -- null counts: dummy
'start_point_address_city' -- null counts: dummy
'start_point_address_country' -- null counts: dummy
'start_point_address_state' -- null counts: dummy
'start_point_address_zipcode' -- null counts: dummy
'start_point_daylight_saving_time_flag' -- null counts: dummy
'start_point_latitude' -- null counts: dummy
'start_point_longitude' -- null counts: dummy
'start_point_timestamp' -- null counts: dummy
'start_point_timestamp_tz' -- null counts: dummy
'start_point_timestamp_utc' -- null counts: dummy
'top_speed' -- null counts: dummy
'top_speed_mph' -- null counts: dummy
'trip_segments' -- null counts: dummy
'vehicle_key' -- null counts: dummy
'vehicle_nickname' -- null counts: dummy

 Counts before row drop: 5295

 Counts after "any null" row drop: 3903, fraction: 0.74


In [28]:
df2 = df
df2.count()


Out[28]:
3903

In [29]:
df.select('distance_um').distinct().show()
df.select('speed_um').distinct().show()
df.select('fuel_consumed_um').distinct().show()
df.select('fuel_cost_currency_code','fuel_cost_currency_symbol','fuel_ppg').distinct().show()

df = df.drop('fuel_cost_currency_code','fuel_cost_currency_symbol','fuel_ppg','fuel_consumed_um')


+-----------+
|distance_um|
+-----------+
|         mi|
+-----------+

+--------+
|speed_um|
+--------+
|     mph|
+--------+

+----------------+
|fuel_consumed_um|
+----------------+
|             gal|
+----------------+

+-----------------------+-------------------------+--------+
|fuel_cost_currency_code|fuel_cost_currency_symbol|fuel_ppg|
+-----------------------+-------------------------+--------+
|                    EUR|                        €|    2.39|
|                    USD|                        $|    3.54|
+-----------------------+-------------------------+--------+


In [30]:
df.select('top_speed','top_speed_mph').describe().show()
df.select('obd_distance','obd_miles').describe().show()
df.select('gps_distance','gps_miles').describe().show()

df = df.drop('top_speed_mph','obd_miles','gps_miles')

## use np.allclose no for large arrays


+-------+------------------+------------------+
|summary|         top_speed|     top_speed_mph|
+-------+------------------+------------------+
|  count|              3903|              3903|
|   mean| 45.53637113548447| 45.53637113548447|
| stddev|20.040925556040115|20.040925556040115|
|    min|               0.0|               0.0|
|    max|127.38109440865345|127.38109440865345|
+-------+------------------+------------------+

+-------+------------------+------------------+
|summary|      obd_distance|         obd_miles|
+-------+------------------+------------------+
|  count|              3903|              3903|
|   mean| 8.095386916834075| 8.095386916834075|
| stddev|16.386015432771394|16.386015432771394|
|    min|               0.0|               0.0|
|    max|232.86755348763222|232.86755348763222|
+-------+------------------+------------------+

+-------+--------------------+--------------------+
|summary|        gps_distance|           gps_miles|
+-------+--------------------+--------------------+
|  count|                3903|                3903|
|   mean|   7.257100007399007|   7.257100007399007|
| stddev|  15.855331245522413|  15.855331245522413|
|    min|8.006704572768198E-5|8.006704572768198E-5|
|    max|  237.73788151299107|  237.73788151299107|
+-------+--------------------+--------------------+


In [31]:
df.select('trip_segments').distinct().show() 
df.select('fuel_type').distinct().show()


+-------------+
|trip_segments|
+-------------+
|            1|
+-------------+

+---------+
|fuel_type|
+---------+
|  regular|
+---------+


In [32]:
df = df.drop('trip_segments','fuel_type')

In [33]:
df.select('end_point_address_country').distinct().show()
df.select('start_point_address_country').distinct().show()

df.select('start_point_address_state').distinct().show() #Cymru is the Welsh name of Wales
df.select('end_point_address_state').distinct().show()

df.select('start_point_address_state','end_point_address_state').distinct().show()

df = df.drop('start_point_address_country','end_point_address_country')


+-------------------------+
|end_point_address_country|
+-------------------------+
|                       GB|
+-------------------------+

+---------------------------+
|start_point_address_country|
+---------------------------+
|                         GB|
+---------------------------+

+-------------------------+
|start_point_address_state|
+-------------------------+
|                  England|
|                 Scotland|
|                    Wales|
|                    Cymru|
|         Northern Ireland|
+-------------------------+

+-----------------------+
|end_point_address_state|
+-----------------------+
|                England|
|               Scotland|
|                  Wales|
|       Northern Ireland|
+-----------------------+

+-------------------------+-----------------------+
|start_point_address_state|end_point_address_state|
+-------------------------+-----------------------+
|                    Wales|                  Wales|
|                  England|                England|
|                 Scotland|               Scotland|
|                    Cymru|                  Wales|
|         Northern Ireland|       Northern Ireland|
|                    Wales|                England|
|                  England|                  Wales|
+-------------------------+-----------------------+


In [34]:
print (" Num of records with with state adress = Cymru: {} ".format( df.filter(df['start_point_address_state']=='Cymru').count() ))


 Num of records with with state adress = Cymru: 1 

In [35]:
df.filter(df['start_point_address_state']=='Cymru').toPandas().head()


Out[35]:
device_key distance_um duration_seconds end_point_address_city end_point_address_state end_point_address_zipcode end_point_daylight_saving_time_flag end_point_latitude end_point_longitude end_point_timestamp end_point_timestamp_tz end_point_timestamp_utc gps_distance hard_accel_count hard_brake_count idle_seconds key obd_distance points_city_count points_hwy_count speed_um speeding_city_major_count speeding_city_minor_count speeding_hwy_major_count speeding_hwy_minor_count start_point_address_city start_point_address_state start_point_address_zipcode start_point_daylight_saving_time_flag start_point_latitude start_point_longitude start_point_timestamp start_point_timestamp_tz start_point_timestamp_utc top_speed vehicle_key vehicle_nickname
0 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk5MTjJrdFZrQUNpZEJLZ3FHbWFFaGUM mi 1661.0 Bridgend Wales CF32 9SU False 51.531565 -3.579452 2016-11-21 12:45:18 GMT 2016-11-21 12:45:18 21.130669 0 0 51 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk5MTjJrdFZrQUNpZEJLZ3FHbWFFaGUMCxIEVHJpcCIWQ2dxNWZ2WjNLR1hTYWoyU282Y2tDegw 21.094309 54 1 mph 16 5 0 0 Caerphilly Cymru CF83 False 51.5678 -3.231417 2016-11-21 12:17:37 GMT 2016-11-21 12:17:37 72.7 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZKblAyYmE0cTM5OVAzSGNhbTRiUTdCDAsSA0NhciILNW9xNHV5UHJIUHgM KS12VRN

In [36]:
df = df.replace('Cymru','Wales')

In [37]:
df.limit(5).toPandas().head()


Out[37]:
device_key distance_um duration_seconds end_point_address_city end_point_address_state end_point_address_zipcode end_point_daylight_saving_time_flag end_point_latitude end_point_longitude end_point_timestamp end_point_timestamp_tz end_point_timestamp_utc gps_distance hard_accel_count hard_brake_count idle_seconds key obd_distance points_city_count points_hwy_count speed_um speeding_city_major_count speeding_city_minor_count speeding_hwy_major_count speeding_hwy_minor_count start_point_address_city start_point_address_state start_point_address_zipcode start_point_daylight_saving_time_flag start_point_latitude start_point_longitude start_point_timestamp start_point_timestamp_tz start_point_timestamp_utc top_speed vehicle_key vehicle_nickname
0 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 425.0 Cleckheaton England BD19 3TY False 53.732082 -1.717613 2016-11-24 12:33:01 GMT 2016-11-24 12:33:01 0.747816 0 0 238 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWUmVUaEpUNlFaRmJhQ1IyREtTTmFSNww 0.781064 12 0 mph 0 0 0 0 Cleckheaton England BD19 3BE False 53.724965 -1.715545 2016-11-24 12:25:56 GMT 2016-11-24 12:25:56 32.3 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
1 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 235.0 Cleckheaton England BD19 3BJ False 53.725047 -1.715698 2016-11-24 12:12:05 GMT 2016-11-24 12:12:05 0.874673 0 0 25 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWazNmMmdEb05oRHJWY3NQSEh5aGdlVgw 0.884833 12 0 mph 0 0 0 0 Cleckheaton England BD19 3TY False 53.732043 -1.717593 2016-11-24 12:08:10 GMT 2016-11-24 12:08:10 27.3 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
2 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 2439.0 Cleckheaton England BD19 3TY False 53.732043 -1.717585 2016-11-24 08:28:24 GMT 2016-11-24 08:28:24 11.283541 3 0 541 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWNFB6V1N5YVhKZFhuOGdlcDNtVVoyVww 10.958502 83 0 mph 2 1 0 0 Luddenden Foot England HX2 False 53.723620 -1.945880 2016-11-24 07:47:45 GMT 2016-11-24 07:47:45 52.8 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
3 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 526.0 Luddenden Foot England HX2 6AH False 53.723627 -1.946223 2016-11-22 21:39:30 GMT 2016-11-22 21:39:30 3.576469 0 0 61 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWdGNtVmh4ejhSbmg5NGNScXY3WXR4eQw 3.438047 22 0 mph 2 1 0 0 Halifax England HX1 3BA False 53.714567 -1.878020 2016-11-22 21:30:44 GMT 2016-11-22 21:30:44 44.1 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
4 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 593.0 Halifax England HX1 4TA False 53.724555 -1.887878 2016-11-22 19:52:22 GMT 2016-11-22 19:52:22 4.350397 0 0 35 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWTUhBdUFXZkE3bnA0NmlWdmVqeUxtUgw 4.279383 30 0 mph 1 1 0 0 Luddenden Foot England HX2 6AH False 53.723640 -1.946238 2016-11-22 19:42:29 GMT 2016-11-22 19:42:29 46.0 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN

In [38]:
df.groupBy('start_point_timestamp_tz').pivot('end_point_timestamp_tz').count().show()

df.groupBy('start_point_timestamp_tz').pivot('start_point_daylight_saving_time_flag').count().show()
df.groupBy('end_point_timestamp_tz').pivot('end_point_daylight_saving_time_flag').count().show()


df =  df.drop('start_point_daylight_saving_time_flag','end_point_daylight_saving_time_flag')\
        .drop('end_point_timestamp_tz')

df = df.withColumnRenamed('start_point_timestamp_tz','trip_tz')


+------------------------+----+----+
|start_point_timestamp_tz| BST| GMT|
+------------------------+----+----+
|                     GMT|null|3031|
|                     BST| 872|null|
+------------------------+----+----+

+------------------------+-----+----+
|start_point_timestamp_tz|false|true|
+------------------------+-----+----+
|                     GMT| 3031|null|
|                     BST| null| 872|
+------------------------+-----+----+

+----------------------+-----+----+
|end_point_timestamp_tz|false|true|
+----------------------+-----+----+
|                   GMT| 3031|null|
|                   BST| null| 872|
+----------------------+-----+----+


In [39]:
from pyspark.sql.types import TimestampType

def colsToTimestamp(df,colnames,format):
  for name in colnames:
    df = df.withColumn(name,F.unix_timestamp(df[name],timestamp_format).cast(TimestampType()))
  return df
  
timestamp_format = 'yyyy-MM-dd HH:mm:ss'
timestamp_cols   = ['start_point_timestamp','start_point_timestamp_utc','end_point_timestamp','end_point_timestamp_utc']

df = colsToTimestamp(df,timestamp_cols,timestamp_format) 

print (" check types schema: ")
print ([(c.name,c.dataType) for c in df.schema if c.name in timestamp_cols])


 check types schema: 
[('end_point_timestamp', TimestampType), ('end_point_timestamp_utc', TimestampType), ('start_point_timestamp', TimestampType), ('start_point_timestamp_utc', TimestampType)]

In [40]:
df.limit(5).toPandas().head()


Out[40]:
device_key distance_um duration_seconds end_point_address_city end_point_address_state end_point_address_zipcode end_point_latitude end_point_longitude end_point_timestamp end_point_timestamp_utc gps_distance hard_accel_count hard_brake_count idle_seconds key obd_distance points_city_count points_hwy_count speed_um speeding_city_major_count speeding_city_minor_count speeding_hwy_major_count speeding_hwy_minor_count start_point_address_city start_point_address_state start_point_address_zipcode start_point_latitude start_point_longitude start_point_timestamp trip_tz start_point_timestamp_utc top_speed vehicle_key vehicle_nickname
0 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 425.0 Cleckheaton England BD19 3TY 53.732082 -1.717613 2016-11-24 12:33:01 2016-11-24 12:33:01 0.747816 0 0 238 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWUmVUaEpUNlFaRmJhQ1IyREtTTmFSNww 0.781064 12 0 mph 0 0 0 0 Cleckheaton England BD19 3BE 53.724965 -1.715545 2016-11-24 12:25:56 GMT 2016-11-24 12:25:56 32.3 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
1 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 235.0 Cleckheaton England BD19 3BJ 53.725047 -1.715698 2016-11-24 12:12:05 2016-11-24 12:12:05 0.874673 0 0 25 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWazNmMmdEb05oRHJWY3NQSEh5aGdlVgw 0.884833 12 0 mph 0 0 0 0 Cleckheaton England BD19 3TY 53.732043 -1.717593 2016-11-24 12:08:10 GMT 2016-11-24 12:08:10 27.3 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
2 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 2439.0 Cleckheaton England BD19 3TY 53.732043 -1.717585 2016-11-24 08:28:24 2016-11-24 08:28:24 11.283541 3 0 541 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWNFB6V1N5YVhKZFhuOGdlcDNtVVoyVww 10.958502 83 0 mph 2 1 0 0 Luddenden Foot England HX2 53.723620 -1.945880 2016-11-24 07:47:45 GMT 2016-11-24 07:47:45 52.8 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
3 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 526.0 Luddenden Foot England HX2 6AH 53.723627 -1.946223 2016-11-22 21:39:30 2016-11-22 21:39:30 3.576469 0 0 61 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWdGNtVmh4ejhSbmg5NGNScXY3WXR4eQw 3.438047 22 0 mph 2 1 0 0 Halifax England HX1 3BA 53.714567 -1.878020 2016-11-22 21:30:44 GMT 2016-11-22 21:30:44 44.1 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN
4 agplfnp1YmllLWV1ciILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MM mi 593.0 Halifax England HX1 4TA 53.724555 -1.887878 2016-11-22 19:52:22 2016-11-22 19:52:22 4.350397 0 0 35 agplfnp1YmllLWV1ckILEgZEZXZpY2UiFk1aVEdCQ3NMZ1BjRzJOU0JMN1FWV1MMCxIEVHJpcCIWTUhBdUFXZkE3bnA0NmlWdmVqeUxtUgw 4.279383 30 0 mph 1 1 0 0 Luddenden Foot England HX2 6AH 53.723640 -1.946238 2016-11-22 19:42:29 GMT 2016-11-22 19:42:29 46.0 agplfnp1YmllLWV1cjcLEgdBY2NvdW50IhZDSHh3clVuZzlyakNjRldaUUZiMm9TDAsSA0NhciILNENOaVpkM1U5SHEM MV13ZFN

In [41]:
col = 'start_point_timestamp'

df.select(col,'trip_tz' ) \
       .groupBy(F.year(col),F.month(col),F.dayofmonth(col),'trip_tz')\
       .count()\
       .orderBy(F.year(col),F.month(col),F.dayofmonth(col)) \
       .toPandas() #.show(n=365,truncate=False)


+---------------------------+----------------------------+---------------------------------+-------+-----+
|year(start_point_timestamp)|month(start_point_timestamp)|dayofmonth(start_point_timestamp)|trip_tz|count|
+---------------------------+----------------------------+---------------------------------+-------+-----+
|2016                       |8                           |10                               |BST    |1    |
|2016                       |8                           |13                               |BST    |2    |
|2016                       |8                           |14                               |BST    |1    |
|2016                       |8                           |17                               |BST    |2    |
|2016                       |8                           |18                               |BST    |2    |
|2016                       |8                           |19                               |BST    |1    |
|2016                       |8                           |21                               |BST    |2    |
|2016                       |8                           |23                               |BST    |2    |
|2016                       |8                           |27                               |BST    |4    |
|2016                       |8                           |28                               |BST    |2    |
|2016                       |8                           |29                               |BST    |1    |
|2016                       |8                           |30                               |BST    |2    |
|2016                       |9                           |1                                |BST    |5    |
|2016                       |9                           |2                                |BST    |2    |
|2016                       |9                           |3                                |BST    |2    |
|2016                       |9                           |4                                |BST    |5    |
|2016                       |9                           |5                                |BST    |4    |
|2016                       |9                           |6                                |BST    |5    |
|2016                       |9                           |7                                |BST    |4    |
|2016                       |9                           |8                                |BST    |8    |
|2016                       |9                           |9                                |BST    |6    |
|2016                       |9                           |10                               |BST    |3    |
|2016                       |9                           |11                               |BST    |4    |
|2016                       |9                           |12                               |BST    |3    |
|2016                       |9                           |13                               |BST    |5    |
|2016                       |9                           |14                               |BST    |6    |
|2016                       |9                           |15                               |BST    |20   |
|2016                       |9                           |16                               |BST    |42   |
|2016                       |9                           |17                               |BST    |67   |
|2016                       |9                           |18                               |BST    |52   |
|2016                       |9                           |19                               |BST    |38   |
|2016                       |9                           |20                               |BST    |46   |
|2016                       |9                           |21                               |BST    |19   |
|2016                       |9                           |23                               |BST    |2    |
|2016                       |9                           |24                               |BST    |12   |
|2016                       |9                           |25                               |BST    |7    |
|2016                       |9                           |26                               |BST    |4    |
|2016                       |9                           |27                               |BST    |3    |
|2016                       |9                           |29                               |BST    |5    |
|2016                       |9                           |30                               |BST    |1    |
|2016                       |10                          |1                                |BST    |12   |
|2016                       |10                          |2                                |BST    |3    |
|2016                       |10                          |3                                |BST    |5    |
|2016                       |10                          |4                                |BST    |2    |
|2016                       |10                          |5                                |BST    |2    |
|2016                       |10                          |6                                |BST    |10   |
|2016                       |10                          |8                                |BST    |1    |
|2016                       |10                          |11                               |BST    |5    |
|2016                       |10                          |13                               |BST    |4    |
|2016                       |10                          |15                               |BST    |11   |
|2016                       |10                          |16                               |BST    |13   |
|2016                       |10                          |17                               |BST    |14   |
|2016                       |10                          |18                               |BST    |13   |
|2016                       |10                          |19                               |BST    |18   |
|2016                       |10                          |20                               |BST    |5    |
|2016                       |10                          |21                               |BST    |16   |
|2016                       |10                          |22                               |BST    |46   |
|2016                       |10                          |23                               |BST    |32   |
|2016                       |10                          |24                               |BST    |21   |
|2016                       |10                          |25                               |BST    |15   |
|2016                       |10                          |26                               |BST    |28   |
|2016                       |10                          |27                               |BST    |48   |
|2016                       |10                          |28                               |BST    |60   |
|2016                       |10                          |29                               |BST    |86   |
|2016                       |10                          |30                               |GMT    |63   |
|2016                       |10                          |31                               |GMT    |81   |
|2016                       |11                          |1                                |GMT    |60   |
|2016                       |11                          |2                                |GMT    |105  |
|2016                       |11                          |3                                |GMT    |92   |
|2016                       |11                          |4                                |GMT    |77   |
|2016                       |11                          |5                                |GMT    |99   |
|2016                       |11                          |6                                |GMT    |87   |
|2016                       |11                          |7                                |GMT    |79   |
|2016                       |11                          |8                                |GMT    |77   |
|2016                       |11                          |9                                |GMT    |82   |
|2016                       |11                          |10                               |GMT    |79   |
|2016                       |11                          |11                               |GMT    |99   |
|2016                       |11                          |12                               |GMT    |97   |
|2016                       |11                          |13                               |GMT    |81   |
|2016                       |11                          |14                               |GMT    |62   |
|2016                       |11                          |15                               |GMT    |70   |
|2016                       |11                          |16                               |GMT    |89   |
|2016                       |11                          |17                               |GMT    |89   |
|2016                       |11                          |18                               |GMT    |96   |
|2016                       |11                          |19                               |GMT    |92   |
|2016                       |11                          |20                               |GMT    |80   |
|2016                       |11                          |21                               |GMT    |96   |
|2016                       |11                          |22                               |GMT    |61   |
|2016                       |11                          |23                               |GMT    |74   |
|2016                       |11                          |24                               |GMT    |58   |
|2016                       |11                          |25                               |GMT    |65   |
|2016                       |11                          |26                               |GMT    |80   |
|2016                       |11                          |27                               |GMT    |50   |
|2016                       |11                          |28                               |GMT    |65   |
|2016                       |11                          |29                               |GMT    |49   |
|2016                       |11                          |30                               |GMT    |78   |
|2016                       |12                          |1                                |GMT    |65   |
|2016                       |12                          |2                                |GMT    |60   |
|2016                       |12                          |3                                |GMT    |80   |
|2016                       |12                          |4                                |GMT    |55   |
|2016                       |12                          |5                                |GMT    |40   |
|2016                       |12                          |6                                |GMT    |34   |
|2016                       |12                          |7                                |GMT    |40   |
|2016                       |12                          |8                                |GMT    |48   |
|2016                       |12                          |9                                |GMT    |23   |
|2016                       |12                          |10                               |GMT    |34   |
|2016                       |12                          |11                               |GMT    |19   |
|2016                       |12                          |12                               |GMT    |12   |
|2016                       |12                          |13                               |GMT    |9    |
+---------------------------+----------------------------+---------------------------------+-------+-----+


In [42]:
timing_cols = [n for n in df.columns if 'timestamp' in n]

timing_cols.append('trip_tz')
timing_cols.append('duration_seconds')

df.select( [c for c in timing_cols] ) \
  .withColumn('mydt',F.unix_timestamp(df['end_point_timestamp_utc'])-F.unix_timestamp(df['start_point_timestamp_utc'])) \
  .limit(5).toPandas().head() #(.show(5,truncate=False)
  

df.select( [c for c in timing_cols] ) \
  .withColumn('mydt',F.unix_timestamp(df['end_point_timestamp_utc'])-F.unix_timestamp(df['start_point_timestamp_utc'])) \
  .describe('duration_seconds','mydt') \
  .show() 

df = df.drop('start_point_timestamp_utc','end_point_timestamp_utc')


Out[42]:
end_point_timestamp end_point_timestamp_utc start_point_timestamp start_point_timestamp_utc trip_tz duration_seconds mydt
0 2016-11-24 12:33:01 2016-11-24 12:33:01 2016-11-24 12:25:56 2016-11-24 12:25:56 GMT 425.0 425
1 2016-11-24 12:12:05 2016-11-24 12:12:05 2016-11-24 12:08:10 2016-11-24 12:08:10 GMT 235.0 235
2 2016-11-24 08:28:24 2016-11-24 08:28:24 2016-11-24 07:47:45 2016-11-24 07:47:45 GMT 2439.0 2439
3 2016-11-22 21:39:30 2016-11-22 21:39:30 2016-11-22 21:30:44 2016-11-22 21:30:44 GMT 526.0 526
4 2016-11-22 19:52:22 2016-11-22 19:52:22 2016-11-22 19:42:29 2016-11-22 19:42:29 GMT 593.0 593
+-------+------------------+------------------+
|summary|  duration_seconds|              mydt|
+-------+------------------+------------------+
|  count|              3903|              3903|
|   mean|1164.3219336407892| 1164.322316167051|
| stddev|1374.7726984147887|1374.7725450494704|
|    min|              18.0|                18|
|    max|           17805.0|             17805|
+-------+------------------+------------------+


In [43]:
df = df.withColumnRenamed('start_point_latitude','start_point_lat') \
  .withColumnRenamed('start_point_longitude','start_point_long') \
  .withColumnRenamed('end_point_latitude','end_point_lat') \
  .withColumnRenamed('end_point_longitude','end_point_long')

In [44]:
latlong_cols = ['start_point_lat','start_point_long','end_point_lat','end_point_long']

timestamp_cols = [n for n in df.columns if 'timestamp' in n]

distance_cols = ['gps_distance','obd_distance']

duration_cols = ['duration_seconds','idle_seconds']

In [ ]:
cols = timestamp_cols + duration_cols + latlong_cols + ['gps_distance']

print (" number of rows where idle time  >= trip duration: {}"\
 .format( df.where(df['idle_seconds'] >= df['duration_seconds']).count() )
      )
  
print (" number of rows where idle time  < 0: {}" \
 .format( df.where(df['idle_seconds'] < 0).count() )
      )

df.select(cols)\
  .where( df['idle_seconds'] >= df['duration_seconds'] )\
  .toPandas() #.show(truncate=False)

df.select(cols)\
  .where( df['idle_seconds'] < 0 )\
  .toPandas() #.show(truncate=False)
  
df = df.where( (df['duration_seconds'] > df['idle_seconds']) & (df['idle_seconds'] >= 0) )


print ("rows after selection: {}".format(df.count()))


 number of rows where idle time  >= trip duration: 23
 number of rows where idle time  < 0: 1
Out[ ]:
end_point_timestamp start_point_timestamp duration_seconds idle_seconds start_point_lat start_point_long end_point_lat end_point_long gps_distance
0 2016-11-24 17:42:54 2016-11-24 17:42:11 43.0 43 51.433437 -1.075727 51.433443 -1.075727 0.000414
1 2016-11-23 10:00:57 2016-11-23 10:00:28 29.0 29 53.302647 -3.542960 53.302728 -3.542902 0.000721
2 2016-10-29 11:26:59 2016-10-29 11:26:34 25.0 25 51.136675 -0.032290 51.136668 -0.032273 0.000455
3 2016-11-10 10:42:34 2016-11-10 10:42:12 22.0 22 50.811335 -0.772648 50.811342 -0.772757 0.004693
4 2016-11-10 20:54:07 2016-11-10 20:53:30 37.0 37 53.859813 -1.511138 53.859795 -1.511058 0.003533
5 2016-09-16 12:38:17 2016-09-16 12:34:30 227.0 227 55.958752 -3.123000 55.958738 -3.123000 0.000574
6 2016-10-23 11:32:24 2016-10-23 11:31:49 35.0 35 53.037577 -2.169860 53.037527 -2.169688 0.009418
7 2016-11-30 19:08:07 2016-11-30 19:07:33 34.0 34 51.182787 -0.025903 51.182825 -0.025842 0.003424
8 2016-11-27 21:02:36 2016-11-27 21:01:05 91.0 91 51.182827 -0.025885 51.182792 -0.025848 0.001896
9 2016-09-29 17:43:42 2016-09-29 17:41:21 141.0 141 51.147028 -0.976982 51.147022 -0.976948 0.001428
10 2016-10-27 22:27:13 2016-10-27 22:26:55 18.0 18 53.690805 -2.326915 53.690720 -2.327162 0.012267
11 2016-11-21 17:19:44 2016-11-21 17:19:09 35.0 35 51.637958 -0.468452 51.637945 -0.468437 0.000519
12 2016-10-31 14:22:57 2016-10-31 14:14:34 503.0 503 51.277420 -0.209937 51.277400 -0.209873 0.002766
13 2016-09-18 18:53:03 2016-09-18 18:51:32 91.0 91 53.037508 -2.169713 53.037582 -2.169957 0.010673
14 2016-09-18 09:17:55 2016-09-18 09:16:39 76.0 76 53.037603 -2.169882 53.037527 -2.169740 0.008271
15 2016-11-13 10:16:33 2016-11-13 10:15:56 37.0 37 53.859902 -1.511110 53.859818 -1.511037 0.007509
16 2016-11-03 11:40:35 2016-11-03 11:36:04 271.0 531488141 53.786428 -1.543668 53.786455 -1.543710 0.840600
17 2016-11-11 12:45:16 2016-11-11 12:44:39 37.0 37 54.565188 -5.979235 54.565170 -5.979213 0.000080
18 2016-11-15 19:40:34 2016-11-15 19:39:51 43.0 43 51.536697 -1.024128 51.536788 -1.024143 0.005744
19 2016-09-24 06:31:02 2016-09-24 06:29:01 121.0 121 51.107595 -1.045262 51.107607 -1.045183 0.004769
20 2016-10-29 12:14:50 2016-10-29 12:00:22 868.0 868 55.958568 -3.122465 55.958533 -3.122478 0.003131
21 2016-10-29 15:44:01 2016-10-29 15:43:36 25.0 25 52.779013 -0.175273 52.778958 -0.175177 0.001937
22 2016-10-19 13:02:40 2016-10-19 13:01:14 86.0 86 51.107705 -1.044980 51.107572 -1.045165 0.014811
Out[ ]:
end_point_timestamp start_point_timestamp duration_seconds idle_seconds start_point_lat start_point_long end_point_lat end_point_long gps_distance
0 2016-10-31 10:24:47 2016-10-31 09:57:11 1656.0 -3326 51.074132 -1.771637 51.074427 -1.770632 0.105285
rows after selection: 3879

In [ ]:
df2 = df
df2.count()

In [ ]:
df.select(latlong_cols)\
  .describe()\
  .show()

In [ ]:
df.select('top_speed')\
  .describe()\
  .toPandas() #.show()

df.select( [c for c in df.columns if c.startswith('speeding')] )\
  .describe()\
  .show()

df.select( [c for c in df.columns if 'brake' in c ] )\
  .describe()\
  .show()
  
df.select( [c for c in df.columns if 'accel' in c] )\
  .describe()\
  .show()
  

df.select( [c for c in df.columns if c.startswith('points')] )\
  .describe()\
  .show()

In [ ]:
schema_cols = [(c.name,c.dataType) for c in df.schema]

for c in schema_cols:
  name,dtype = c
  if  dtype == DoubleType():
    df = df.withColumn(name,df[name].cast(FloatType()))
  elif dtype == LongType():
    df = df.withColumn(name,df[name].cast(IntegerType()))
    

df = df.withColumn('duration_seconds',df['duration_seconds'].cast(IntegerType()))

df.printSchema()

In [ ]:
df.limit(5).toPandas().head()

In [ ]:
print (" key entries: {}"\
                      .format( df.select('key').distinct().count() )
      )

df = df.drop('key')

In [ ]:
print (" Number of device_key: {}"\
                               .format( df.select('device_key').distinct().count() ) 
      )

In [ ]:
print ("Number of unique vehicle_key, vehicle_nickname: ",\
       df.select('vehicle_key','vehicle_nickname').groupBy('vehicle_key','vehicle_nickname').count().count()
      )

df.select('vehicle_key','vehicle_nickname').groupBy('vehicle_key','vehicle_nickname').count().orderBy('vehicle_nickname').show(n=100,truncate=False)

In [ ]:
print ("Number of unique device_key, vehicle_nickname: ",\
       df.select('device_key','vehicle_nickname').groupBy('device_key','vehicle_nickname').count().count()
      )

df.select('device_key','vehicle_nickname').groupBy('device_key','vehicle_nickname').count().orderBy('vehicle_nickname').show(n=100,truncate=False)

In [ ]:
device_keys = df.select('device_key').distinct().orderBy('device_key').rdd.flatMap(lambda x:x).collect()

getID_udf = F.udf(lambda k: device_keys.index(k), IntegerType())

df = df.withColumn('device_id',getID_udf(df['device_key']))

In [ ]:
vehicle_names = df.select('vehicle_nickname').distinct().orderBy('vehicle_nickname').rdd.flatMap(lambda x:x).collect()

getID_udf = F.udf(lambda k: vehicle_names.index(k), IntegerType())

df = df.withColumn('vehicle_id',getID_udf(df['vehicle_nickname']))

In [ ]:
df = df.drop('device_key')
df = df.drop('vehicle_key')

In [ ]:
df.printSchema()

In [ ]:
for c in [c for c in df.columns if 'address' in c]:
  df = df.withColumnRenamed(c,c.replace('address_',''))

df.columns

In [ ]:
cols = ['device_id','vehicle_id','vehicle_nickname'] \
       + [c for c in df.columns if c.startswith('start')] \
       + [c for c in df.columns if c.startswith('end')] \
       + ['trip_tz'] \
       + ['duration_seconds','idle_seconds'] \
       + ['gps_distance','obd_distance'] \
       + ['top_speed'] \
       + ['hard_accel_count','hard_brake_count'] \
       + ['points_city_count','points_hwy_count'] \
       + [c for c in df.columns if c.startswith('speeding')] \
       + ['speed_um','distance_um']
       
len(cols)
len(df.columns)

In [ ]:
df = df.select(cols)
df = df.orderBy('start_point_timestamp','vehicle_id')
df.count()

In [ ]:
df.limit(50).toPandas()

In [ ]:
df.select('vehicle_nickname').distinct().toPandas()['vehicle_nickname'].values.tolist()

In [ ]:
cols =   ['device_id','vehicle_id']\
       + ['start_point_lat','start_point_long'] \
       + ['end_point_lat','end_point_long'] \
       + ['duration_seconds','idle_seconds'] \
       + ['gps_distance','obd_distance'] \
       + ['top_speed'] \
       + ['hard_accel_count','hard_brake_count'] \
       + ['points_city_count','points_hwy_count'] \
       + ['speeding_city_major_count','speeding_city_minor_count']\
       + ['speeding_hwy_major_count','speeding_hwy_minor_count'] \
       + ['start_point_timestamp','end_point_timestamp']

df_num = df.select(cols)\
         .orderBy('start_point_timestamp','vehicle_id')\
         .withColumn('start_ts',F.unix_timestamp('start_point_timestamp'))\
         .withColumn('start_y',F.year('start_point_timestamp'))\
         .withColumn('start_m',F.month('start_point_timestamp'))\
         .withColumn('start_day',F.dayofmonth('start_point_timestamp'))\
         .withColumn('start_day2',F.dayofyear('start_point_timestamp'))\
         .withColumn('start_hh',F.hour('start_point_timestamp'))\
         .withColumn('start_mm',F.minute('start_point_timestamp'))\
         .withColumn('start_ss',F.second('start_point_timestamp'))\
         .withColumn('end_ts',F.unix_timestamp('end_point_timestamp'))\
         .withColumn('end_y',F.year('end_point_timestamp'))\
         .withColumn('end_m',F.month('end_point_timestamp'))\
         .withColumn('end_day',F.dayofmonth('end_point_timestamp'))\
         .withColumn('end_day2',F.dayofyear('end_point_timestamp'))\
         .withColumn('end_hh',F.hour('end_point_timestamp'))\
         .withColumn('end_mm',F.minute('end_point_timestamp'))\
         .withColumn('end_ss',F.second('end_point_timestamp'))
        
df_num = df_num.drop('start_point_timestamp','end_point_timestamp')

In [ ]:
df_num.limit(50).toPandas()

In [ ]:
df.limit(50).toPandas()

In [ ]:
pdf = df.toPandas()
pdf.to_pickle('zubie.pkl')

In [ ]:
pdf0 = pd.read_pickle('zubie.pkl')
pdf0.dtypes

In [ ]:
! rm -rf ./output
df.repartition(1).write.parquet('./output')

In [ ]: