In [1]:
!wget https://raw.githubusercontent.com/romeokienzler/uhack/master/projects/bsusat/telemetry3.json


Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20190416083213-0000
KERNEL_ID = 02187022-c342-4d76-af8b-a0151123e674
--2019-04-16 08:32:16--  https://raw.githubusercontent.com/romeokienzler/uhack/master/projects/bsusat/telemetry3.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.8.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|199.232.8.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 454538 (444K) [text/plain]
Saving to: 'telemetry3.json'

telemetry3.json     100%[===================>] 443.88K  --.-KB/s    in 0.02s   

2019-04-16 08:32:16 (23.8 MB/s) - 'telemetry3.json' saved [454538/454538]


In [2]:
df_data_1 = spark.read.json('telemetry3.json')
df_data_1.take(5)


Out[2]:
[Row(EPS_can_status='OK', MCUSR='5', OBC_can_status='TIMEOUT', RF_ID='0', battery_voltage='8.35', data_type='3', info_size='43', modem_on_period='10', observer='JA0CAW', opr_time='336', pamp_temp='34.26', pamp_voltage='2.2', pwr_save_state='on', reboot_cnt='48', seq_number='1', system_voltage='8.72', timestamp='2018-11-13T10:49:18.900Z', tx_attenuator='8'),
 Row(EPS_can_status='OK', MCUSR='5', OBC_can_status='TIMEOUT', RF_ID='0', battery_voltage='8.35', data_type='3', info_size='43', modem_on_period='10', observer='JesperLF', opr_time='565', pamp_temp='32.75', pamp_voltage='2.2', pwr_save_state='on', reboot_cnt='46', seq_number='1', system_voltage='8.72', timestamp='2018-11-13T09:33:07.680Z', tx_attenuator='8'),
 Row(EPS_can_status='OK', MCUSR='5', OBC_can_status='TIMEOUT', RF_ID='0', battery_voltage='8.35', data_type='3', info_size='43', modem_on_period='10', observer='JesperLF', opr_time='475', pamp_temp='32.75', pamp_voltage='2.2', pwr_save_state='on', reboot_cnt='46', seq_number='1', system_voltage='8.72', timestamp='2018-11-13T09:31:37.690Z', tx_attenuator='8'),
 Row(EPS_can_status='OK', MCUSR='5', OBC_can_status='TIMEOUT', RF_ID='1', battery_voltage='8.32', data_type='3', info_size='43', modem_on_period='10', observer='JesperLF', opr_time='1014', pamp_temp='34.26', pamp_voltage='2.2', pwr_save_state='on', reboot_cnt='38', seq_number='1', system_voltage='4.98', timestamp='2018-11-13T08:00:36.800Z', tx_attenuator='8'),
 Row(EPS_can_status='OK', MCUSR='5', OBC_can_status='TIMEOUT', RF_ID='1', battery_voltage='8.32', data_type='3', info_size='43', modem_on_period='10', observer='JesperLF', opr_time='966', pamp_temp='34.64', pamp_voltage='2.2', pwr_save_state='on', reboot_cnt='38', seq_number='1', system_voltage='4.98', timestamp='2018-11-13T07:59:48.690Z', tx_attenuator='8')]

In [3]:
df_data_1.count()


Out[3]:
1155

In [4]:
df = df_data_1

In [5]:
df.createOrReplaceTempView('df')

In [6]:
spark.sql('select * from df').show()


+--------------+-----+--------------+-----+---------------+---------+---------+---------------+--------+--------+---------+------------+--------------+----------+----------+--------------+--------------------+-------------+
|EPS_can_status|MCUSR|OBC_can_status|RF_ID|battery_voltage|data_type|info_size|modem_on_period|observer|opr_time|pamp_temp|pamp_voltage|pwr_save_state|reboot_cnt|seq_number|system_voltage|           timestamp|tx_attenuator|
+--------------+-----+--------------+-----+---------------+---------+---------+---------------+--------+--------+---------+------------+--------------+----------+----------+--------------+--------------------+-------------+
|            OK|    5|       TIMEOUT|    0|           8.35|        3|       43|             10|  JA0CAW|     336|    34.26|         2.2|            on|        48|         1|          8.72|2018-11-13T10:49:...|            8|
|            OK|    5|       TIMEOUT|    0|           8.35|        3|       43|             10|JesperLF|     565|    32.75|         2.2|            on|        46|         1|          8.72|2018-11-13T09:33:...|            8|
|            OK|    5|       TIMEOUT|    0|           8.35|        3|       43|             10|JesperLF|     475|    32.75|         2.2|            on|        46|         1|          8.72|2018-11-13T09:31:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.32|        3|       43|             10|JesperLF|    1014|    34.26|         2.2|            on|        38|         1|          4.98|2018-11-13T08:00:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.32|        3|       43|             10|JesperLF|     966|    34.64|         2.2|            on|        38|         1|          4.98|2018-11-13T07:59:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.35|        1|        3|             10|JesperLF|     932|    35.02|         2.2|            on|        38|         1|          4.98|2018-11-13T07:59:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.35|      254|       64|             10|JesperLF|     927|    34.64|         2.2|            on|        38|         1|          4.98|2018-11-13T07:59:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.35|        1|        3|             10|JesperLF|     926|    34.64|         2.2|            on|        38|         1|          4.98|2018-11-13T07:59:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.35|      254|       64|             10|JesperLF|     924|    34.64|         2.2|            on|        38|         1|          4.98|2018-11-13T07:59:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.35|        1|        3|             10|JesperLF|     923|    34.26|         2.2|            on|        38|         1|          4.98|2018-11-13T07:59:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.35|      254|       64|             10|JesperLF|     921|    34.26|         2.2|            on|        38|         1|          4.98|2018-11-13T07:59:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.35|      254|       64|             10|JesperLF|     882|    34.64|         2.2|            on|        38|         1|          4.98|2018-11-13T07:58:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.35|        1|        3|             10|JesperLF|     881|    34.26|         2.2|            on|        38|         1|          4.98|2018-11-13T07:58:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.35|      254|       64|             10|JesperLF|     878|    33.88|         2.2|            on|        38|         1|          4.98|2018-11-13T07:58:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.35|        1|        3|             10|JesperLF|     876|    33.88|         2.2|            on|        38|         1|          4.98|2018-11-13T07:58:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.35|        3|       43|             10|JesperLF|     876|     33.5|         2.2|            on|        38|         1|          4.98|2018-11-13T07:58:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.35|        3|       43|             10|JesperLF|     741|    33.88|         2.2|            on|        38|         1|          4.98|2018-11-13T07:56:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.35|        3|       43|             10|JesperLF|     696|    33.88|         2.2|            on|        38|         1|          4.98|2018-11-13T07:55:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.35|      254|       64|             10|JesperLF|     651|     33.5|         2.2|            on|        38|         1|          4.98|2018-11-13T07:54:...|            8|
|            OK|    5|       TIMEOUT|    1|           8.35|        3|       43|             10|  IK8XLD|     468|     35.4|         2.2|            on|        36|         1|          4.98|2018-11-13T06:31:...|            8|
+--------------+-----+--------------+-----+---------------+---------+---------+---------------+--------+--------+---------+------------+--------------+----------+----------+--------------+--------------------+-------------+
only showing top 20 rows


In [7]:
df.printSchema()


root
 |-- EPS_can_status: string (nullable = true)
 |-- MCUSR: string (nullable = true)
 |-- OBC_can_status: string (nullable = true)
 |-- RF_ID: string (nullable = true)
 |-- battery_voltage: string (nullable = true)
 |-- data_type: string (nullable = true)
 |-- info_size: string (nullable = true)
 |-- modem_on_period: string (nullable = true)
 |-- observer: string (nullable = true)
 |-- opr_time: string (nullable = true)
 |-- pamp_temp: string (nullable = true)
 |-- pamp_voltage: string (nullable = true)
 |-- pwr_save_state: string (nullable = true)
 |-- reboot_cnt: string (nullable = true)
 |-- seq_number: string (nullable = true)
 |-- system_voltage: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- tx_attenuator: string (nullable = true)


In [10]:
df_cast = df.select(
        df.timestamp,
        df.battery_voltage.cast("float"),
        df.pamp_voltage.cast("float"),
        df.system_voltage.cast("float"),
        df.pamp_temp.cast("float")
    )

In [11]:
df_cast.printSchema()


root
 |-- timestamp: string (nullable = true)
 |-- battery_voltage: float (nullable = true)
 |-- pamp_voltage: float (nullable = true)
 |-- system_voltage: float (nullable = true)
 |-- pamp_temp: float (nullable = true)


In [12]:
df_cast.show()


+--------------------+---------------+------------+--------------+---------+
|           timestamp|battery_voltage|pamp_voltage|system_voltage|pamp_temp|
+--------------------+---------------+------------+--------------+---------+
|2018-11-13T10:49:...|           8.35|         2.2|          8.72|    34.26|
|2018-11-13T09:33:...|           8.35|         2.2|          8.72|    32.75|
|2018-11-13T09:31:...|           8.35|         2.2|          8.72|    32.75|
|2018-11-13T08:00:...|           8.32|         2.2|          4.98|    34.26|
|2018-11-13T07:59:...|           8.32|         2.2|          4.98|    34.64|
|2018-11-13T07:59:...|           8.35|         2.2|          4.98|    35.02|
|2018-11-13T07:59:...|           8.35|         2.2|          4.98|    34.64|
|2018-11-13T07:59:...|           8.35|         2.2|          4.98|    34.64|
|2018-11-13T07:59:...|           8.35|         2.2|          4.98|    34.64|
|2018-11-13T07:59:...|           8.35|         2.2|          4.98|    34.26|
|2018-11-13T07:59:...|           8.35|         2.2|          4.98|    34.26|
|2018-11-13T07:58:...|           8.35|         2.2|          4.98|    34.64|
|2018-11-13T07:58:...|           8.35|         2.2|          4.98|    34.26|
|2018-11-13T07:58:...|           8.35|         2.2|          4.98|    33.88|
|2018-11-13T07:58:...|           8.35|         2.2|          4.98|    33.88|
|2018-11-13T07:58:...|           8.35|         2.2|          4.98|     33.5|
|2018-11-13T07:56:...|           8.35|         2.2|          4.98|    33.88|
|2018-11-13T07:55:...|           8.35|         2.2|          4.98|    33.88|
|2018-11-13T07:54:...|           8.35|         2.2|          4.98|     33.5|
|2018-11-13T06:31:...|           8.35|         2.2|          4.98|     35.4|
+--------------------+---------------+------------+--------------+---------+
only showing top 20 rows


In [13]:
df_cast.createOrReplaceTempView('df_cast')
spark.sql("SELECT *, unix_timestamp(timestamp, 'yyyy-MM-ddTHH:mm:ss.SSSZ') from df_cast").show()


+--------------------+---------------+------------+--------------+---------+---------------------------------------------------+
|           timestamp|battery_voltage|pamp_voltage|system_voltage|pamp_temp|unix_timestamp(timestamp, yyyy-MM-ddTHH:mm:ss.SSSZ)|
+--------------------+---------------+------------+--------------+---------+---------------------------------------------------+
|2018-11-13T10:49:...|           8.35|         2.2|          8.72|    34.26|                                               null|
|2018-11-13T09:33:...|           8.35|         2.2|          8.72|    32.75|                                               null|
|2018-11-13T09:31:...|           8.35|         2.2|          8.72|    32.75|                                               null|
|2018-11-13T08:00:...|           8.32|         2.2|          4.98|    34.26|                                               null|
|2018-11-13T07:59:...|           8.32|         2.2|          4.98|    34.64|                                               null|
|2018-11-13T07:59:...|           8.35|         2.2|          4.98|    35.02|                                               null|
|2018-11-13T07:59:...|           8.35|         2.2|          4.98|    34.64|                                               null|
|2018-11-13T07:59:...|           8.35|         2.2|          4.98|    34.64|                                               null|
|2018-11-13T07:59:...|           8.35|         2.2|          4.98|    34.64|                                               null|
|2018-11-13T07:59:...|           8.35|         2.2|          4.98|    34.26|                                               null|
|2018-11-13T07:59:...|           8.35|         2.2|          4.98|    34.26|                                               null|
|2018-11-13T07:58:...|           8.35|         2.2|          4.98|    34.64|                                               null|
|2018-11-13T07:58:...|           8.35|         2.2|          4.98|    34.26|                                               null|
|2018-11-13T07:58:...|           8.35|         2.2|          4.98|    33.88|                                               null|
|2018-11-13T07:58:...|           8.35|         2.2|          4.98|    33.88|                                               null|
|2018-11-13T07:58:...|           8.35|         2.2|          4.98|     33.5|                                               null|
|2018-11-13T07:56:...|           8.35|         2.2|          4.98|    33.88|                                               null|
|2018-11-13T07:55:...|           8.35|         2.2|          4.98|    33.88|                                               null|
|2018-11-13T07:54:...|           8.35|         2.2|          4.98|     33.5|                                               null|
|2018-11-13T06:31:...|           8.35|         2.2|          4.98|     35.4|                                               null|
+--------------------+---------------+------------+--------------+---------+---------------------------------------------------+
only showing top 20 rows


In [14]:
df_cast.withColumn("ts", unix_timestamp("timestamp", "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast(TimestampType))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-81171a6439c3> in <module>()
----> 1 df_cast.withColumn("ts", unix_timestamp("timestamp", "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast(TimestampType))

NameError: name 'unix_timestamp' is not defined

In [15]:
from pyspark.sql.functions import from_unixtime
from pyspark.sql.types import DateType
df_cast_ts = df_cast.withColumn('ts', from_unixtime('timestamp').cast(DateType()))

In [16]:
df_cast_ts_ordered = df_cast_ts.orderBy('timestamp')

In [17]:
import pixiedust
display(df_cast_ts_ordered)


Hey, there's something awesome here! To see it, open this notebook outside GitHub, in a viewer like Jupyter

In [18]:
df_cast_ts_ordered.show()


+--------------------+---------------+------------+--------------+---------+----+
|           timestamp|battery_voltage|pamp_voltage|system_voltage|pamp_temp|  ts|
+--------------------+---------------+------------+--------------+---------+----+
|                    |           8.35|         2.2|         510.0|     35.4|null|
|                    |           8.35|         2.2|         510.0|    34.64|null|
|                    |       8.342743|         2.2|         510.0|35.017563|null|
|                    |           8.35|         2.2|         510.0|    35.02|null|
|                    |           8.35|         2.2|         510.0|    35.02|null|
|                    |       8.345032|         2.2|         510.0|35.017563|null|
|                    |           8.35|         2.2|         510.0|    35.02|null|
|                    |           8.34|         2.2|         510.0|    34.64|null|
|                    |           8.35|         2.2|         510.0|   35.018|null|
|2018-10-29 16:14:...|           8.35|         2.2|         891.0|    33.88|null|
|2018-10-29 21:24:...|           8.35|         2.2|         890.0|    33.88|null|
|2018-10-29 21:26:...|           8.35|         2.2|         891.0|    34.64|null|
|2018-10-29 21:27:...|           8.35|         2.2|         891.0|    35.01|null|
|2018-10-29 21:28:...|           8.35|         2.2|         890.0|    36.15|null|
|2018-10-29 21:57:...|           8.35|         2.2|         510.0|    37.29|null|
|2018-10-29 22:20:...|           8.35|         2.2|         890.0|    33.88|null|
|2018-10-30 00:04:...|           8.31|         2.2|         809.0|    34.64|null|
|2018-10-30 11:27:...|           8.34|         2.2|         891.0|    38.42|null|
|2018-10-30 11:28:...|           8.35|         2.2|         891.0|    37.67|null|
|2018-10-30 11:29:...|           8.32|         2.2|         809.0|    38.05|null|
+--------------------+---------------+------------+--------------+---------+----+
only showing top 20 rows


In [22]:
from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(inputCols=["battery_voltage","pamp_voltage","system_voltage","pamp_temp"],
                                  outputCol="features")

In [23]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans().setK(2).setSeed(1)

In [24]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[vectorAssembler, kmeans])

In [25]:
model = pipeline.fit(df_cast_ts_ordered)

In [26]:
wssse = model.stages[1].computeCost(vectorAssembler.transform(df_cast_ts_ordered))
print("Within Set Sum of Squared Errors = " + str(wssse))


Within Set Sum of Squared Errors = 5822416.880138148

In [27]:
type(model.stages[1])


Out[27]:
pyspark.ml.clustering.KMeansModel

In [28]:
transformed =  model.transform(df_cast_ts_ordered)
transformed.select('prediction').distinct().show()


+----------+
|prediction|
+----------+
|         1|
|         0|
+----------+


In [29]:
transformed.createOrReplaceTempView('transformed')

In [30]:
spark.sql('select avg(system_voltage) from transformed where prediction = 0').show()


+-------------------+
|avg(system_voltage)|
+-------------------+
| 7.4811055684428185|
+-------------------+


In [31]:
spark.sql('select count(system_voltage) from transformed where prediction = 1 and system_voltage >100').show()


+---------------------+
|count(system_voltage)|
+---------------------+
|                  169|
+---------------------+


In [29]:
spark.sql('select count(system_voltage) from transformed where prediction = 1').show()


+---------------------+
|count(system_voltage)|
+---------------------+
|                  169|
+---------------------+