In [1]:
# Download the raw telemetry JSON dataset from GitHub into the working directory.
# NOTE(review): re-running this cell creates telemetry3.json.1, .2, ... — consider `wget -nc`.
!wget https://raw.githubusercontent.com/romeokienzler/uhack/master/projects/bsusat/telemetry3.json
In [2]:
# Read the downloaded telemetry file into a DataFrame (schema is inferred
# from the JSON), then materialize the first five rows as a sanity check.
df_data_1 = spark.read.format("json").load("telemetry3.json")
df_data_1.take(5)
Out[2]:
In [3]:
# Total number of telemetry records (triggers a full scan of the file).
df_data_1.count()
Out[3]:
In [4]:
# Short alias used by the rest of the notebook.
df = df_data_1
In [5]:
# Register the DataFrame as a temporary view so it can be queried via Spark SQL.
df.createOrReplaceTempView('df')
In [6]:
# Sanity check: query the view and print the first 20 rows.
spark.sql('select * from df').show()
In [7]:
# Inspect the inferred schema before casting the numeric fields (next cell).
df.printSchema()
In [10]:
# Cast the voltage/temperature telemetry channels to float, keeping the
# raw timestamp column untouched. Column order matches the original select.
numeric_cols = ["battery_voltage", "pamp_voltage", "system_voltage", "pamp_temp"]
df_cast = df.select(df.timestamp, *[df[c].cast("float") for c in numeric_cols])
In [11]:
# Verify the float casts took effect.
df_cast.printSchema()
In [12]:
# Preview the cast DataFrame.
df_cast.show()
In [13]:
df_cast.createOrReplaceTempView('df_cast')
# BUG FIX: the literal characters T and Z inside a datetime pattern must be
# single-quoted ('T', 'Z'); unquoted they make the pattern invalid and
# unix_timestamp fails to parse the ISO-8601 strings. The pattern literal is
# double-quoted in SQL so the embedded single quotes need no escaping.
spark.sql("SELECT *, unix_timestamp(timestamp, \"yyyy-MM-dd'T'HH:mm:ss.SSS'Z'\") AS ts FROM df_cast").show()
In [14]:
# BUG FIXES: (1) unix_timestamp and TimestampType were never imported;
# (2) .cast() needs a DataType INSTANCE — TimestampType(), not the class;
# (3) DataFrames are immutable, so the withColumn result must be assigned
#     (the original line was a no-op). Assigned to a new name so the
#     downstream cells that derive from df_cast are unaffected.
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import TimestampType
df_cast_with_ts = df_cast.withColumn("ts", unix_timestamp("timestamp", "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast(TimestampType()))
df_cast_with_ts.printSchema()
In [15]:
from pyspark.sql.functions import from_unixtime, to_timestamp
from pyspark.sql.types import DateType
# BUG FIX: from_unixtime expects seconds-since-epoch, but `timestamp` is an
# ISO-8601 string (see the parse patterns used above), so the original
# produced NULLs. Parse the string explicitly with to_timestamp, then cast
# to DateType as before (truncates to the calendar date).
df_cast_ts = df_cast.withColumn('ts', to_timestamp('timestamp', "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast(DateType()))
In [16]:
# Arrange the rows chronologically by the raw timestamp column
# (sort is an exact alias of orderBy).
df_cast_ts_ordered = df_cast_ts.sort("timestamp")
In [17]:
# PixieDust's display() renders an interactive table/chart widget for the DataFrame.
import pixiedust
display(df_cast_ts_ordered)
In [18]:
# Plain-text preview of the ordered data.
df_cast_ts_ordered.show()
In [22]:
# Pack the four numeric telemetry channels into a single "features" vector
# column, the input format Spark ML estimators expect.
from pyspark.ml.feature import VectorAssembler
feature_columns = ["battery_voltage", "pamp_voltage", "system_voltage", "pamp_temp"]
vectorAssembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
In [23]:
# K-means with two clusters and a fixed seed for reproducible assignments
# (constructor keywords are equivalent to the setK/setSeed setters).
from pyspark.ml.clustering import KMeans
kmeans = KMeans(k=2, seed=1)
In [24]:
# Chain feature assembly and clustering into a two-stage ML pipeline.
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[vectorAssembler, kmeans])
In [25]:
# Train the pipeline on the ordered telemetry data.
model = pipeline.fit(df_cast_ts_ordered)
In [26]:
# WSSSE of the fitted KMeans stage over the assembled feature vectors.
# NOTE(review): KMeansModel.computeCost is deprecated since Spark 2.4 and
# removed in 3.0 — migrate to model.stages[1].summary.trainingCost or
# pyspark.ml.evaluation.ClusteringEvaluator before upgrading Spark.
wssse = model.stages[1].computeCost(vectorAssembler.transform(df_cast_ts_ordered))
print("Within Set Sum of Squared Errors = " + str(wssse))
In [27]:
# Inspect the class of the fitted clustering stage.
type(model.stages[1])
Out[27]:
In [28]:
# Run the fitted pipeline over the data and list the distinct cluster labels
# that were assigned (dropDuplicates with no args is an alias of distinct).
transformed = model.transform(df_cast_ts_ordered)
transformed.select('prediction').dropDuplicates().show()
In [29]:
# Expose the clustering output to Spark SQL for the aggregate queries below.
transformed.createOrReplaceTempView('transformed')
In [30]:
# Average system voltage within cluster 0.
spark.sql('select avg(system_voltage) from transformed where prediction = 0').show()
In [31]:
# How many cluster-1 rows have system_voltage above 100 (candidate anomalies?).
spark.sql('select count(system_voltage) from transformed where prediction = 1 and system_voltage >100').show()
In [29]:
# Total number of rows assigned to cluster 1.
spark.sql('select count(system_voltage) from transformed where prediction = 1').show()