In [ ]:
In [ ]:
# Download the 2015 Syracuse road-ratings CSV into the working directory (IPython shell escape).
!wget https://raw.githubusercontent.com/daniel-acuna/hackathon_syracuse/master/data/RoadRatings2015.csv
In [ ]:
# Download the pothole-reports CSV into the working directory (IPython shell escape).
!wget https://raw.githubusercontent.com/daniel-acuna/hackathon_syracuse/master/data/potholes.csv
In [ ]:
# Sanity check: confirm the notebook's working directory (the .load() paths below assume /home/ischool/spark_demo).
!pwd
In [ ]:
# Load the road-ratings CSV via the spark-csv package: first row is the
# header, column types are inferred, and the literal string 'NA' is null.
roadratings_df = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferSchema="true", nullValue="NA")
    .load("file:///home/ischool/spark_demo/RoadRatings2015.csv")
)
In [ ]:
# Load the pothole-reports CSV with the same reader settings as the
# road-ratings load above (header row, inferred types, 'NA' as null).
potholes_df = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferSchema="true", nullValue="NA")
    .load("file:///home/ischool/spark_demo/potholes.csv")
)
In [ ]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
In [ ]:
# Project the road-ratings frame down to the columns used downstream.
rr_df = roadratings_df.select(
    'streetID', 'overall', 'crack', 'patch', 'length', 'width')
In [ ]:
# Project the potholes frame down to the street id and coordinates.
ph_df = potholes_df.select('STREET_ID', 'Latitude', 'Longitude')
In [ ]:
from pyspark.sql import functions
In [ ]:
# Count pothole reports per street and show the worst streets first.
pothole_counts_df = ph_df.groupBy('STREET_ID').agg(
    functions.count('*').alias('n_potholes'))
pothole_counts_df.orderBy(functions.desc('n_potholes')).show()
In [ ]:
## Your code here
In [ ]:
# Per-street pothole counts joined to a human-readable street name.
# Fixes two defects in the original chain: (1) joining the aggregated counts
# back to the raw potholes frame (one row per pothole) duplicated each
# street's result row once per pothole — deduplicate the
# (STREET_ID, strLocation) pairs before joining; (2) Spark does not guarantee
# a sort order survives a join, so order AFTER joining, not before.
pothole_counts = ph_df.groupBy('STREET_ID').agg(
    functions.count('*').alias('n_potholes'))
street_names = potholes_df.select('STREET_ID', 'strLocation').distinct()
pothole_counts.join(street_names, 'STREET_ID')\
    .orderBy(functions.desc('n_potholes')).show()
In [ ]:
# Sanity check: list the column names of the road-ratings projection.
rr_df.columns
In [ ]:
# Inner equi-join of road ratings to pothole reports on the street id.
# NOTE(review): ph_df has one row per pothole, so each street's rating row is
# duplicated once per pothole here — presumably intended so streets with more
# potholes weigh more in the regression below; confirm. Both key columns
# ('streetID' and 'STREET_ID') remain in the joined frame.
dataset_df = rr_df.join(ph_df, rr_df['streetID'] == ph_df['STREET_ID'])
In [ ]:
# Inspect the joined schema (column names and inferred types).
dataset_df.printSchema()
In [ ]:
# Linear regression: predict the 'overall' road rating from the assembled
# 'features' vector column.
lr = LinearRegression(featuresCol='features', labelCol='overall')
In [ ]:
# Assemble the single predictor column 'length' into the 'features' vector.
va = VectorAssembler(inputCols=['length'], outputCol='features')
In [ ]:
# Two-stage pipeline: vector assembly, then the regression fit.
pl = Pipeline(stages=[va, lr])
In [ ]:
# Build the model input: cast the predictor and the label to double and drop
# rows with a missing value in either column.
length_col = dataset_df['length'].cast('double').alias('length')
overall_col = dataset_df['overall'].cast('double').alias('overall')
input_dataset_df = dataset_df.select(length_col, overall_col).dropna()
In [ ]:
# 80/20 train/test split. Fixed seed so the split — and every count, fit, and
# prediction below — is reproducible across notebook runs (the original had
# no seed, so results changed on every execution).
training_df, testing_df = input_dataset_df.randomSplit([0.8, 0.2], seed=42)
In [ ]:
# Number of training rows.
training_df.count()
In [ ]:
# Number of held-out test rows.
testing_df.count()
In [ ]:
# Fit the assemble-then-regress pipeline on the training split.
pl_fit = pl.fit(training_df)
In [ ]:
# Show training-set predictions (adds 'features' and 'prediction' columns).
pl_fit.transform(training_df).show()
In [ ]:
# NOTE(review): the next two counts repeat the cells above — presumably
# leftover re-runs; harmless but removable.
training_df.count()
In [ ]:
testing_df.count()
In [ ]:
# Show predictions on the held-out test split.
pl_fit.transform(testing_df).show()
In [ ]:
In [ ]: