YELP review dataset: http://www.yelp.com/dataset_challenge
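Note: reading a CSV file directly with Spark requires loading the spark-csv package; the cells below instead read the file with pandas and convert it to a Spark DataFrame.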
In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Only one SparkContext may be active per process, so calling the
# SparkContext constructor again when this cell is re-run raises
# ValueError. getOrCreate returns the active context if there is one and
# otherwise creates it with the same master ('local') and app name
# ('example') as before.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster('local').setAppName('example')
)
# SQL entry point used by the following cells to build DataFrames.
sqlContext = SQLContext(sc)
In [2]:
import pandas as pd

# Read the review data into a pandas DataFrame first, then convert that
# into a Spark DataFrame through the SQLContext.
pandas_df = pd.read_csv(filepath_or_buffer='data/yelp_data.csv')
yelp_df = sqlContext.createDataFrame(data=pandas_df)
In [3]:
# Print the schema of the Spark DataFrame built from the CSV.
yelp_df.printSchema()
In [4]:
# Number of rows in the DataFrame.
yelp_df.count()
Out[4]:
In [6]:
# Attribute-style column access: evaluates to a Column expression for
# "useful", not to the column's data.
yelp_df.useful
Out[6]:
In [7]:
# Index-style column access: the same "useful" column, referenced by name
# with [] instead of attribute syntax.
yelp_df["useful"]
Out[7]:
In [8]:
# Count the reviews rated strictly above 3 stars.
(
    yelp_df
    .filter(yelp_df["stars"] > 3)
    .count()
)
Out[8]:
In [9]:
# Mean star rating, collected to the driver as a list of Row objects.
(
    yelp_df
    .select("stars")
    .agg({"stars": "mean"})
    .collect()
)
Out[9]:
In [10]:
# Highest star rating, collected to the driver as a list of Row objects.
(
    yelp_df
    .select("stars")
    .agg({"stars": "max"})
    .collect()
)
Out[10]:
In [11]:
# Lowest star rating, collected to the driver as a list of Row objects.
(
    yelp_df
    .select("stars")
    .agg({"stars": "min"})
    .collect()
)
Out[11]:
In [12]:
# Same minimum-rating aggregation, but printed as a table with show()
# instead of being collected.
(
    yelp_df
    .select("stars")
    .agg({"stars": "min"})
    .show()
)
In [13]:
# Project three columns and bring the first five rows back to the driver.
(
    yelp_df
    .select("id", "useful", "stars")
    .take(5)
)
Out[13]:
In [14]:
# Show the rating next to a derived column: the rating multiplied by 2.3.
(
    yelp_df
    .select('stars', yelp_df['stars'] * 2.3)
    .show(5)
)
In [15]:
# Same derived column, cast to an integer type.
(
    yelp_df
    .select('stars', (yelp_df['stars'] * 2.3).cast("int"))
    .show(5)
)
In [16]:
# Same derived integer column, now given the readable name "new_stars".
(
    yelp_df
    .select(
        'stars',
        (yelp_df['stars'] * 2.3).cast("int").alias('new_stars'),
    )
    .show(5)
)
In [17]:
# desc is imported alongside asc but is not used in this cell.
from pyspark.sql.functions import asc, desc

# Display the ten lowest-rated rows: sort ascending by "stars".
(
    yelp_df
    .select("id", "stars")
    .orderBy(asc("stars"))
    .show(10)
)
In [18]:
# Number of reviews per state.
(
    yelp_df
    .groupBy('state')
    .count()
    .show()
)
In [19]:
# Average star rating per state.
(
    yelp_df
    .groupBy('state')
    .avg('stars')
    .show()
)
In [ ]: