In [1]:
import os

import findspark

# Point findspark at the Spark installation. An exported SPARK_HOME wins so the
# notebook is not tied to one machine; the original local install path is kept
# as the fallback when the variable is unset or empty.
findspark.init(os.environ.get('SPARK_HOME')
               or '/Users/hanlei/Downloads/spark-2.2.0-bin-hadoop2.7')
In [2]:
from pyspark.sql import SparkSession
In [3]:
# Create the notebook's session (or reuse one that already exists) under the
# application name 'Basics'.
session_builder = SparkSession.builder.appName('Basics')
spark = session_builder.getOrCreate()
In [5]:
import os

# Resolve the bundled sample file relative to the Spark installation instead of
# a fixed absolute path. findspark.init() exports SPARK_HOME, so after the setup
# cell this points at the same installation; the original path is the fallback.
spark_home = (os.environ.get('SPARK_HOME')
              or '/Users/hanlei/Downloads/spark-2.2.0-bin-hadoop2.7')
people_json = os.path.join(spark_home, 'python', 'test_support', 'sql', 'people.json')

# Load the JSON file into a DataFrame, letting Spark infer the schema.
df = spark.read.json(people_json)
In [6]:
# Print the rows that were loaded from people.json.
df.show()
In [7]:
# Print the column names and types of the DataFrame (no schema was supplied to
# the reader, so these are whatever Spark inferred from the JSON).
df.printSchema()
In [8]:
# Column names of the DataFrame, returned as a plain Python list.
df.columns
Out[8]:
In [9]:
# describe() returns another DataFrame holding summary statistics; without
# .show() the cell only displays that DataFrame object's repr, not the numbers.
df.describe()
Out[9]:
In [10]:
# Print the summary-statistics table itself.
df.describe().show()
In [11]:
from pyspark.sql.types import StructField, StringType, IntegerType, StructType
In [16]:
# Describe each column explicitly: (name, Spark type, nullable).
age_field = StructField('age', IntegerType(), True)
name_field = StructField('name', StringType(), True)

# Field order here is the column order of the resulting schema.
data_schema = [age_field, name_field]
In [18]:
# Wrap the list of fields in a StructType so it can be handed to a reader as a schema.
final_struc = StructType(data_schema)
In [19]:
import os

# Resolve the bundled sample file relative to the Spark installation instead of
# a fixed absolute path. findspark.init() exports SPARK_HOME, so after the setup
# cell this points at the same installation; the original path is the fallback.
spark_home = (os.environ.get('SPARK_HOME')
              or '/Users/hanlei/Downloads/spark-2.2.0-bin-hadoop2.7')
people_json = os.path.join(spark_home, 'python', 'test_support', 'sql', 'people.json')

# Re-read the same file, this time enforcing the explicit schema rather than
# relying on inference. This intentionally rebinds `df`, which later cells use.
df = spark.read.json(people_json, schema=final_struc)
In [20]:
# Print the schema of the re-read DataFrame to check it against final_struc.
df.printSchema()
In [21]:
# Print the rows again, now read with the explicit schema.
df.show()
In [22]:
# Indexing by column name yields a column object, not the data itself; the
# cell output is that object's repr.
df['age']
Out[22]:
In [23]:
# Inspect the Python type of a single-column lookup.
type(df['age'])
Out[23]:
In [24]:
# select() gives back a DataFrame containing only 'age'; without .show() the
# cell displays just that DataFrame's repr.
df.select('age')
Out[24]:
In [25]:
# Print the values of the 'age' column.
df.select('age').show()
In [26]:
# Inspect the Python type of a select() result, for comparison with type(df['age']).
type(df.select('age'))
Out[26]:
In [27]:
# Bring the first two rows back to Python as a list of row objects.
df.head(2)
Out[27]:
In [28]:
# First element of the two-row list returned by head(2).
df.head(2)[0]
Out[28]:
In [29]:
# Inspect the Python type of a single row taken from head(2).
type(df.head(2)[0])
Out[29]:
In [30]:
# Select several columns at once by passing a list of names, then print them.
df.select(['age', 'name']).show()
In [35]:
# Print df with an extra derived column 'double_age' (age * 2). The returned
# DataFrame is not assigned to anything, so `df` itself is left as it was.
df.withColumn('double_age', df['age']*2).show()
In [36]:
# Print df again to show it is unchanged: the withColumn() result above was
# only displayed, never assigned back to `df`.
df.show()
In [37]:
# Print df with 'age' renamed to 'my_new_age'. The result is not assigned, so
# `df` keeps its original column name.
df.withColumnRenamed('age','my_new_age').show()
In [38]:
# Register df as the temporary view 'people' so it can be queried by name
# through spark.sql().
df.createOrReplaceTempView('people')
In [39]:
# Query the 'people' temporary view with plain SQL; the result is a DataFrame.
select_all_query = "SELECT * FROM people"
results = spark.sql(select_all_query)
In [40]:
# Print every row returned by the SELECT * query.
results.show()
In [41]:
# Same view, filtered in SQL to the rows whose age equals 30.
age_30_query = "SELECT * FROM people WHERE age=30"
new_result = spark.sql(age_30_query)
In [42]:
# Print only the rows that matched the age=30 filter.
new_result.show()
In [ ]:
In [ ]:
In [ ]: