In [1]:
%%info
In [2]:
import pandas as pd
from pyspark import SparkFiles
In [3]:
# Download cases_country.csv from the CSSEGISandData/COVID-19 repository and
# register it with Spark so it can be looked up by name through SparkFiles.
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/web-data/data/cases_country.csv'
spark.sparkContext.addFile(url)
# Take the file name from the URL instead of repeating it as a literal, so
# changing `url` cannot leave the read below pointing at a stale name.
csv_name = url.rsplit('/', 1)[-1]
# Load the CSV into a Spark DataFrame (not an RDD): the first row is used as
# the header and column types are inferred from the data.
# NOTE(review): the file:// path comes from SparkFiles.get() on the driver;
# confirm it is readable wherever the read actually executes.
cases_country = spark.read.csv("file://" + SparkFiles.get(csv_name), header=True, inferSchema=True)
cases_country.printSchema()
In [5]:
# Print the distinct (ISO3, Country_Region) pairs present in the data.
(
    cases_country
    .select('ISO3', 'Country_Region')
    .distinct()
    .show()
)
In [7]:
# Per-country case counts, ordered so the largest Active value comes first;
# only the first 20 rows are printed.
(
    cases_country
    .select('ISO3', 'Country_Region', 'Confirmed', 'Recovered', 'Deaths', 'Active', 'Last_Update')
    .orderBy(cases_country['Active'].desc())
    .show(20)
)
In [9]:
# Register the DataFrame as the temp view 'CasesCountryTable' (replacing any
# existing view of that name) so the %%sql cells can query it by name.
cases_country.createOrReplaceTempView('CasesCountryTable')
In [10]:
%%sql
-- Return every column of every row in the CasesCountryTable temp view.
SELECT *
FROM CasesCountryTable
In [11]:
%%sql
-- Rows for six selected countries, matched on their ISO3 code.
-- A single IN list replaces the original OR chain, which mixed the == and =
-- operators for the same equality test; the rows returned are unchanged.
SELECT *
FROM CasesCountryTable
WHERE ISO3 IN ('USA', 'GBR', 'ITA', 'ESP', 'FRA', 'RUS')
In [ ]: