In [1]:
sc
Out[1]:
In [4]:
spark
Out[4]:
Figure out what data we have to work with
In [5]:
!gsutil ls gs://asf-diversity-data/
In [23]:
# load() with no explicit format defaults to Parquet
raw_github_data = spark.read.load("gs://asf-diversity-data/raw_git_data/")
In [24]:
# Repartition for better parallelism and cache, since we'll query this repeatedly
github_data = raw_github_data.repartition(20).cache()
github_data.count()
Out[24]:
In [25]:
github_data.schema
Out[25]:
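printSchema() renders the same information as a more readable tree, which is handy for spotting the nested fields we use below:
In [ ]:
github_data.printSchema()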
In [29]:
from pyspark.sql import functions as f
In [30]:
github_data.limit(6).toPandas()
Out[30]:
In [70]:
# I use a map because I'm used to functional programming, but there are other options.
# However, element_at isn't in the Python DataFrame API until Spark 2.4 :(
from pyspark.sql import Row
authors = github_data.select(github_data.data).rdd.map(
    lambda row: Row(author=row['data']['Author'])).toDF()
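The same projection can also stay in the DataFrame API without the RDD round-trip. A minimal sketch, assuming data is a struct (or map) with an Author entry, as the map above reads it:
In [ ]:
# Alternative: pull the nested field directly in the DataFrame API
authors_df_api = github_data.select(github_data.data["Author"].alias("author"))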
In [71]:
authors.limit(5).toPandas()
Out[71]:
In [72]:
# Split on "." (a regex, so escape it) so the TLD ends up as the last element
split_emails = authors.select(f.split(authors.author, r"\.").alias("split_emails"))
In [80]:
def extract_tld(row):
    # Author strings look like "Name <user@host.tld>", so the final
    # "."-separated element is the TLD, possibly with a trailing ">".
    last_elem = row['split_emails'][-1]
    if last_elem.endswith(">"):
        return last_elem[:-1]
    else:
        return last_elem

tlds = split_emails.rdd.map(extract_tld).map(lambda raw_tld: Row(tld=raw_tld)).toDF()
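On Spark 2.4+, the element_at function mentioned earlier makes the RDD detour unnecessary. A sketch under that version assumption, using regexp_replace to strip the trailing ">":
In [ ]:
# Spark >= 2.4 alternative, staying entirely in the DataFrame API
tlds_df_api = split_emails.select(
    f.regexp_replace(f.element_at("split_emails", -1), ">$", "").alias("tld"))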
In [89]:
# Count how many commits come from each TLD
grouped = tlds.groupBy(tlds.tld).agg(f.count("*").alias("num"))
In [92]:
# Sort the TLDs by commit count, most common first
popular_tlds = grouped.orderBy(grouped.num.desc())
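For reference, the grouping and ordering can also be written as one chain; this sketch should be equivalent to the two cells above:
In [ ]:
popular_tlds_chained = (tlds
    .groupBy("tld")
    .agg(f.count("*").alias("num"))
    .orderBy(f.desc("num")))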
In [93]:
popular_tlds.limit(10).toPandas()
Out[93]:
In [19]:
from pyspark.sql.functions import upper
# Scratch cell demoing a built-in string function; note that github_data has no
# obvious "category" column in the schema above, so this likely fails as written.
github_data.select(upper(github_data.category)).show()