In [1]:
sc


Out[1]:

SparkContext

Spark UI

Version: v2.2.1
Master: yarn
AppName: pyspark-shell

In [4]:
spark


Out[4]:

SparkSession - hive

SparkContext

Spark UI

Version: v2.2.1
Master: yarn
AppName: pyspark-shell
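
Both handles come pre-created by the PySpark kernel: sc is the SparkContext, and spark is the SparkSession (with Hive support here) that wraps it. As a quick sanity check:

spark.sparkContext is sc  # True: the session wraps the same context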

Figure out what data we have to work with


In [5]:
!gsutil ls gs://asf-diversity-data/


gs://asf-diversity-data/jupyter_config_hacks.py
gs://asf-diversity-data/jupyter_new.sh
gs://asf-diversity-data/active_sample_sizes/
gs://asf-diversity-data/apache_people.csv/
gs://asf-diversity-data/apache_people_cleaned_agg_by_gender/
gs://asf-diversity-data/asf_people_cleaned_agg_by_gender_and_proj/
gs://asf-diversity-data/authors_grouped_by_id/
gs://asf-diversity-data/authors_to_github/
gs://asf-diversity-data/distinct_authors_latest_commit/
gs://asf-diversity-data/formatted_sample_csv/
gs://asf-diversity-data/formatted_sample_pq/
gs://asf-diversity-data/human_data/
gs://asf-diversity-data/human_data_cleaned/
gs://asf-diversity-data/infered_gender_for_authors_pq/
gs://asf-diversity-data/infered_gender_for_recent_authors_pq/
gs://asf-diversity-data/joined_sample/
gs://asf-diversity-data/mailing_list_info/
gs://asf-diversity-data/posts_by_sampled_authors/
gs://asf-diversity-data/posts_by_user/
gs://asf-diversity-data/processed_mbox_data/
gs://asf-diversity-data/projects/
gs://asf-diversity-data/projects_result/
gs://asf-diversity-data/raw_git_data/
gs://asf-diversity-data/sample_sizes/
gs://asf-diversity-data/sampled_authors/
gs://asf-diversity-data/sampled_contirbutors_human_agg_by_gender_and_proj/

In [23]:
raw_github_data = spark.read.load("gs://asf-diversity-data/raw_git_data/")
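
With no explicit format, spark.read.load falls back to the default data source, which is Parquet unless spark.sql.sources.default has been overridden. Spelling it out is equivalent, assuming the default config:

raw_github_data = spark.read.format("parquet").load("gs://asf-diversity-data/raw_git_data/")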

In [24]:
github_data = raw_github_data.repartition(20).cache()
github_data.count()


Out[24]:
1555064
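
Note that cache() is lazy: it's the count() action that actually materializes the 20 partitions into memory, so everything after this cell reads the cached data instead of re-reading from GCS.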

In [25]:
github_data.schema


Out[25]:
StructType(List(StructField(backend_name,StringType,true),StructField(backend_version,StringType,true),StructField(category,StringType,true),StructField(data,MapType(StringType,StringType,true),true),StructField(origin,StringType,true),StructField(perceval_version,StringType,true),StructField(project_name,StringType,true),StructField(tag,StringType,true),StructField(timestamp,DoubleType,true),StructField(updated_on,DoubleType,true),StructField(uuid,StringType,true)))
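
The data column is a MapType(StringType, StringType), which is why the commit fields are pulled out by key below. For a more readable tree view of the same structure:

github_data.printSchema()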

In [29]:
from pyspark.sql import functions as f

In [30]:
github_data.limit(6).toPandas()


Out[30]:
backend_name backend_version category data origin perceval_version project_name tag timestamp updated_on uuid
0 Git 0.10.2 commit {'Commit': 'Martin Ritchie <ritchiem@apache.or... https://github.com/apache/qpid.git 0.9.16 qpid https://github.com/apache/qpid.git 1.523656e+09 1.158764e+09 68dd458759540e7649a5c970937ddd710afd166f
1 Git 0.10.2 commit {'Commit': 'Martin Ritchie <ritchiem@apache.or... https://github.com/apache/qpid.git 0.9.16 qpid https://github.com/apache/qpid.git 1.523656e+09 1.159348e+09 db0f4b7f409abc534b4b90b4814685fd91645a79
2 Git 0.10.2 commit {'Commit': 'Martin Ritchie <ritchiem@apache.or... https://github.com/apache/qpid.git 0.9.16 qpid https://github.com/apache/qpid.git 1.523656e+09 1.160049e+09 59ab9e2a41c0e2f9c4f73e5c37a3262c05b46ed4
3 Git 0.10.2 commit {'Commit': 'Robert Greig <rgreig@apache.org>',... https://github.com/apache/qpid.git 0.9.16 qpid https://github.com/apache/qpid.git 1.523656e+09 1.160684e+09 b13f951ada229fb15aa122f2cba58e0a367d0086
4 Git 0.10.2 commit {'Commit': 'Alan Conway <aconway@apache.org>',... https://github.com/apache/qpid.git 0.9.16 qpid https://github.com/apache/qpid.git 1.523656e+09 1.161215e+09 bce67ae826132c4008aca047e74026b29fee3b52
5 Git 0.10.2 commit {'Commit': 'Gordon Sim <gsim@apache.org>', 'Au... https://github.com/apache/qpid.git 0.9.16 qpid https://github.com/apache/qpid.git 1.523656e+09 1.161768e+09 e0291768664ef9e6691efe4946ca3adb154bedc6
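
The limit(6) matters here: toPandas() collects the whole DataFrame to the driver, so previewing 1.5M rows without a limit would be an easy way to run the driver out of memory.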

In [70]:
# I use a map because I'm used to functional programming, but there are other options.
# However, element_at isn't in the Python DataFrame API until 2.4 :(
from pyspark.sql import Row
authors = github_data.select(github_data.data).rdd.map(lambda row: Row(author=row['data']['Author'])).toDF()
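
That said, MapType columns have supported lookup by key via getItem since well before 2.4, so a sketch that stays in the DataFrame API and avoids the RDD round-trip would be:

authors = github_data.select(github_data.data.getItem("Author").alias("author"))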

In [71]:
authors.limit(5).toPandas()


Out[71]:
author
0 Martin Ritchie <ritchiem@apache.org>
1 Martin Ritchie <ritchiem@apache.org>
2 Martin Ritchie <ritchiem@apache.org>
3 Robert Greig <rgreig@apache.org>
4 Alan Conway <aconway@apache.org>

In [72]:
split_emails = authors.select(f.split(authors.author, r"\.").alias("split_emails"))

In [80]:
def extract_tld(row):
    # Author strings look like "Name <user@domain.tld>", so after splitting on
    # ".", the last element is the TLD, possibly with a trailing ">".
    last_elem = row['split_emails'][-1]
    if last_elem[-1] == ">":
        return last_elem[:-1]
    else:
        return last_elem


tlds = split_emails.rdd.map(extract_tld).map(lambda raw_tld: Row(tld=raw_tld)).toDF()
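
An alternative sketch using regexp_extract (in the functions module since Spark 1.5) that skips both the split and the RDD conversion; note that, unlike the map version, it returns an empty string rather than the original value when an author string contains no dot:

tlds = authors.select(f.regexp_extract(authors.author, r"\.([^.>]+)>?$", 1).alias("tld"))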

In [89]:
grouped = tlds.groupBy(tlds.tld).agg(f.count("*").alias("num"))

In [92]:
popular_tlds = grouped.orderBy(grouped.num.desc())
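
Since grouped data already exposes count(), the two cells above collapse into a single chained expression, equivalent up to the column name:

popular_tlds = tlds.groupBy("tld").count().orderBy(f.desc("count"))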

In [93]:
popular_tlds.limit(10).toPandas()


Out[93]:
tld num
0 org 1076444
1 com 383104
2 net 13682
3 io 12934
4 de 7849
5 uk 6434
6 edu 6351
7 au 6201
8 spamassassin_role <spamassassin_role@unknown 2419
9 hu 1808
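
The spamassassin_role entry is a malformed author string with no dot in it at all, so the split leaves it whole and extract_tld passes it straight through; it's a role account rather than a person, so it's worth filtering out before any per-person analysis.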

In [19]:
from pyspark.sql.functions import upper
github_data.select(upper(github_data.category)).show()


+---------------+
|upper(category)|
+---------------+
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
+---------------+
only showing top 20 rows
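
show() prints 20 rows and truncates long cells by default; both are parameters, so a shorter, untruncated peek would be:

github_data.select(upper(github_data.category)).show(5, truncate=False)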

