In [1]:
sc


Out[1]:

SparkContext

Spark UI

Version: v2.2.1
Master: yarn
AppName: pyspark-shell

In [4]:
spark


Out[4]:

SparkSession - hive

SparkContext

Spark UI

Version: v2.2.1
Master: yarn
AppName: pyspark-shell
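
Both handles come pre-created by the PySpark kernel: sc is the SparkContext, and spark is the SparkSession (with Hive support here) that wraps it. As a quick sanity check:

spark.sparkContext is sc  # True: the session wraps the same context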

Figure out what data we have to work with


In [5]:
!gsutil ls gs://asf-diversity-data/


gs://asf-diversity-data/jupyter_config_hacks.py
gs://asf-diversity-data/jupyter_new.sh
gs://asf-diversity-data/active_sample_sizes/
gs://asf-diversity-data/apache_people.csv/
gs://asf-diversity-data/apache_people_cleaned_agg_by_gender/
gs://asf-diversity-data/asf_people_cleaned_agg_by_gender_and_proj/
gs://asf-diversity-data/authors_grouped_by_id/
gs://asf-diversity-data/authors_to_github/
gs://asf-diversity-data/distinct_authors_latest_commit/
gs://asf-diversity-data/formatted_sample_csv/
gs://asf-diversity-data/formatted_sample_pq/
gs://asf-diversity-data/human_data/
gs://asf-diversity-data/human_data_cleaned/
gs://asf-diversity-data/infered_gender_for_authors_pq/
gs://asf-diversity-data/infered_gender_for_recent_authors_pq/
gs://asf-diversity-data/joined_sample/
gs://asf-diversity-data/mailing_list_info/
gs://asf-diversity-data/posts_by_sampled_authors/
gs://asf-diversity-data/posts_by_user/
gs://asf-diversity-data/processed_mbox_data/
gs://asf-diversity-data/projects/
gs://asf-diversity-data/projects_result/
gs://asf-diversity-data/raw_git_data/
gs://asf-diversity-data/sample_sizes/
gs://asf-diversity-data/sampled_authors/
gs://asf-diversity-data/sampled_contirbutors_human_agg_by_gender_and_proj/

In [23]:
raw_github_data = spark.read.load("gs://asf-diversity-data/raw_git_data/")
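
With no explicit format, spark.read.load falls back to the default data source, which is Parquet unless spark.sql.sources.default has been overridden. Spelling it out is equivalent, assuming the default config:

raw_github_data = spark.read.format("parquet").load("gs://asf-diversity-data/raw_git_data/")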

In [24]:
github_data = raw_github_data.repartition(20).cache()
github_data.count()


Out[24]:
1555064
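
Note that cache() is lazy: it's the count() action that actually materializes the 20 partitions into memory, so everything after this cell reads the cached data instead of re-reading from GCS.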

In [25]:
github_data.schema


Out[25]:
StructType(List(StructField(backend_name,StringType,true),StructField(backend_version,StringType,true),StructField(category,StringType,true),StructField(data,MapType(StringType,StringType,true),true),StructField(origin,StringType,true),StructField(perceval_version,StringType,true),StructField(project_name,StringType,true),StructField(tag,StringType,true),StructField(timestamp,DoubleType,true),StructField(updated_on,DoubleType,true),StructField(uuid,StringType,true)))
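
The data column is a MapType(StringType, StringType), which is why the commit fields are pulled out by key below. For a more readable tree view of the same structure:

github_data.printSchema()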

In [29]:
from pyspark.sql import functions as f

In [30]:
github_data.limit(6).toPandas()


Out[30]:
backend_name backend_version category data origin perceval_version project_name tag timestamp updated_on uuid
0 Git 0.10.2 commit {'Commit': 'Martin Ritchie <ritchiem@apache.or... https://github.com/apache/qpid.git 0.9.16 qpid https://github.com/apache/qpid.git 1.523656e+09 1.158764e+09 68dd458759540e7649a5c970937ddd710afd166f
1 Git 0.10.2 commit {'Commit': 'Martin Ritchie <ritchiem@apache.or... https://github.com/apache/qpid.git 0.9.16 qpid https://github.com/apache/qpid.git 1.523656e+09 1.159348e+09 db0f4b7f409abc534b4b90b4814685fd91645a79
2 Git 0.10.2 commit {'Commit': 'Martin Ritchie <ritchiem@apache.or... https://github.com/apache/qpid.git 0.9.16 qpid https://github.com/apache/qpid.git 1.523656e+09 1.160049e+09 59ab9e2a41c0e2f9c4f73e5c37a3262c05b46ed4
3 Git 0.10.2 commit {'Commit': 'Robert Greig <rgreig@apache.org>',... https://github.com/apache/qpid.git 0.9.16 qpid https://github.com/apache/qpid.git 1.523656e+09 1.160684e+09 b13f951ada229fb15aa122f2cba58e0a367d0086
4 Git 0.10.2 commit {'Commit': 'Alan Conway <aconway@apache.org>',... https://github.com/apache/qpid.git 0.9.16 qpid https://github.com/apache/qpid.git 1.523656e+09 1.161215e+09 bce67ae826132c4008aca047e74026b29fee3b52
5 Git 0.10.2 commit {'Commit': 'Gordon Sim <gsim@apache.org>', 'Au... https://github.com/apache/qpid.git 0.9.16 qpid https://github.com/apache/qpid.git 1.523656e+09 1.161768e+09 e0291768664ef9e6691efe4946ca3adb154bedc6
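
The limit(6) matters here: toPandas() collects the whole DataFrame to the driver, so previewing 1.5M rows without a limit would be an easy way to run the driver out of memory.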

In [70]:
# I use a map because I'm used to functional programming, but there are other options.
# However, element_at isn't in the Python DataFrame API until 2.4 :(
from pyspark.sql import Row
authors = github_data.select(github_data.data).rdd.map(lambda row: Row(author=row['data']['Author'])).toDF()
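
That said, MapType columns have supported lookup by key via getItem since well before 2.4, so a sketch that stays in the DataFrame API and avoids the RDD round-trip would be:

authors = github_data.select(github_data.data.getItem("Author").alias("author"))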

In [71]:
authors.limit(5).toPandas()


Out[71]:
author
0 Martin Ritchie <ritchiem@apache.org>
1 Martin Ritchie <ritchiem@apache.org>
2 Martin Ritchie <ritchiem@apache.org>
3 Robert Greig <rgreig@apache.org>
4 Alan Conway <aconway@apache.org>

In [72]:
split_emails = authors.select(f.split(authors.author, r"\.").alias("split_emails"))

In [80]:
def extract_tld(row):
    # Author strings look like "Name <user@domain.tld>", so after splitting on
    # ".", the last element is the TLD, possibly with a trailing ">".
    last_elem = row['split_emails'][-1]
    if last_elem[-1] == ">":
        return last_elem[:-1]
    else:
        return last_elem


tlds = split_emails.rdd.map(extract_tld).map(lambda raw_tld: Row(tld=raw_tld)).toDF()
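
An alternative sketch using regexp_extract (in the functions module since Spark 1.5) that skips both the split and the RDD conversion; note that, unlike the map version, it returns an empty string rather than the original value when an author string contains no dot:

tlds = authors.select(f.regexp_extract(authors.author, r"\.([^.>]+)>?$", 1).alias("tld"))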

In [89]:
grouped = tlds.groupBy(tlds.tld).agg(f.count("*").alias("num"))

In [92]:
popular_tlds = grouped.orderBy(grouped.num.desc())
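
Since grouped data already exposes count(), the two cells above collapse into a single chained expression, equivalent up to the column name:

popular_tlds = tlds.groupBy("tld").count().orderBy(f.desc("count"))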

In [93]:
popular_tlds.limit(10).toPandas()


Out[93]:
tld num
0 org 1076444
1 com 383104
2 net 13682
3 io 12934
4 de 7849
5 uk 6434
6 edu 6351
7 au 6201
8 spamassassin_role <spamassassin_role@unknown 2419
9 hu 1808
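
The spamassassin_role entry is a malformed author string with no dot in it at all, so the split leaves it whole and extract_tld passes it straight through; it's a role account rather than a person, so it's worth filtering out before any per-person analysis.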

In [19]:
from pyspark.sql.functions import upper
github_data.select(upper(github_data.category)).show()


+---------------+
|upper(category)|
+---------------+
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
|         COMMIT|
+---------------+
only showing top 20 rows
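
show() prints 20 rows and truncates long cells by default; both are parameters, so a shorter, untruncated peek would be:

github_data.select(upper(github_data.category)).show(5, truncate=False)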

