In [24]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import itertools
import numpy as np

# read in dataset
df = pd.read_csv("data_sources/GitHub_Terms_Data.csv", sep=';', encoding='ISO-8859-1')

In [16]:
# print first 25 rows
df.head(25)


Out[16]:
row_id project_id owner_login name language created_at domain forks watchers total_committers ... f_known_country blau_gender blau_country med_gh_tenure med_prj_tenure med_cmt_tenure cv_gh_tenure cv_prj_tenure cv_cmt_tenure turnover
0 21 1575238 MetaModels attribute_checkbox PHP 2012-11-23 17:26:25 OTHER 3 0 6 ... 0.500 0.000 0.000 373.0 1.0 935.0 0.029 0.000 0.673 1.000
1 22 1575238 MetaModels attribute_checkbox PHP 2012-11-23 17:26:25 OTHER 3 0 6 ... 0.750 0.375 0.000 474.5 1.0 428.0 0.439 0.346 0.862 0.750
2 23 1575238 MetaModels attribute_checkbox PHP 2012-11-23 17:26:25 OTHER 3 0 6 ... 0.250 0.000 0.000 613.5 2.0 598.5 0.705 0.577 0.870 0.250
3 24 1575238 MetaModels attribute_checkbox PHP 2012-11-23 17:26:25 OTHER 3 0 6 ... 0.250 0.000 0.000 609.5 3.0 741.0 0.683 0.527 0.812 0.250
4 49 5989 soundcloud soundcloud-mac-sharing Objective-C 2012-03-21 04:02:48 APPLICATION 4 8 4 ... 1.000 0.000 0.000 968.0 1.0 145.0 0.000 0.000 0.000 1.000
5 50 5989 soundcloud soundcloud-mac-sharing Objective-C 2012-03-21 04:02:48 APPLICATION 4 8 4 ... 0.667 0.000 0.000 1090.0 1.0 910.0 0.231 0.000 0.349 1.000
6 107 2352133 ooici ooici-pres JavaScript 2010-12-07 09:12:28 OTHER 3 0 7 ... 0.000 0.000 0.000 113.5 1.0 137.5 1.140 0.000 0.876 1.000
7 108 2352133 ooici ooici-pres JavaScript 2010-12-07 09:12:28 OTHER 3 0 7 ... 0.000 0.000 0.000 0.0 1.0 40.0 1.414 0.354 1.108 0.333
8 115 4321928 sebcrozet kiss3d Rust 2013-06-14 22:41:40 OTHER 6 50 5 ... 0.000 0.000 0.000 851.0 1.0 585.0 0.476 0.000 0.254 0.500
9 116 4321928 sebcrozet kiss3d Rust 2013-06-14 22:41:40 OTHER 6 50 5 ... 0.000 0.320 0.000 1024.5 1.0 857.0 0.463 0.354 0.503 0.500
10 181 1872989 gevans sidekiq-throttler Ruby 2012-12-12 01:30:19 OTHER 15 34 10 ... 1.000 0.000 0.500 1408.0 1.0 820.5 0.199 0.000 0.090 1.000
11 182 1872989 gevans sidekiq-throttler Ruby 2012-12-12 01:30:19 OTHER 15 34 10 ... 0.500 0.000 0.500 1059.0 1.0 596.0 0.538 0.346 0.654 0.500
12 183 1872989 gevans sidekiq-throttler Ruby 2012-12-12 01:30:19 OTHER 15 34 10 ... 0.400 0.000 0.000 1373.0 1.0 992.0 0.351 0.571 0.505 0.600
13 184 1872989 gevans sidekiq-throttler Ruby 2012-12-12 01:30:19 OTHER 15 34 10 ... 0.571 0.000 0.375 1476.0 1.0 981.0 0.299 0.668 0.458 0.429
14 202 5664651 minghuadev chromeos-kernel-3-8 C 2013-09-07 07:15:39 OTHER 0 0 13 ... 0.000 0.000 0.000 0.0 1.0 223.0 NaN 0.000 0.000 1.000
15 203 5664651 minghuadev chromeos-kernel-3-8 C 2013-09-07 07:15:39 OTHER 0 0 13 ... 0.000 0.000 0.000 0.0 1.0 370.0 2.000 0.000 0.440 1.000
16 204 5664651 minghuadev chromeos-kernel-3-8 C 2013-09-07 07:15:39 OTHER 0 0 13 ... 0.000 0.000 0.000 0.0 1.0 547.0 NaN 0.000 0.000 1.000
17 205 5664651 minghuadev chromeos-kernel-3-8 C 2013-09-07 07:15:39 OTHER 0 0 13 ... 0.143 0.000 0.000 0.0 1.0 624.0 NaN 0.808 0.018 0.857
18 310 3917990 janestreet re2 C++ 2013-05-21 07:40:57 OTHER 2 1 5 ... 0.500 0.000 0.000 741.5 1.0 1212.0 0.165 0.000 0.320 1.000
19 311 3917990 janestreet re2 C++ 2013-05-21 07:40:57 OTHER 2 1 5 ... 0.667 0.000 0.750 1190.0 1.0 1423.5 0.287 0.319 0.573 0.333
20 312 3579955 Virtex7 led-wordclock C 2013-05-05 01:03:57 OTHER 5 3 7 ... 0.000 0.000 0.000 49.0 1.0 47.5 0.374 0.000 0.373 1.000
21 313 3579955 Virtex7 led-wordclock C 2013-05-05 01:03:57 OTHER 5 3 7 ... 0.000 0.000 0.000 153.0 2.0 153.0 0.000 0.000 0.000 1.000
22 370 3437581 kolibre libkolibre-clientcore C++ 2013-04-08 20:58:51 LIBRARY 1 0 4 ... 0.333 0.444 0.000 383.0 1.0 358.0 0.000 0.000 0.581 1.000
23 371 3437581 kolibre libkolibre-clientcore C++ 2013-04-08 20:58:51 LIBRARY 1 0 4 ... 0.000 0.500 0.000 501.0 2.0 487.0 0.000 0.000 0.021 0.500
24 372 3437586 kolibre libkolibre-builder Shell 2013-04-11 03:37:36 GUI 1 0 4 ... 0.333 0.444 0.000 382.0 1.0 358.0 0.000 0.000 0.583 1.000

25 rows × 46 columns


In [5]:
# list column values
list(df.columns.values)


Out[5]:
['row_id',
 'project_id',
 'owner_login',
 'name',
 'language',
 'created_at',
 'domain',
 'forks',
 'watchers',
 'total_committers',
 'total_commits',
 'project_age',
 'windows',
 'window_idx',
 'num_commits',
 'num_pull_req',
 'num_comments',
 'num_issues',
 'num_committers',
 'num_team',
 'committers',
 'commits',
 'team',
 'project_tenures',
 'github_tenures',
 'commit_tenures',
 'genders',
 'countries',
 'left',
 'stayed',
 'joined',
 'male',
 'female',
 'unknown',
 'has_woman',
 'f_known_gender',
 'f_known_country',
 'blau_gender',
 'blau_country',
 'med_gh_tenure',
 'med_prj_tenure',
 'med_cmt_tenure',
 'cv_gh_tenure',
 'cv_prj_tenure',
 'cv_cmt_tenure',
 'turnover']

In [6]:
# list top 10 users
df["owner_login"].value_counts()[:10]


Out[6]:
boostorg             1566
mozilla               602
GNOME                 594
apache                549
opscode-cookbooks     539
plone                 462
nuxeo                 315
yast                  315
alphagov              298
RBSChange             290
Name: owner_login, dtype: int64

In [7]:
# create visualization for gender contributions
genderList = []
for val in df["genders"]:
    genderList.append(val)

totalfCount = []
totalmCount = [] 
for gender in genderList:
    fCount = gender.count("female")
    totalfCount.append(fCount)
    mCount = gender.count("male")
    totalmCount.append(mCount)
    
labels = [r'Male Contributors (96%)', r'Female Contributors (4%)']
sizes = [sum(totalmCount), sum(totalfCount)]
colors = ['yellowgreen','lightcoral']
patches, texts = plt.pie(sizes, colors=colors, startangle=90)
plt.legend(patches, labels, loc="best")
# Set aspect ratio to be equal so that pie is drawn as a circle.
plt.axis('equal')
plt.tight_layout()
plt.show()



In [19]:
# create dictionary of language and genders
languages = {}
for valOne, valTwo in zip(df["language"], df["genders"]):
    try:
        languages[valOne].append(valTwo)
    except KeyError:
        languages[valOne] = [valTwo]

maleCount = 0
femaleCount = 0
for key, val in languages.items():
    for item in val:
        if "male" in item:
            maleCount +=1
        if "female" in item:
            femaleCount += 1
    languages[key] = (maleCount, femaleCount)

In [20]:
languages


Out[20]:
{'ASP': (19716, 2716),
 'ActionScript': (20097, 2797),
 'Ada': (27617, 3793),
 'Apex': (26593, 3678),
 'AppleScript': (74625, 10264),
 'Arduino': (75430, 10351),
 'Assembly': (26631, 3681),
 'Augeas': (17132, 2404),
 'BlitzBasic': (68097, 9426),
 'Boo': (29413, 3954),
 'Bro': (75380, 10344),
 'C': (74614, 10264),
 'C#': (67975, 9412),
 'C++': (25371, 3527),
 'CSS': (19688, 2715),
 'Ceylon': (19747, 2721),
 'Clojure': (89183, 12254),
 'CoffeeScript': (26586, 3678),
 'ColdFusion': (27611, 3787),
 'Common Lisp': (17112, 2404),
 'Coq': (68006, 9418),
 'Crystal': (66362, 9235),
 'D': (19131, 2649),
 'DOT': (19691, 2715),
 'Dart': (28714, 3890),
 'Delphi': (38081, 5072),
 'Dylan': (62960, 8673),
 'Ecl': (89188, 12254),
 'Eiffel': (11, 2),
 'Elixir': (27759, 3811),
 'Emacs Lisp': (27400, 3751),
 'Erlang': (38058, 5072),
 'F#': (17253, 2429),
 'FORTRAN': (27545, 3783),
 'Go': (18793, 2602),
 'Groovy': (89730, 12335),
 'HaXe': (89235, 12260),
 'Haskell': (28655, 3878),
 'Haxe': (89308, 12271),
 'Io': (17256, 2429),
 'Java': (45948, 6297),
 'JavaScript': (16938, 2395),
 'Julia': (27720, 3803),
 'Kotlin': (88139, 12159),
 'LiveScript': (18154, 2497),
 'Logos': (68012, 9420),
 'Lua': (25580, 3558),
 'Matlab': (17229, 2429),
 'Monkey': (66364, 9236),
 'Nimrod': (38087, 5077),
 'None': (66331, 9224),
 'OCaml': (19439, 2663),
 'Objective-C': (1768, 193),
 'Objective-J': (37426, 4962),
 'Opa': (17135, 2404),
 'OpenEdge ABL': (29402, 3953),
 'PHP': (37334, 4961),
 'Parrot': (68166, 9439),
 'Perl': (62927, 8672),
 'PowerShell': (68074, 9425),
 'Processing': (66342, 9226),
 'Prolog': (27438, 3755),
 'Puppet': (75375, 10344),
 'Pure Data': (27622, 3793),
 'Python': (88073, 12154),
 'R': (19064, 2640),
 'Racket': (89286, 12269),
 'Ruby': (61026, 8365),
 'Rust': (66496, 9249),
 'Scala': (29399, 3952),
 'Scheme': (88127, 12157),
 'Scilab': (66358, 9234),
 'Shell': (92879, 12753),
 'Slash': (27725, 3805),
 'Smalltalk': (37412, 4962),
 'Standard ML': (75400, 10345),
 'SuperCollider': (18801, 2604),
 'Tcl': (25614, 3559),
 'TeX': (62970, 8675),
 'TypeScript': (68147, 9434),
 'VHDL': (26650, 3683),
 'Vala': (19861, 2748),
 'Verilog': (37371, 4961),
 'VimL': (18149, 2496),
 'Visual Basic': (88170, 12165),
 'Volt': (19445, 2663),
 'XC': (25594, 3558),
 'XML': (19989, 2782),
 'XProc': (27723, 3805),
 'XQuery': (89322, 12272),
 'XSLT': (19738, 2721),
 'Xtend': (26633, 3681),
 'ooc': (18796, 2603)}

In [60]:
# list of all languages in the dataset
def all_languages(languages):
    all = []
    for i, j in languages.items():
        return

In [ ]: