User Search

Use this notebook to:

  1. Try to find an account based on random knowledge
  2. List all orgs they belong to (from a subset)
    • You will need org owner permissions to perform these searches

Boilerplate

Skip/hide this. Common usage is below.


In [ ]:
# Show which github3 library is in use (version and install location).
# Imported here as well, so this cell also works when it is run before the
# cell that performs the main `import github3`.
import github3
print(github3.__version__)
print(github3.__file__)

If you see this text, you may want to enable the nbextension "Collapsible Headings", so you can hide this section in common usage.


In [ ]:
# set values here - you can also override below

# get api key from environment, fall back to file
import os
api_key = os.environ.get("GITHUB_PAT", "")
if not api_key:
    # The token is read from the second line of ".credentials".  A missing
    # or too-short file falls through to the "no GitHub PAT found" error
    # below instead of failing with an unrelated exception.
    try:
        with open(".credentials", "r") as credentials_file:
            credential_lines = credentials_file.readlines()
    except IOError:
        credential_lines = []
    if len(credential_lines) > 1:
        api_key = credential_lines[1].strip()
if not api_key:
    raise OSError("no GitHub PAT found")

In [ ]:
# GitHub organizations that are checked for membership and outside
# collaborator status of each login.
orgs_to_check = [
    "mozilla",
    "mozilla-services",
    "mozilla-l10n",
    "mozilla-mobile",
    "mozilla-partners",
    "taskcluster",
    "mozilla-conduit",
    "mozilla-lockwise",
    "mozilla-platform-ops",
    "nss-dev",
    "mozilla-releng",
    "mozilla-private",
    "mozilla-frontend-infra",
    "mozilla-bteam",
    "iodide-project",
    "mozilla-games",
    "mozillaReality",
    "mozilla-standards",
    "mozilla-tw",
    "mozilla-extensions",
]

In [ ]:
import github3
def print_limits():
    """Print the reset time and remaining count from ``gh.rate_limit()``.

    The ``rate`` entry is fetched once, so both printed values come from the
    same response (and only one request is made).
    """
    rate = gh.rate_limit()["rate"]
    print("reset at: {}, remaining {}".format(rate["reset"], rate["remaining"]))
# Log in with the PAT and confirm which account it belongs to.
try:
    gh = github3.login(token=api_key)
    print("You are authenticated as {}".format(gh.me().login))
except (ConnectionError, github3.exceptions.ConnectionError):
    # github3 reports transport failures with its own ConnectionError class,
    # which the builtin ConnectionError does not cover, so catch both.
    print_limits()
try:
    from functools import lru_cache
except ImportError:
    from backports.functools_lru_cache import lru_cache

From here on, use gh to access all data


In [ ]:
@lru_cache(maxsize=32)
def _search_for_user(user):
    """Search GitHub for accounts matching ``user`` and return them as a list.

    The query is restricted with ``type:user``.  Results are memoized per
    search term by ``lru_cache``, so the "found ..." line is printed only
    when a search is actually executed.
    """
    query = "type:user " + user
    matches = list(gh.search_users(query=query))
    print("found {} potentials for {}".format(len(matches), user))
    return matches

def get_user_counts(user):
    """Yield the search results for ``user`` one at a time.

    This is a generator: the search itself runs on first iteration, not
    when the function is called.
    """
    for candidate in _search_for_user(user):
        yield candidate

In [ ]:
displayed_users = set()  # cache to avoid duplicate output


def show_users(user_list, search_term):
    """Print details of a small result set and return the logins in it.

    ``user_list`` is de-duplicated first.  With more than 10 unique results
    nothing is listed: a notice is printed, ``search_term`` is recorded in
    ``displayed_users`` and an empty list is returned.  Otherwise every
    result not already in ``displayed_users`` is recorded there, refreshed
    and printed (login, name, location, email), and the ``login`` of every
    unique result is returned.
    """
    candidates = set(user_list)
    if len(candidates) > 10:
        # Even if there are too many, we still want to check the 'root' term
        print("... too many to be useful, still trying '{}' ...".format(search_term))
        displayed_users.add(search_term)
        return []

    for candidate in candidates:
        if candidate in displayed_users:
            continue
        displayed_users.add(candidate)
        profile = candidate.user.refresh()
        print(profile.login, profile.name, profile.location, profile.email)
    # an empty result set naturally yields an empty list here
    return [candidate.login for candidate in candidates]

def gather_possibles(seed):
    """Return the set of logins found by searching for ``seed``.

    When ``seed`` contains an ``@`` the part before the first ``@`` is
    searched as a second term, and the results of both searches are merged.
    """
    search_terms = [seed]
    # if it was an email addr, try again with the mailbox name
    if '@' in seed:
        search_terms.append(seed.split('@')[0])

    found = set()
    for term in search_terms:
        found.update(show_users(get_user_counts(term), term))
    return found

In [ ]:
class OutsideCollaboratorIterator(github3.structs.GitHubIterator):
    """Iterator over an organization's ``/outside_collaborators`` URL.

    Items are created as ``github3.users.ShortUser`` objects, and requests
    go through the session of the organization passed in.
    """

    def __init__(self, org):
        endpoint = org.url + "/outside_collaborators"
        iterator_args = dict(
            count=-1,  # get all
            url=endpoint,
            cls=github3.users.ShortUser,
            session=org.session,
        )
        super(OutsideCollaboratorIterator, self).__init__(**iterator_args)

@lru_cache(maxsize=32)
def get_collaborators(org):
    """Return the lower-cased logins of ``org``'s outside collaborators.

    The list is memoized per ``org`` by ``lru_cache``.
    """
    return [
        collaborator.login.lower()
        for collaborator in OutsideCollaboratorIterator(org)
    ]

def is_collaborator(org, login):
    """Return True if ``login`` (compared in lower case) is an outside collaborator of ``org``."""
    wanted = login.lower()
    return wanted in get_collaborators(org)

# provide same interface for members -- but the iterator is free :D
@lru_cache(maxsize=32)
def get_members(org):
    """Return the lower-cased logins of the members of ``org``.

    The list is memoized per ``org`` by ``lru_cache``.
    """
    return [member.login.lower() for member in org.members()]

def is_member(org, login):
    """Return True if ``login`` (compared in lower case) is a member of ``org``."""
    wanted = login.lower()
    return wanted in get_members(org)

In [ ]:
def check_login_perms(logins):
    """Report which of ``orgs_to_check`` each login has access to.

    For every login, one line (with a link to the matching org page) is
    printed per organization in which the login is a member and/or an
    outside collaborator; a "No permissions found" line is printed when
    there is none.

    Each organization object is requested at most once per call and reused
    for all logins, instead of being requested again for every login.
    Nothing is requested when ``logins`` is empty.

    Args:
        logins: iterable of GitHub login names.

    Returns:
        True if at least one login is a member or outside collaborator of
        at least one checked organization, otherwise False.
    """
    any_perms = False
    fetched_orgs = {}  # org name -> organization object, filled on first use
    for login in logins:
        has_perms = False
        for org_name in orgs_to_check:
            if org_name not in fetched_orgs:
                fetched_orgs[org_name] = gh.organization(org_name)
            o = fetched_orgs[org_name]
            if is_member(o, login):
                url = "https://github.com/orgs/{}/people?utf8=%E2%9C%93&query={}".format(o.login, login)
                print("{} has {} as a member: {}".format(o.login, login, url))
                has_perms = True
            if is_collaborator(o, login):
                url = "https://github.com/orgs/{}/outside-collaborators?utf8=%E2%9C%93&query={}".format(o.login, login)
                print("{} has {} as a collaborator: {}".format(o.login, login, url))
                has_perms = True
        if has_perms:
            any_perms = True
        else:
            print("No permissions found for {}".format(login))
    return any_perms

In [ ]:
import re
import os

re_flags = re.MULTILINE | re.IGNORECASE

# matches text rendered as "b'<real_text>'" and captures <real_text>
byte_wrapper = re.compile(r"""^b'(?P<real_text>.*)'""")

def process_from_email(email_body):
    """Search GitHub for the person described in an offboarding email.

    Candidate search terms are taken from the labelled lines of
    ``email_body`` (``Full Name:``, ``Email:``, ``Github Profile:``,
    ``Zimbra Alias:``, ``IM:`` and ``Bugzilla Email:``).  Each term is
    searched on GitHub, and organization permissions are then reported for
    every login that turned up, plus the declared GitHub profile if any.
    Everything is reported via ``print``; nothing is returned.

    A body without an ``Email:`` line is accepted: the Mozillians and
    Heroku links are skipped in that case.

    A ``github3.exceptions.ForbiddenError`` raised while searching is caught
    and reported together with the current rate limits.

    Args:
        email_body: text of the email; labels are matched case-insensitively
            at the start of a line.
    """
    # get rid of white space; lines are re-joined with a plain newline so
    # that the end-of-line anchored patterns below never capture a stray
    # carriage return
    email_body = "\n".join(
        s.strip() for s in email_body.splitlines() if s.strip()
    )

    user = set()

    # Extract data from internal email format
    match = re.search(r'^Full Name: (?P<full_name>\S.*)$', email_body, re_flags)
    if match:
        # add base and some variations
        full_name = match.group("full_name")
        user.add(full_name)
        # remove spaces
        user.add(full_name.replace(' ', ''))
        # reversed no spaces
        user.add(''.join(full_name.split()[::-1]))

    match = re.search(r'^Email: (?P<primary_email>.*)$', email_body, re_flags)
    primary_email = match.group("primary_email") if match else None
    user.add(primary_email)
    if primary_email:
        # the links only make sense when an address was found
        quoted_email = primary_email.replace('@', '%40')
        print("Check these URLs for Heroku activity:")
        print("  Mozillians: https://mozillians.org/en-US/search/?q={}".format(quoted_email))
        print("  Heroku: https://dashboard.heroku.com/teams/mozillacorporation/access?filter={}".format(quoted_email))
    print(email_body)

    match = re.search(r'^Github Profile: (?P<github_profile>.*)$', email_body, re_flags)
    declared_github = match.group("github_profile") if match else None
    user.add(declared_github)

    match = re.search(r'^Zimbra Alias: (?P<other_email>.*)$', email_body, re_flags)
    user.add(match.group("other_email") if match else None)

    # we consider each token in the IM line as a possible GitHub login
    match = re.search(r'^IM:\s*(.*)$', email_body, re_flags)
    if match:
        im_line = match.group(1)
        user.update(
            x.group(1) for x in re.finditer(r'\W*((\w+)(?:\s+\w+)*)', im_line)
        )

    match = re.search(r'^Bugzilla Email: (?P<bz_email>.*)$', email_body, re_flags)
    user.add(match.group("bz_email") if match else None)

    # grab the department name, for a heuristic on whether we expect to find perms
    expect_github_login = False
    department_name = None
    match = re.search(r'^\s*Dept Name: (?P<dept_name>\S.*)$', email_body, re_flags)
    if match:
        department_name = match.group("dept_name").lower()
        dept_keys_infering_github = ["firefox", "engineering", "qa", "operations"]
        expect_github_login = any(
            key in department_name for key in dept_keys_infering_github
        )

    # clean up some noise, case insensitively
    # the tokens to ignore are added based on discovery,
    # they tend to cause the searches to get rate limited.
    # (missing fields were added as None above and are dropped here)
    user = {x.lower() for x in user if x and (len(x) > 2)}
    user = user - {None, "irc", "slack", "skype", "b", 'hotmail', 'mozilla', 'ro', 'com', 'softvision', 'mail',
                  'twitter', 'blog', 'https', 'jabber', 'net', 'github', 'gmail',
                  'facebook', 'guy', 'pdx', 'yahoo', 'aim', 'whatsapp' }
    # start every run with an empty "already displayed" cache
    global displayed_users
    displayed_users = set()
    try:
        print("Trying '{}'".format("', '".join(user)))
        guesses = set()
        for term in user:
            # some text strings are displayed as "b'<real_text>'"
            # strip to just "<real_text>"
            match = byte_wrapper.search(term)
            if match:
                term = match.group('real_text')
            possibles = gather_possibles(term)
            guesses.update({x.lower() for x in possibles})
        # include declared_github if it exists
        if declared_github:
            guesses.add(declared_github.lower())
        print("Checking logins {}".format(guesses))
        found_perms = False
        if guesses:
            found_perms = check_login_perms(guesses)
        elif expect_github_login:
            print("\nWARNING: expected GitHub login for dept '{}'".format(department_name))
        print("Finished all reporting.")
        if declared_github and not found_perms:
            # print some text to copy/paste into email
            print(", even for declared login '{}'.".format(declared_github))
        if expect_github_login and not found_perms:
            print("WARNING: expected GitHub permissions for dept '{}'".format(department_name))
    except github3.exceptions.ForbiddenError as e:
        print("API limit reached, try again in 5 minutes.\n")
        print(str(e))
        print(gh.rate_limit())

Start of common usage

Currently, there are two common use cases:

  • processing an offboarding email, and
  • adhoc lookup of GitHub login

For anything else, you're on your own!

All usage requires the following setup:

  1. Fill in a way to load your PAT in the code cell that sets `api_key`
  2. Fill in the list of orgs to check in the code cell that sets `orgs_to_check`

Process offboarding email

Usage steps - for each user:

1. Copy entire text of email
2. Paste between the ``"""`` marks in the cell below.
3. Execute that cell

The cell below should have the following text:

process_from_email(r"""
  # paste email body here
""")

Or if you're not processing an email, fake the two fields 'email:' and 'im:':

process_from_email(r"""
# comma separated list
im: various possible names comma
# Only 1 email
email: primary_email@mozilla.com
""")

In [ ]:
# Paste the entire email text between the triple quotes, then run this cell.
process_from_email(r"""



""")

Adhoc Lookup

Fill in list of the desired logins in the cell below


In [ ]:
# List the GitHub logins to look up inside the brackets, then run this cell.
check_login_perms([

])

notes

  • check invites as well, using manage_invitations.ps
  • code doesn't handle hyphenated github logins, e.g. 'marco-c' (gets split)
  • github lookup should strip https... so can use link from people.m.o
  • does not call out owner status (reports as member)
  • add formatted output summary for copy/paste
  • add short ldap name as an "always check"
  • dpreston, aka fzzy, doesn't have any GitHub perms
  • always check stem when search gives too many (i.e. go for the exact match)