Genotypes and phenotypes

Genotypes are biallelic in the form of (0, 1) where 0 is a recessive allele and 1 is a dominant allele. Each genotype is associated with a character of the organism (e.g., seed color) and recessive and dominant phenotypes (e.g., "green" and "yellow").


In [3]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import random

In [4]:
def create_parental_population(n, allele):
    return np.array([allele] * 2*n).reshape(n, 2)

In [5]:
create_parental_population(10, 0)


Out[5]:
array([[0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0]])

In [30]:
n = 1000

In [31]:
p1_dominant = create_parental_population(n, 1)

In [32]:
p1_recessive = create_parental_population(n, 0)

In [13]:
def get_alleles(population):
    return np.apply_along_axis(np.random.choice, 1, population)

def cross_populations(population1, population2):
    return np.array([get_alleles(population1), get_alleles(population2)]).T

In [33]:
f1 = cross_populations(p1_dominant, p1_recessive)

In [34]:
f2 = cross_populations(f1, f1)

In [35]:
f2[:10]


Out[35]:
array([[1, 0],
       [0, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1]])

In [38]:
def get_allele_frequencies(population):
    """
    Return the frequency of each allele in the given population.
    """
    unique, inverse = np.unique(population, return_inverse=True)
    count = np.zeros(len(unique), np.int)
    np.add.at(count, inverse, 1)
    return count / np.float(count.sum())

def get_expected_genotype_frequencies(population):
    """
    Return an array of genotype frequencies expected for the given population based on Hardy-Weinberg equilibrium.
    """
    allele_frequencies = get_allele_frequencies(population)
    return np.array([
        allele_frequencies[0] ** 2,
        2 * np.product(allele_frequencies),
        allele_frequencies[1] ** 2
    ])

def get_observed_genotype_frequencies(population):
    """
    Return the observed genotype frequencies of the given population.
    """
    genotypes = np.apply_along_axis(sum, 1, population)
    return np.bincount(genotypes) / np.sum(genotypes, dtype=np.float64)

In [22]:
f2_allele_frequencies = get_allele_frequencies(f2)

In [36]:
np.sum(get_expected_genotype_frequencies(f2))


Out[36]:
0.99999999999999978

In [39]:
np.sum(get_observed_genotype_frequencies(f2))


Out[39]:
1.0172939979654121

In [189]:
f3 = cross_populations(f2, f2)

In [190]:
get_genotype_frequencies(f3)


Out[190]:
array([ 0.37649402,  0.23904382,  0.38047809])