Statistical Analysis for distributions of Alleles SEC population only per gender

Author: Efthymios Tzinis


In [ ]:
# read the data 
agg_alleles_datapath = '../data/genotypes_person.xlsx'
result_filepath = '../results/sec_allels_per_gender'

import pandas as pd
from pprint import pprint
xls = pd.ExcelFile(agg_alleles_datapath)
df = xls.parse(xls.sheet_names[0])
polys =  list(df.columns)[2:]
print polys
pairs_of_allels = [['A','G'],['C','G'],['G','A'],['C','G'],['T','C']]
print pairs_of_allels

In [ ]:
# do the same procedure for each polymorphism
# but first group data per gender
df_grouped = df.groupby(['GENDER'])[polys].sum()
import numpy as np

# gather information about a specific polymorphism for all populations 
alleles_agg_data = {}

for i in range(len(pairs_of_allels)):
    alleles = pairs_of_allels[i]
    poly = polys[i]
    
    data = np.empty((2,2))
    for j in range(len(alleles)):
        data[j][0] = df_grouped[poly].loc['A'].count(alleles[j])
        data[j][1] = df_grouped[poly].loc['F'].count(alleles[j])
    data = data.astype(int)
    
    alleles_agg_data[poly] = {}
    alleles_agg_data[poly]['data'] = data
    alleles_agg_data[poly]['labels'] = alleles

In [ ]:
import math
import scipy.stats as stats

def compute_odds_ratio_CI_95(data,odds_ratio):
    val_in_list = list(data.flatten())
    val_in_list = map(lambda x: 1/float(x),val_in_list)
    sum_of_vals = sum(val_in_list)
    
    error = 1.96 * math.sqrt(sum_of_vals)
    ln_or = math.log(odds_ratio)
    
    uci = math.exp(ln_or + error)
    lci = math.exp(ln_or - error)
    
    return lci, uci
    
def compute_odds_ratio(data):
    if data[0][1] == 0 or data[1][0] == 0:
        return 0
    else:
        return float(data[0][0] * data[1][1]) / (data[0][1] * data[1][0])

def mean_confidence_interval(data, confidence=0.95):
    a = 1.0*np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * sp.stats.t._ppf((1+confidence)/2., n-1)
    return m, m-h, m+h

In [ ]:
import xlwt

def get_stats_from_matrix(data):
    oddsratio, pval_f = stats.fisher_exact(data)
    # print oddsratio, pval_f
    # print compute_odds_ratio(obs)
    lci, uci = compute_odds_ratio_CI_95(data,oddsratio)
    if pval_f < 0.0001:
        new_p_str = '< 0.0001'
    else:
        new_p_str = round(pval_f,4)
    return "{},{},{}-{}".format(new_p_str,round(oddsratio,4),round(lci,4), round(uci,4))

# for each polymorphism just do the same distr. comparisons in between populations 
def compare_between_populations_for_a_polymorphism(data,labels,poly,book):
    
    ws = book.add_sheet(str(poly).split("/")[-1])
    fout = open(result_filepath+'_'+str(poly).split("/")[-1]+'.txt','w')
    header_str = 'Group_1,Group_2,p_value_fischer,odds_ratio,Confidence_Interval_95'
    fout.write('Group_1,Group_2,p_value_fischer,odds_ratio,Confidence_Interval_95%\n')
    
    for j in range(len(header_str.split(','))):
        ws.write(0,j,header_str.split(',')[j])
    
    i = 1
    try: 
        matrix = data
    except:
        print "Polymorphism: {} does not contain comparison tuple {}".format(poly,comp_tuple)

    stats = get_stats_from_matrix(data)
    result_str = 'Males'+','+'Females'+','+stats 
    fout.write(result_str+ "\n")
    print result_str
        
    for j in range(len(result_str.split(','))):
        ws.write(i,j,result_str.split(',')[j])

    i += 2
    
    ws.write(i,1,'M')
    ws.write(i,2,'F')
        
    i+=1 
    
    for z in range(2):
        ws.write(i+z,0,labels[z])
        
    for z in range(data.shape[0]):
        for k in range(data.shape[1]):
            ws.write(i+z,1+k,data[z][k])

        
    fout.close()
    return book
        
book = xlwt.Workbook()
for poly,v in alleles_agg_data.items():
    book = compare_between_populations_for_a_polymorphism(v['data'],v['labels'],poly,book)
book.save(result_filepath+'.xls')

In [ ]: