Statistical Analysis of Allele Distributions across Populations

Author: Efthymios Tzinis


In [ ]:
# read the data 
agg_alleles_datapath = '../data/aggregated_alleles_new.xlsx'
result_filepath = '../results/aggregated_allels_comparisons'

import pandas as pd
from pprint import pprint
xls = pd.ExcelFile(agg_alleles_datapath)
df = xls.parse(xls.sheet_names[0])
valid_sheets = xls.sheet_names[:-1]
print valid_sheets

In [ ]:
# gather allele counts for every polymorphism across all populations
alleles_agg_data = {}
for poly in valid_sheets:
    df = xls.parse(poly)
    
    # row indexes holding the allele counts for all populations (all sheets must be aligned the same way)
    alleles_indexes = [6,7]
    alleles_labels = map(lambda x: df['GENOTYPES'].loc[x],alleles_indexes)

#     print "Labels of alleles " + str(alleles_labels)
#     print "Indexes in sheet "+str(alleles_indexes)
    
    alleles_agg_data[poly] = {}
    alleles_agg_data[poly]['data'] = df.loc[alleles_indexes]
    alleles_agg_data[poly]['labels'] = alleles_labels
print alleles_agg_data
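
The dictionary above maps each polymorphism sheet to its two allele-count rows plus the corresponding allele labels. As a minimal sketch (assuming a sheet actually carries the population column names used in the comparisons below, e.g. 'GREEKS' and 'AFRICANS'), a single 2x2 contingency table for one pair of populations can be pulled out like this:

In [ ]:
# quick illustration only: extract a 2x2 allele-count table for one pair of populations
poly = valid_sheets[0]                       # any processed polymorphism sheet
pair = ['GREEKS', 'AFRICANS']                # assumed column names, reused from the comparison list below
matrix = alleles_agg_data[poly]['data'][pair].as_matrix()
print matrix                                 # rows = alleles, columns = populations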

In [ ]:
import math
import numpy as np
import scipy.stats as stats

def compute_odds_ratio_CI_95(data,odds_ratio):
    # 95% CI for the odds ratio via the log (Woolf) method
    val_in_list = list(data.flatten())
    val_in_list = map(lambda x: 1/float(x),val_in_list)
    sum_of_vals = sum(val_in_list)
    
    error = 1.96 * math.sqrt(sum_of_vals)
    ln_or = math.log(odds_ratio)
    
    uci = math.exp(ln_or + error)
    lci = math.exp(ln_or - error)
    
    return lci, uci
    
def compute_odds_ratio(data):
    # cross-product odds ratio for a 2x2 table; returns 0 when a denominator cell is 0
    if data[0][1] == 0 or data[1][0] == 0:
        return 0
    else:
        return float(data[0][0] * data[1][1]) / (data[0][1] * data[1][0])

def mean_confidence_interval(data, confidence=0.95):
    # mean with a t-based confidence interval (not used below, kept for reference)
    a = 1.0*np.array(data)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    h = se * stats.t.ppf((1+confidence)/2., n-1)
    return m, m-h, m+h

# from scipy.stats import chisquare
# obs = np.array([[44, 468], [23, 477]])
# obs = np.array([[180, 90], [60, 170]])
# obs = np.array([[668, 338], [1585, 679]]).T
# # obs = np.array([[1089, 3919], [679, 1585]])
# # obs = np.array([[977, 29], [2069, 483]])
# # obs = np.array([[4214, 794], [2069, 483]])
# # chisq, pval_chi = chisquare(obs,ddof=0)
# # print chisq, pval_chi
# # chisq, pval_chi= sum(chisq) , sum(pval_chi)
# # print chisq, pval_chi


# oddsratio, pval_f = stats.fisher_exact(obs)
# # print oddsratio, pval_f
# # print compute_odds_ratio(obs)
# lci, uci = compute_odds_ratio_CI_95(obs,oddsratio)
# print "{},{},{}-{}".format(pval_f,oddsratio,lci, uci)
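
The interval returned by compute_odds_ratio_CI_95 follows the standard log (Woolf) approximation, ln(OR) +/- 1.96 * sqrt(1/a + 1/b + 1/c + 1/d), for a 2x2 table [[a, b], [c, d]]. A minimal sanity check of the helpers, reusing one of the illustrative tables from the commented-out block above (made-up counts, not study data):

In [ ]:
import numpy as np

# illustrative 2x2 table: rows = alleles, columns = populations (made-up counts)
toy = np.array([[180, 90], [60, 170]])

oddsratio, pval_f = stats.fisher_exact(toy)
lci, uci = compute_odds_ratio_CI_95(toy, oddsratio)

print "manual OR: {}".format(compute_odds_ratio(toy))
print "Fisher OR: {}, p-value: {}".format(oddsratio, pval_f)
print "95% CI: {}-{}".format(round(lci,4), round(uci,4))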

In [ ]:
import xlwt

def get_stats_from_matrix(data):
    oddsratio, pval_f = stats.fisher_exact(data)
    # print oddsratio, pval_f
    # print compute_odds_ratio(obs)
    lci, uci = compute_odds_ratio_CI_95(data,oddsratio)
    if pval_f < 0.0001:
        new_p_str = '< 0.0001'
    else:
        new_p_str = round(pval_f,4)
    return "{},{},{}-{}".format(new_p_str,round(oddsratio,4),round(lci,4), round(uci,4))

# for each polymorphism, run the same set of between-population comparisons of the allele distributions
def compare_between_populations_for_a_polymorphism(data,labels,poly,book):
    # All desired comparisons for producing stats
    comparison_list = [['GREEKS','AFRICANS'],
                    ['SEC','AMERICANS'],
                    ['SEC','EAST ASIANS'],
                    ['SEC','SOUTH ASIANS'],
                    ['SEC','EUROPEANS'],
                    ['SEC','GROUP1=CEU+FIN+GBR'],
                    ['GROUP1=CEU+FIN+GBR','GROUP2=IBS+TSI+SEC'],
                    ['SEC','GROUP3=IBS+TSI'],
                    ['SEC','ALL']]
    
    ws = book.add_sheet(str(poly))
    fout = open(result_filepath+'_'+str(poly)+'.txt','w')
    header_str = 'Group_1,Group_2,p_value_fisher,odds_ratio,Confidence_Interval_95%'
    fout.write(header_str+'\n')
    
    for j in range(len(header_str.split(','))):
        ws.write(0,j,header_str.split(',')[j])
    
    i = 1
    for comp_tuple in comparison_list:
        try:
            matrix = data[comp_tuple].as_matrix()
        except Exception:
            # skip comparisons whose population columns are missing from this sheet
            print "Polymorphism: {} does not contain comparison tuple {}".format(poly,comp_tuple)
            continue
            
        row_stats = get_stats_from_matrix(matrix)
        result_str = str(comp_tuple[0])+','+comp_tuple[1]+','+row_stats
        fout.write(result_str+ "\n")
        print result_str
        print matrix
        
        for j in range(len(result_str.split(','))):
            ws.write(i,j,result_str.split(',')[j])
        
        i += 1
        
    fout.close()
    return book
        
book = xlwt.Workbook()
for poly,v in alleles_agg_data.items():
    book = compare_between_populations_for_a_polymorphism(v['data'],v['labels'],poly,book)
book.save(result_filepath+'.xls')
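
Besides the combined workbook, each polymorphism also gets its own comma-separated .txt file. A minimal sketch for re-loading one of them for a quick look (assuming the first sheet was processed without errors):

In [ ]:
# read back one per-polymorphism result file as CSV for inspection
check_path = result_filepath + '_' + str(valid_sheets[0]) + '.txt'
print pd.read_csv(check_path)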

In [ ]:
import numpy as np
obs = np.array([[1930, 405], [112, 23]])
print stats.fisher_exact(obs)
