In [22]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas
import seaborn as sns

import parse
%matplotlib inline

#import process
%load_ext Cython

#%%cython
########################################################
evidence_file = 'data/evidence.txt'
filtered_evidence_file = 'data/evidence_filtered.txt'
protein_groups_file = 'data/proteinGroups.txt'
phospho_peptide_file = 'data/Phospho (STY)Sites.txt'
our_group_name = 'PYND'
other_group_names = ['APUBSCRAWL', 'ONION', 'ET0H', 'SHMOO', 'WHANGEE'] # Case doesn't matter

experiment_ext_names = ['Ub', 'UbP', 'WCL', 'WCLP']


########################################################


The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython

In [23]:
# Now apply the filter to evidence.txt to get evidence_filtered.txt
# This should keep only the lines for our group (PYND, case-insensitive)
parse.filter_tsv(evidence_file, filtered_evidence_file, 'Experiment', '^[Pp][Yy][Nn][Dd]')
print('Done.')


Done.
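
parse.filter_tsv is a project-local helper, so its implementation isn't shown here. As a rough illustration only, a pandas-only sketch of the same row filter might look like this (filter_tsv_sketch and its body are assumptions, not the module's actual code):

# Hypothetical stand-in for parse.filter_tsv: keep only rows whose given
# column matches the regex, and write the result back out as a TSV.
def filter_tsv_sketch(in_path, out_path, column, regex):
    df = pandas.read_csv(in_path, sep='\t', low_memory=False)
    kept = df[df[column].astype(str).str.contains(regex, regex=True)]
    kept.to_csv(out_path, sep='\t', index=False)

# filter_tsv_sketch(evidence_file, filtered_evidence_file, 'Experiment', '^[Pp][Yy][Nn][Dd]')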

In [24]:
evidence = parse.parse_evidence(filtered_evidence_file)

protein_groups = parse.parse_protein_groups(protein_groups_file)
#protein_groups = parse.remove_cols(protein_groups, other_group_names)

phospho = parse.parse_phosphosites(phospho_peptide_file)
#phospho = parse.remove_cols(phospho, other_group_names)
print("Done.")


/Users/student/anaconda/lib/python3.4/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (421,422) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
/Users/student/anaconda/lib/python3.4/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (211,677,678) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
Done.

In [25]:
def calc_epsilon(protein_groups):
    """Return the smallest nonzero intensity across all intensity columns."""
    mins = []
    for col in protein_groups.columns:
        for exp_ext in experiment_ext_names:
            if 'Intensity' in col and col.endswith(exp_ext):
                miny = float("inf")
                for x in protein_groups[col]:
                    x = float(x)
                    if x < miny and x != 0.0:
                        miny = x
                if miny != float("inf"):  # skip columns that were all zero
                    mins.append(miny)
    return min(mins)
epsilon = calc_epsilon(protein_groups)
epsilon


Out[25]:
6193.8
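
For reference, the same floor value can be computed without the explicit loops; this vectorized sketch assumes the matching intensity columns can be cast to float:

# Vectorized sketch: smallest nonzero value over all matching intensity columns.
def calc_epsilon_vectorized(pg):
    cols = [c for c in pg.columns
            if 'Intensity' in c and any(c.endswith(ext) for ext in experiment_ext_names)]
    vals = pg[cols].astype(float)
    return vals[vals > 0].min().min()  # expected to match calc_epsilon(protein_groups)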

In [26]:
# What's the distribution of intensities?
def get_intensity_dists(protein_groups, exp_type, cutoff=0):
    """Collect intensities from every KO experiment of the given type into long format."""
    intensities = []
    experiments = []
    for col in protein_groups.columns:
        for exp_ext in experiment_ext_names:
            if 'Intensity' in col and col.endswith(exp_ext) and 'KO' in col:
                t = col.split('_')[-1]  # experiment-type suffix, e.g. 'WCL' or 'UbP'
                if t == exp_type:
                    # Drop zero and infinite intensities before collecting this column
                    protein_groups = protein_groups[protein_groups[col] > cutoff]
                    protein_groups = protein_groups[protein_groups[col] < float('Inf')]
                    intensities.extend(protein_groups[col])
                    for i in protein_groups[col]:
                        experiments.append(col.split(' ')[1])  # experiment name after 'Intensity '
    return pandas.DataFrame({"intensity": intensities, "experiment": experiments})

get_intensity_dists(protein_groups.copy(), 'WCL').head()


Out[26]:
experiment intensity
0 Apubscrawl_Swe1KO_WCL 35874000
1 Apubscrawl_Swe1KO_WCL 6339500
2 Apubscrawl_Swe1KO_WCL 132750000
3 Apubscrawl_Swe1KO_WCL 2717800
4 Apubscrawl_Swe1KO_WCL 4767800
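
The same long-format table can also be built with pandas.melt instead of the nested loops; note that the loop version above filters rows cumulatively across columns, so row counts can differ slightly. A sketch under the same column-matching assumptions:

# Sketch: reshape the matching KO intensity columns into long format with melt.
def get_intensity_dists_melt(pg, exp_type, cutoff=0):
    cols = [c for c in pg.columns
            if 'Intensity' in c and 'KO' in c and c.split('_')[-1] == exp_type]
    long_df = pandas.melt(pg[cols], var_name='experiment', value_name='intensity')
    long_df['experiment'] = long_df['experiment'].str.split(' ').str.get(1)
    return long_df[(long_df['intensity'] > cutoff) & (long_df['intensity'] < float('Inf'))]

# get_intensity_dists_melt(protein_groups, 'WCL').head()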

In [27]:
from scipy import stats
sns.set(rc={"figure.figsize": (14, 4)})
def plot_for_exp(exp_type):
    intensities = get_intensity_dists(protein_groups.copy(), exp_type)
    #sns.distplot(intensities['intensity'], kde=False, rug=False, label=exp_type, bins = 40);
    ax = sns.boxplot(x='experiment', y='intensity', data=intensities);
    #sns.axlabel("intensity", "frequency")
    plt.title("Intensities for %s experiments" % exp_type)
    #plt.legend(loc='upper left')
    plt.show()

plot_for_exp('WCL')
plot_for_exp('WCLP')
plot_for_exp('Ub')
plot_for_exp('UbP')



In [112]:
def filter_bad_rows(df):
    # Drop MaxQuant contaminant ('CON_') and reverse-decoy ('REV_') protein groups.
    def filter_out(prefix):
        return df[df.apply(lambda row: not any([prefix in name for name in row['Protein IDs']]), axis=1)]

    df = filter_out('CON_')
    df = filter_out('REV_')
    return df

def logspace(pg, cols, strict_cutoff=0):
    for col in cols:
        pg = pg[pg[col].map(lambda x: abs(x) > strict_cutoff)]
        pg = pg[pg[col] < float('Inf')]
        # Log2-transform, flooring small values at log2(epsilon) so the column stays in log space
        pg[col] = pg[col].map(lambda x: math.log(x, 2) if x > epsilon else math.log(epsilon, 2))
    return pg

def normalize(pg, top_col, bottom_col, cutoff=10**5):
    # Keep only rows where both intensities exceed the cutoff, then compute
    # the log2 fold change ('diff') between the two (already log2) columns.
    pg = pg[pg[top_col] > math.log(cutoff, 2)]
    pg = pg[pg[bottom_col] > math.log(cutoff, 2)]
    pg['diff'] = pg[top_col] - pg[bottom_col]
    return pg

def spaceify(top_col, bottom_col, cutoff=10**5):
    pg = protein_groups.copy()
    pg = filter_bad_rows(pg)
    pg = logspace(pg, [top_col, bottom_col])
    pg = normalize(pg, top_col, bottom_col, cutoff)
    return pg[['Protein IDs', 'Fasta headers', 'diff', top_col, bottom_col]]

def get_top(pg, thresh=2):
    top = pg.sort('diff', axis=0, ascending=False)
    top = top[top['diff'] > math.log(thresh, 2)]
    return top  # .set_index('Protein IDs')

def get_bottom(pg, thresh=1/2):
    bottom = pg.sort('diff', axis=0, ascending=True)
    bottom = bottom[bottom['diff'] < math.log(thresh, 2)]
    return bottom  # .set_index('Protein IDs')

print("Done.")


Done.
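
Because both columns are log2-transformed, 'diff' is a log2 fold change: diff = log2(sample) - log2(control) = log2(sample/control). So get_top's default thresh=2 keeps proteins with diff > 1 (at least 2-fold enriched) and get_bottom's thresh=1/2 keeps diff < -1 (at least 2-fold depleted). A quick numeric check:

# A 4-fold enriched protein has diff == 2; a 4-fold depleted one has diff == -2.
sample, control = 4.0e6, 1.0e6
diff = math.log(sample, 2) - math.log(control, 2)
assert abs(diff - 2.0) < 1e-9
assert diff > math.log(2, 2)           # passes the get_top threshold (thresh=2)
assert not (diff < math.log(1/2, 2))   # does not pass the get_bottom threshold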

In [113]:
ko = spaceify('Intensity Pynd_AlkKO_WCL', 'Intensity Control_WCL')
wclp = spaceify('Intensity Pynd_AlkKO_WCLP', 'Intensity Control_WCLP')
ubp = spaceify('Intensity Pynd_AlkKO_UbP', 'Intensity Control_UbP')
fc = spaceify('Intensity Pynd_5FC_Ub', 'Intensity Control_Ub')
print("Done.")


Done.

In [114]:
sns.set(rc={"figure.figsize": (12, 8)})
sns.set(font_scale=2)
def analyze(pg, top_col, bottom_col, cutoff=10**5):
    t = get_top(pg)
    b = get_bottom(pg)
    g = sns.lmplot(x=bottom_col, y=top_col, data=pg, size=10); # , y_jitter=.03
    plt.savefig("results/" + top_col + ".png")
    #g.set(xlim=(math.log(epsilon, 2), ))
    #g.set(ylim=(math.log(epsilon, 2), ))
    t.to_csv('results/top_' + top_col + '.csv')
    b.to_csv('results/bottom_' + top_col + '.csv')
    return t, b
    

top_ko, bottom_ko = analyze(ko, 'Intensity Pynd_AlkKO_WCL', 'Intensity Control_WCL')

top_wclp, bottom_wclp = analyze(wclp, 'Intensity Pynd_AlkKO_WCLP', 'Intensity Control_WCLP')

top_ubp, bottom_ubp = analyze(ubp, 'Intensity Pynd_AlkKO_UbP', 'Intensity Control_UbP')

top_5fc, bottom_5fc = analyze(fc, 'Intensity Pynd_5FC_Ub', 'Intensity Control_Ub')

exp_diffs = [ko, wclp, ubp, fc]


/Users/student/anaconda/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [30]:
#import wget
#wget.download('http://www.geneontology.org/ontology/obo_format_1_2/gene_ontology.1_2.obo')
#print("Done.")

In [32]:
# Let's try using these instead:
# http://www.yeastgenome.org/download-data/curation

id_to_go = {}
for line in open('data/gene_association.sgd', 'r'):
    atoms = line.split()
    if atoms[0] == 'SGD':
        # atoms[1] is the yeast SGD ID (SGDID) and atoms[6] is the GO aspect:
        # F = molecular function
        # C = cellular component
        # P = biological process
        id_to_go[(atoms[1], atoms[6])] = atoms[3]
len(id_to_go)


Out[32]:
64240
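
gene_association.sgd is a GO annotation (GAF) file, which is tab-delimited; the whitespace split above picks out the right fields only when the optional qualifier and with/from columns are empty. A more defensive tab-based sketch (column indices assume the standard GAF 2.x layout: DB object ID at 1, GO ID at 4, aspect at 8):

# Sketch: tab-delimited GAF parse. Like the cell above, it keeps only the
# last annotation seen per (SGDID, aspect) pair.
import csv

id_to_go_tab = {}
with open('data/gene_association.sgd', 'r') as fh:
    for atoms in csv.reader(fh, delimiter='\t'):
        if atoms and atoms[0] == 'SGD':
            sgdid, go_id, aspect = atoms[1], atoms[4], atoms[8]
            id_to_go_tab[(sgdid, aspect)] = go_id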

In [116]:
import re
pattern = re.compile(r'SGDID:(S\d+)')
def get_sgids(lst):
    sgids = []
    for s in lst:
        match = pattern.search(s)
        if match is not None:
            sgids.append(match.group(1))
    return sgids

assert ['S000001611'] == get_sgids(['[>YKL128C PMU1 SGDID:S000001611, Chr XI'])

def get_go_terms(row, rel):
    terms = []
    for sgid in get_sgids(row['Fasta headers']):
        if (sgid, rel) in id_to_go:
            terms.append(id_to_go[(sgid, rel)])
    return terms

print("Done.")


Done.

In [34]:
from goatools import obo_parser
obo = obo_parser.GODag('gene_ontology.1_2.obo')
print("Done.")


load obo file gene_ontology.1_2.obo
45670 nodes imported
Done.
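
The loaded GODag behaves like a dict of GO term objects with name, depth, and parents attributes, which is what the depth-walking code below relies on. A quick sanity check (GO:0005737, cytoplasm, shows up in the tables further down):

term = obo.get('GO:0005737')  # cytoplasm
if term is not None:
    print(term.name, term.depth, [p.id for p in term.parents])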

In [117]:
import random

def add_go_descs(row, rel_long, rel_short, depth=None):
    """Map each GO term in the row to the name of an ancestor at a fixed depth."""
    lst = []
    if depth is None:
        # Default summarization depths per aspect
        if rel_short == 'C':
            depth = 3
        if rel_short == 'F':
            depth = 2
        if rel_short == 'P':
            depth = 2
    for term in row['GO ' + rel_long]:
        if term in obo:
            o = obo[term]
            # Walk up the DAG (choosing an arbitrary parent) until we reach the target depth
            while o.depth > depth:
                o = random.choice(o.parents)
            lst.append(o.name)
    return lst

def dedup(df):
    """Collapse list-valued cells to their first element (or None if the list is empty)."""
    for col in df.columns:
        df[col] = df[col].map(lambda x: (x[0] if len(x) > 0 else None) if type(x) is list else x)
    return df

def show_go(exp_diff):
    exp_diff['GO component'] = exp_diff.apply(lambda row: get_go_terms(row, 'C'), axis=1)
    exp_diff['GO function'] = exp_diff.apply(lambda row: get_go_terms(row, 'F'), axis=1)
    exp_diff['GO process'] = exp_diff.apply(lambda row: get_go_terms(row, 'P'), axis=1)
    #filtered = exp_diff.copy()
    #filtered[filtered['GO'].apply(lambda terms: sum(1 for _ in filter(None, terms)) > 0)]
    exp_diff['GO component description'] = exp_diff.apply(lambda row: add_go_descs(row, 'component', 'C'), axis=1)
    exp_diff['GO function description'] = exp_diff.apply(lambda row: add_go_descs(row, 'function', 'F'), axis=1)
    exp_diff['GO process description'] = exp_diff.apply(lambda row: add_go_descs(row, 'process', 'P'), axis=1)
    return dedup(exp_diff)
    
print("Done.")


Done.

In [118]:
ko = show_go(ko)
wclp = show_go(wclp)
ubp = show_go(ubp)
fc = show_go(fc)
ko.head()


Out[118]:
Protein IDs Fasta headers diff Intensity Pynd_AlkKO_WCL Intensity Control_WCL GO component GO function GO process GO component description GO function description GO process description
132 YAL003W >YAL003W EFB1 SGDID:S000000003, Chr I from 142... -0.643788 30.909421 31.553208 GO:0005853 GO:0005085 GO:1990145 protein complex guanyl-nucleotide exchange factor activity organic substance metabolic process
134 YAL007C >YAL007C ERP2 SGDID:S000000005, Chr I from 138... -1.461469 22.780367 24.241836 GO:0030134 GO:0003674 GO:0006888 membrane-bounded vesicle molecular_function establishment of localization
136 YAL012W >YAL012W CYS3 SGDID:S000000010, Chr I from 130... -0.520081 28.201770 28.721852 GO:0005737 GO:0004123 GO:0019344 cytoplasm lyase activity cellular metabolic process
137 YAL016W >YAL016W TPD3 SGDID:S000000014, Chr I from 124... -1.952326 20.614951 22.567277 GO:0005934 None GO:0007094 cellular bud tip None regulation of biological process
139 YAL019W >YAL019W FUN30 SGDID:S000000017, Chr I from 11... 0.848908 20.515184 19.666276 GO:0031934 GO:0008094 GO:0000729 macromolecular complex hydrolase activity primary metabolic process

In [123]:
def plot_gos(df, go_type, fig_height):
    sns.set(rc={"figure.figsize": (12, fig_height)})
    sns.axlabel("log2([KO]/[control])", go_type)
    #sns.stripplot(y=go_type, x='diff', data=df)
    sns.boxplot(y=go_type, x='diff', data=df)
    #sns.set(font_scale=2)
    #sns.plt.xlim(min(df[time]),)
    plt.show()
    
plot_gos(ko, 'GO process description', 10)
plot_gos(ko, 'GO component description', 18)
plot_gos(ko, 'GO function description', 12)



In [119]:
def denormalize(df):
    """Explode list-valued rows into one row per protein ID.

    Exploratory and unused below; it expects the list-valued columns as they
    are before dedup() collapses them to single values."""
    rows_list = []
    for index, row in df.iterrows():
        for i in range(0, len(row[0])):
            dct = {}
            for j in range(0, len(df.columns)):
                dct[df.columns[j]] = row[j]
            dct['Protein ID'] = row['Protein IDs'][i]
            dct['Fasta header'] = row['Fasta headers'][i]
            dct['GO component'] = row['GO component'][i]
            dct['GO function'] = row['GO function'][i]
            dct['GO process'] = row['GO process'][i]
            rows_list.append(dct)
    return pandas.DataFrame(rows_list)

#denormalize(ko)
#ko.columns

In [120]:
top_ko['Fasta headers']


Out[120]:
1416    [>YIL022W TIM44 SGDID:S000001284, Chr IX from ...
915     [>YER088C DOT6 SGDID:S000000890, Chr V from 33...
710     [>YDR296W MHR1 SGDID:S000002704, Chr IV from 1...
1737    [>YKL128C PMU1 SGDID:S000001611, Chr XI from 2...
1672    [>YJR148W BAT2 SGDID:S000003909, Chr X from 70...
2376    [>YNL147W LSM7 SGDID:S000005091, Chr XIV from ...
1946    [>YLR195C NMT1 SGDID:S000004185, Chr XII from ...
2142    [>YML123C PHO84 SGDID:S000004592, Chr XIII fro...
1824    [>YKR084C HBS1 SGDID:S000001792, Chr XI from 5...
1507    [>YJL005W CYR1 SGDID:S000003542, Chr X from 42...
560     [>YDL223C HBT1 SGDID:S000002382, Chr IV from 6...
2900    [>YPR024W YME1 SGDID:S000006228, Chr XVI from ...
2185    [>YMR086W SEG1 SGDID:S000004692, Chr XIII from...
1830    [>YKR095W MLP1 SGDID:S000001803, Chr XI from 6...
1710    [>YKL077W YKL077W SGDID:S000001560, Chr XI fro...
161     [>YAL060W BDH1 SGDID:S000000056, Chr I from 35...
724     [>YDR326C YSP2 SGDID:S000002734, Chr IV from 1...
2406    [>YNL207W RIO2 SGDID:S000005151, Chr XIV from ...
1130    [>YGL255W ZRT1 SGDID:S000003224, Chr VII from ...
2886    [>YPL271W ATP15 SGDID:S000006192, Chr XVI from...
2681    [>YOR208W PTP2 SGDID:S000005734, Chr XV from 7...
812     [>YDR508C GNP1 SGDID:S000002916, Chr IV from 1...
1243    [>YGR229C SMI1 SGDID:S000003461, Chr VII from ...
2457    [>YNL307C MCK1 SGDID:S000005251, Chr XIV from ...
2950    [>YPR148C YPR148C SGDID:S000006352, Chr XVI fr...
1357    [>YHR098C SFB3 SGDID:S000001140, Chr VIII from...
2688    [>YOR220W RCN2 SGDID:S000005746, Chr XV from 7...
1651    [>YJR103W URA8 SGDID:S000003864, Chr X from 62...
405     [>YCL064C CHA1 SGDID:S000000569, Chr III from ...
2462    [>YNL312W RFA2 SGDID:S000005256, Chr XIV from ...
581     [>YDR028C REG1 SGDID:S000002435, Chr IV from 5...
2340    [>YNL085W MKT1 SGDID:S000005029, Chr XIV from ...
1537    [>YJL068C YJL068C SGDID:S000003604, Chr X from...
1303    [>YHR009C TDA3 SGDID:S000001051, Chr VIII from...
292     [>YBR111C YSA1 SGDID:S000000315, Chr II from 4...
2397    [>YNL183C NPR1 SGDID:S000005127, Chr XIV from ...
2249    [>YMR220W ERG8 SGDID:S000004833, Chr XIII from...
1110    [>YGL197W MDS3 SGDID:S000003165, Chr VII from ...
1030    [>YGL001C ERG26 SGDID:S000002969, Chr VII from...
2515    [>YOL031C SIL1 SGDID:S000005391, Chr XV from 2...
1727    [>YKL108W SLD2 SGDID:S000001591, Chr XI from 2...
2032    [>YLR370C ARC18 SGDID:S000004362, Chr XII from...
Name: Fasta headers, dtype: object

In [122]:
# GO enrichment: write SGD ID lists for the enriched, depleted, and background sets
top_top_ko = get_top(top_ko, thresh=2)
with open('results/ko_top_sgdids.txt', 'w') as f:
    for k in top_top_ko['Fasta headers']:
        f.write("%s\n" % get_sgids(k)[0])

bottom_bottom_ko = get_bottom(bottom_ko, thresh=1/2)
with open('results/ko_bottom_sgdids.txt', 'w') as f:
    for k in bottom_bottom_ko['Fasta headers']:
        f.write("%s\n" % get_sgids(k)[0])

# ko has been through dedup(), so its 'Fasta headers' entries are strings, not lists
with open('results/ko_sgdids.txt', 'w') as f:
    for k in ko['Fasta headers']:
        f.write("%s\n" % get_sgids([k])[0])

print("Done.")


Done.
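
These SGD ID lists are written out for enrichment testing. As an alternative to an external tool, recent goatools versions include an enrichment test that could be run directly on these sets; the sketch below assumes that API (GOEnrichmentStudy / run_study) and builds a minimal association from id_to_go, which keeps only one GO term per (gene, aspect) and so understates the true annotation set:

# Sketch: GO enrichment of the KO 'top' set against all quantified proteins.
# Assumes goatools' GOEnrichmentStudy API; for a real run, rebuild the
# association from the GAF file keeping every annotation per gene.
from goatools.go_enrichment import GOEnrichmentStudy

def read_ids(path):
    with open(path) as fh:
        return set(line.strip() for line in fh if line.strip())

population = read_ids('results/ko_sgdids.txt')
study = read_ids('results/ko_top_sgdids.txt')

assoc = {}
for (sgdid, aspect), go_id in id_to_go.items():
    assoc.setdefault(sgdid, set()).add(go_id)

g = GOEnrichmentStudy(population, assoc, obo, alpha=0.05, methods=['bonferroni'])
results = g.run_study(study)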
