Filtering MapMan bincodes with Python and Pandas


In [1]:
import pandas
import networkx as nx
from networkx import DiGraph

In [19]:
# Unpack the xz-compressed tar archive (x = extract, v = verbose, f = archive file, J = xz).
!tar xvfJ mapman_cleaned_output_new.tar.xz
# Load the extracted table: tab-separated, with the single quote as quote character.
mapman = pandas.read_csv('mapman_cleaned_output_new.txt', sep='\t', quotechar="'")


mapman_cleaned_output_new.txt

In [20]:
# Show the loaded table; the rendering below elides the middle rows.
mapman


Out[20]:
BINCODE NAME IDENTIFIER DESCRIPTION TYPE
0 0 control genes NaN NaN NaN
1 1 PS PGSC0003DMP400034000 moderately similar to ( 301) AT5G38660 | Symbo... T
2 1.1 PS.lightreaction NaN NaN NaN
3 1.1.1 PS.lightreaction.photosystem II NaN NaN NaN
4 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400023745 moderately similar to ( 457) AT1G29930 | Symbo... T
5 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400007919 moderately similar to ( 275) AT1G45474 | Symbo... T
6 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400022300 moderately similar to ( 359) AT1G15820 | Symbo... T
7 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400017530 very weakly similar to (82.8) AT1G76570 | Symb... T
8 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400023811 weakly similar to ( 109) AT1G29930 | Symbols: ... T
9 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400055858 moderately similar to ( 437) AT3G47470 | Symbo... T
10 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400023743 moderately similar to ( 440) AT1G29930 | Symbo... T
11 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400023740 moderately similar to ( 228) AT3G27690 | Symbo... T
12 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400033446 moderately similar to ( 308) AT5G54270 | Symbo... T
13 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400033906 moderately similar to ( 444) AT3G47470 | Symbo... T
14 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400013707 moderately similar to ( 414) AT1G61520 | Symbo... T
15 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400014554 moderately similar to ( 291) AT3G27690 | Symbo... T
16 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400017529 moderately similar to ( 497) AT1G76570 | Symbo... T
17 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400023739 moderately similar to ( 446) AT1G29930 | Symbo... T
18 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400014894 moderately similar to ( 461) AT3G08940 | Symbo... T
19 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400010908 moderately similar to ( 474) AT3G27690 | Symbo... T
20 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400023813 moderately similar to ( 291) AT2G34430 | Symbo... T
21 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400029202 moderately similar to ( 459) AT1G29930 | Symbo... T
22 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400014555 moderately similar to ( 444) AT2G34430 | Symbo... T
23 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400005227 moderately similar to ( 397) AT1G19150 | Symbo... T
24 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400015035 moderately similar to ( 460) AT5G54270 | Symbo... T
25 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400014577 moderately similar to ( 447) AT1G29930 | Symbo... T
26 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400015464 moderately similar to ( 427) AT1G29930 | Symbo... T
27 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400014556 moderately similar to ( 438) AT2G34430 | Symbo... T
28 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400033445 moderately similar to ( 467) AT5G54270 | Symbo... T
29 1.1.1.1 PS.lightreaction.photosystem II.LHC-II PGSC0003DMP400023744 moderately similar to ( 421) AT1G29930 | Symbo... T
... ... ... ... ... ...
58622 35.2 not assigned.unknown PGSC0003DMP400068998 PGSC0003DMT400097323 T
58623 35.2 not assigned.unknown PGSC0003DMP400009218 PGSC0003DMT400013300 T
58624 35.2 not assigned.unknown PGSC0003DMP400068138 PGSC0003DMT400096463 T
58625 35.2 not assigned.unknown PGSC0003DMP400032606 PGSC0003DMT400048142 T
58626 35.2 not assigned.unknown PGSC0003DMP400039207 moderately similar to ( 327) AT1G59710 | Symbo... T
58627 35.2 not assigned.unknown PGSC0003DMP400021885 PGSC0003DMT400032213 T
58628 35.2 not assigned.unknown PGSC0003DMP400054941 weakly similar to ( 179) AT3G07530 | Symbols: ... T
58629 35.2 not assigned.unknown PGSC0003DMP400031883 PGSC0003DMT400047139 T
58630 35.2 not assigned.unknown PGSC0003DMP400039276 weakly similar to ( 106) loc_os05g30920 12005.... T
58631 35.2 not assigned.unknown PGSC0003DMP400012444 PGSC0003DMT400018078 T
58632 35.2 not assigned.unknown PGSC0003DMP400029103 moderately similar to ( 206) AT5G05240 | Symbo... T
58633 35.2 not assigned.unknown PGSC0003DMP400057913 PGSC0003DMT400086238 T
58634 35.2 not assigned.unknown PGSC0003DMP400058780 PGSC0003DMT400087105 T
58635 35.2 not assigned.unknown PGSC0003DMP400068370 PGSC0003DMT400096695 T
58636 35.2 not assigned.unknown PGSC0003DMP400019868 PGSC0003DMT400029196 T
58637 35.2 not assigned.unknown PGSC0003DMP400037261 PGSC0003DMT400055366 T
58638 35.2 not assigned.unknown PGSC0003DMP400061181 PGSC0003DMT400089506 T
58639 35.2 not assigned.unknown PGSC0003DMP400058564 PGSC0003DMT400086889 T
58640 35.2 not assigned.unknown PGSC0003DMP400041067 highly similar to ( 568) AT2G41640 | Symbols: ... T
58641 35.2 not assigned.unknown PGSC0003DMP400058838 PGSC0003DMT400087163 T
58642 35.2 not assigned.unknown PGSC0003DMP400053281 moderately similar to ( 341) AT1G66680 | Symbo... T
58643 35.2 not assigned.unknown PGSC0003DMP400064582 PGSC0003DMT400092907 T
58644 35.2 not assigned.unknown PGSC0003DMP400047950 PGSC0003DMT400070905 T
58645 35.2 not assigned.unknown PGSC0003DMP400023596 PGSC0003DMT400034695 T
58646 35.2 not assigned.unknown PGSC0003DMP400035918 weakly similar to ( 191) AT5G20120 | Symbols: ... T
58647 35.2.1001 not assigned.unknown NaN NaN NaN
58648 35.3 not assigned.disagreeing hits NaN NaN NaN
58649 991 Mineral Nutrition NaN NaN NaN
58650 991.1 Mineral Nutrition.phosphatie NaN NaN NaN
58651 991.1.1001 Mineral Nutrition.phosphate NaN NaN NaN

58652 rows × 5 columns


In [4]:
# The BINCODE column holds the dot-separated bin IDs (dtype object, see output below).
mapman.BINCODE


Out[4]:
0                 0
1                 1
2               1.1
3             1.1.1
4           1.1.1.1
5           1.1.1.1
6           1.1.1.1
7           1.1.1.1
8           1.1.1.1
9           1.1.1.1
10          1.1.1.1
11          1.1.1.1
12          1.1.1.1
13          1.1.1.1
14          1.1.1.1
15          1.1.1.1
16          1.1.1.1
17          1.1.1.1
18          1.1.1.1
19          1.1.1.1
20          1.1.1.1
21          1.1.1.1
22          1.1.1.1
23          1.1.1.1
24          1.1.1.1
25          1.1.1.1
26          1.1.1.1
27          1.1.1.1
28          1.1.1.1
29          1.1.1.1
            ...    
58622          35.2
58623          35.2
58624          35.2
58625          35.2
58626          35.2
58627          35.2
58628          35.2
58629          35.2
58630          35.2
58631          35.2
58632          35.2
58633          35.2
58634          35.2
58635          35.2
58636          35.2
58637          35.2
58638          35.2
58639          35.2
58640          35.2
58641          35.2
58642          35.2
58643          35.2
58644          35.2
58645          35.2
58646          35.2
58647     35.2.1001
58648          35.3
58649           991
58650         991.1
58651    991.1.1001
Name: BINCODE, dtype: object

In [5]:
def mapman2tree(mapman_table):
    """
    returns a directed graph of the bin hierarchy found in the
    'BINCODE' column of the given table.

    Each bincode is split at the dots and every prefix becomes a node,
    e.g. '1.1.4' yields the nodes '1', '1.1' and '1.1.4', connected as
    root -> '1' -> '1.1' -> '1.1.4'.

    Parameters
    ----------
    mapman_table : object supporting ``mapman_table['BINCODE']``
        the lookup must return an iterable of bincodes (e.g. a pandas
        DataFrame column). Missing values (None / NaN) are skipped,
        non-string values are converted with str().

    Returns
    -------
    mapman_graph : DiGraph
        the hierarchy graph. Its artificial top node is the string
        'root', which is also stored in the attribute `mapman_graph.root`.
    """
    mapman_graph = DiGraph()
    mapman_graph.root = 'root'
    mapman_graph.add_node(mapman_graph.root)

    seen_bincodes = set()
    for bincode in mapman_table['BINCODE']:
        # NaN is the only value that is not equal to itself
        if bincode is None or bincode != bincode:
            continue
        if not hasattr(bincode, 'split'):
            bincode = str(bincode)
        # the table has one row per identifier, so most bincodes repeat;
        # processing each distinct bincode once gives the same graph
        if bincode in seen_bincodes:
            continue
        seen_bincodes.add(bincode)

        levels = bincode.split('.')
        parent_level_id = mapman_graph.root
        for depth in range(1, len(levels) + 1):
            level_id = '.'.join(levels[:depth])
            # add_edge() creates nodes that are not in the graph yet
            mapman_graph.add_edge(parent_level_id, level_id)
            parent_level_id = level_id
    return mapman_graph

In [6]:
# Build the bin hierarchy graph once; the cells below query it.
mapman_graph = mapman2tree(mapman)

In [7]:
# Summary of the graph; the parenthesised form prints the same text on Python 2 and Python 3.
print(nx.info(mapman_graph))


Name: 
Type: DiGraph
Number of nodes: 2264
Number of edges: 2263
Average in degree:   0.9996
Average out degree:   0.9996

In [8]:
# Direct children of top-level bin '1' (nodes reached by an edge starting at '1').
mapman_graph.successors('1')


Out[8]:
['1.4', '1.5', '1.1', '1.2', '1.3']

In [9]:
def get_indices_in_bin(mapman_graph, bin_id=None):
    """
    collects the given bin ID together with the IDs of all bins below
    it in the hierarchy and returns them as a set, e.g. bin_id '1.1'
    gives {'1.1', ... '1.1.2', ... '1.1.4.3.43'}.

    Without a bin ID the search starts at the root of the graph
    (`mapman_graph.root`), so the result holds every node of the
    graph, the root node itself included.
    """
    start = mapman_graph.root if bin_id is None else bin_id
    collected = {start}

    # dfs_successors() maps each visited node to the list of its children
    for children in nx.dfs_successors(mapman_graph, start).values():
        collected.update(children)
    return collected

In [10]:
# No bin ID given: everything reachable from the root is collected, the root node included.
len(get_indices_in_bin(mapman_graph))


Out[10]:
2264

In [11]:
# All bin IDs in the '1.2' subtree, '1.2' itself included.
get_indices_in_bin(mapman_graph, '1.2')


Out[11]:
{'1.2',
 '1.2.1',
 '1.2.1001',
 '1.2.1002',
 '1.2.1003',
 '1.2.1004',
 '1.2.1005',
 '1.2.1006',
 '1.2.2',
 '1.2.3',
 '1.2.4',
 '1.2.4.1',
 '1.2.4.2',
 '1.2.4.3',
 '1.2.4.4',
 '1.2.5',
 '1.2.6',
 '1.2.7'}

In [12]:
# Select every row whose BINCODE is '1.1.4' or one of its sub-bins.
mapman[mapman.BINCODE.isin(get_indices_in_bin(mapman_graph, '1.1.4'))]


Out[12]:
BINCODE NAME IDENTIFIER DESCRIPTION TYPE
151 1.1.4 PS.lightreaction.ATP synthase PGSC0003DMP400024440 weakly similar to ( 171) AT2G07698 | Symbols: ... T
152 1.1.4 PS.lightreaction.ATP synthase PGSC0003DMP400035579 weakly similar to ( 145) AT4G32260 | Symbols: ... T
153 1.1.4 PS.lightreaction.ATP synthase PGSC0003DMP400013846 weakly similar to ( 155) AT2G31040 | Symbols: ... T
154 1.1.4 PS.lightreaction.ATP synthase PGSC0003DMP400013845 moderately similar to ( 414) AT2G31040 | Symbo... T
155 1.1.4 PS.lightreaction.ATP synthase PGSC0003DMP400035510 moderately similar to ( 326) AT2G31040 | Symbo... T
156 1.1.4 PS.lightreaction.ATP synthase PGSC0003DMP400038511 moderately similar to ( 423) AT2G07698 | Symbo... T
157 1.1.4 PS.lightreaction.ATP synthase PGSC0003DMP400014872 weakly similar to ( 135) ATCG00120 | Symbols: ... T
158 1.1.4 PS.lightreaction.ATP synthase PGSC0003DMP400035552 weakly similar to ( 145) AT4G32260 | Symbols: ... T
159 1.1.4.1 PS.lightreaction.ATP synthase.alpha subunit NaN NaN NaN
160 1.1.4.2 PS.lightreaction.ATP synthase.beta subunit PGSC0003DMP400056384 very weakly similar to (99.8) ATCG00480 | Symb... T
161 1.1.4.3 PS.lightreaction.ATP synthase.epsilon chain PGSC0003DMP400043561 moderately similar to ( 205) ATCG00470 | Symbo... T
162 1.1.4.4 PS.lightreaction.ATP synthase.gamma chain PGSC0003DMP400028825 highly similar to ( 535) AT4G04640 | Symbols: ... T
163 1.1.4.4 PS.lightreaction.ATP synthase.gamma chain PGSC0003DMP400048475 moderately similar to ( 450) AT4G04640 | Symbo... T
164 1.1.4.5 PS.lightreaction.ATP synthase.subunit C NaN NaN NaN
165 1.1.4.6 PS.lightreaction.ATP synthase.chloroplastic su... NaN NaN NaN
166 1.1.4.7 PS.lightreaction.ATP synthase.delta chain PGSC0003DMP400029632 moderately similar to ( 224) AT4G09650 | Symbo... T
167 1.1.4.7 PS.lightreaction.ATP synthase.delta chain PGSC0003DMP400012415 very weakly similar to (88.2) AT4G09650 | Symb... T
168 1.1.4.8 PS.lightreaction.ATP synthase.subunit B (ATPF) NaN NaN NaN
169 1.1.4.9 PS.lightreaction.ATP synthase.subunit B_ (ATPX) NaN NaN NaN

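Count the elements in all bins (each count covers the bin and all of its sub-bins; rows with a NaN identifier are still included here)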


In [13]:
import re

# the capturing group makes split() keep the digit runs in its result
INTEGER_RE = re.compile('([0-9]+)')

# this is the sort order understood by most humans
def natural_sort_key(s):
    """
    returns a key that can be used in sort functions.

    The string representation of `s` is cut into alternating pieces of
    text and runs of the digits 0-9; the digit runs are converted to
    integers, so that they are compared by value.

    Example:

    >>> items = ['A99', 'a1', 'a2', 'a10', 'a24', 'a12', 'a100']

    The normal sort function will ignore the natural order of the
    integers in the string:

    >>> sorted(items)
    ['A99', 'a1', 'a10', 'a100', 'a12', 'a2', 'a24']

    When we use this function as a key to the sort function,
    the natural order of the integer is considered.

    >>> sorted(items, key=natural_sort_key)
    ['A99', 'a1', 'a2', 'a10', 'a12', 'a24', 'a100']
    """
    tokens = INTEGER_RE.split(str(s))
    # split() with one capturing group always returns text, digits, text, ...
    # so the odd positions are exactly the digit runs. Testing the position
    # instead of isdigit() keeps characters like a superscript two (which
    # isdigit() accepts, but int() or the pattern do not) as plain text.
    return [int(token) if position % 2 else token
            for position, token in enumerate(tokens)]

In [14]:
# One line per node of the graph, in natural sort order: "<bin ID>\t<row count>".
# The count covers the rows of the bin itself and of all bins below it.
all_bin_ids = sorted(get_indices_in_bin(mapman_graph), key=natural_sort_key)
with open('mapman_bin_counts.tsv', 'w') as counts_file:
    for bin_id in all_bin_ids:
        bins_in_subtree = get_indices_in_bin(mapman_graph, bin_id)
        # summing a boolean mask counts the rows that are selected by it
        row_count = int(mapman.BINCODE.isin(bins_in_subtree).sum())
        counts_file.write("{}\t{}\n".format(bin_id, row_count))

In [15]:
# Peek at the first lines of the generated counts file.
!head 'mapman_bin_counts.tsv'


0	1
1	428
1.1	263
1.1.1	109
1.1.1.1	45
1.1.1.2	61
1.1.1.3	1
1.1.1.4	1
1.1.2	30
1.1.2.1	6

improved output format

  • add header
  • don't count rows with NaN identifier
  • report two counts per bin: rows assigned directly to the bin vs. recursive counts (the bin plus all of its sub-bins)

In [22]:
# Rows without an identifier only declare that a bin exists; they are not
# counted. The mask does not depend on the bin, so it is computed once.
nan_identifier_rows = mapman.IDENTIFIER.isnull()

with open('mapman_bin_counts_without_nan.tsv', 'w') as counts_file:
    counts_file.write("BIN-ID\tCOUNTS\tRECURSIVE-COUNTS\n")
    for bin_id in sorted(get_indices_in_bin(mapman_graph), key=natural_sort_key):
        # COUNTS: rows assigned to exactly this bin
        rows_in_bin = mapman.BINCODE == bin_id
        # RECURSIVE-COUNTS: rows assigned to this bin or any bin below it
        rows_in_bin_recursive = mapman.BINCODE.isin(get_indices_in_bin(mapman_graph, bin_id))

        # `~` negates a boolean mask; unary `-` is rejected for boolean
        # data by current NumPy/pandas versions
        counts = len(mapman[rows_in_bin & ~nan_identifier_rows])
        recursive_counts = len(mapman[rows_in_bin_recursive & ~nan_identifier_rows])
        counts_file.write("{}\t{}\t{}\n".format(bin_id, counts, recursive_counts))

In [23]:
# Peek at the first lines of the generated counts file (header plus the first bins).
!head 'mapman_bin_counts_without_nan.tsv'


BIN-ID	COUNTS	RECURSIVE-COUNTS
0	0	0
1	1	382
1.1	0	240
1.1.1	0	106
1.1.1.1	45	45
1.1.1.2	61	61
1.1.1.3	0	0
1.1.1.4	0	0
1.1.2	3	29