In [1]:
from lxml import etree
import pandas as pd

In [2]:
# Parse 'biosample_result.xml' (path is relative to the working directory)
# into an lxml element tree; later cells iterate over its <BioSample> elements.
tree = etree.parse('biosample_result.xml')

In [3]:
# XPath queries, evaluated relative to a single <BioSample> element, keyed by
# the name each extracted value is stored under.
BIOSAMPLE_XPATHS = {
    "biosample_id": ".//Id[@db='BioSample']",
    "sra_id": ".//Id[@db='SRA']",
    "title": ".//Title",
    "paragraph": ".//Description/Comment/Paragraph",
}
# The remaining entries all select an <Attribute> element by its
# attribute_name; each pair is (key in this dict, attribute_name to match).
BIOSAMPLE_XPATHS.update(
    (column, ".//Attribute[@attribute_name='{}']".format(attribute_name))
    for column, attribute_name in (
        ("tissue", "tissue"),
        ("age", "age"),
        ("cultivar", "cultivar"),
        ("maize_cultivar", "maize cultivar"),
        ("genotype", "genotype"),
        ("ecotype", "ecotype"),
        ("isolate", "isolate"),
        ("dev_stage", "dev_stage"),
        ("leaf_number", "leaf number"),
        ("source_name", "source name"),
        ("label", "label"),
    )
)

In [4]:
def extract_biosample_values(biosample, xpaths):
    """Extract one text value per XPath query from a <BioSample> element.

    Args:
        biosample: The element to query. Only its ``xpath(query)`` method
            is used, and each match is read through its ``.text`` attribute.
        xpaths: Dict mapping output names to XPath query strings, evaluated
            relative to ``biosample``.

    Returns:
        A dict with exactly the keys of ``xpaths``. Each value is the text
        of the first element matched by the corresponding query, or the
        empty string when the query matches nothing or the matched element
        has no text content.
    """
    result = {}
    for name, query in xpaths.items():
        matches = biosample.xpath(query)
        # XPath queries return a list of matches. Only the first one is
        # used; the queries are assumed to match at most one element per
        # sample, and any additional matches are ignored.
        first_text = matches[0].text if matches else None
        # `.text` is None for an element without text content. Normalise it
        # to "" exactly like the no-match case, so the resulting table holds
        # empty strings rather than None/NaN.
        result[name] = first_text or ""
    return result

In [5]:
# Build one dict of extracted values for every <BioSample> element in the tree.
extracted_samples = [
    extract_biosample_values(biosample, BIOSAMPLE_XPATHS)
    for biosample in tree.iter('BioSample')
]

In [6]:
# Turn the list of per-sample dicts into a table: one row per sample,
# one column per dict key.
samples_table = pd.DataFrame(extracted_samples)

In [7]:
# Display the table as the cell output (pandas truncates long frames when rendering).
samples_table


Out[7]:
age biosample_id cultivar dev_stage ecotype genotype isolate label leaf_number maize_cultivar paragraph source_name sra_id tissue title
0 9 SAMEA5605513 B73 wild type genotype ERS3409710 E-MTAB-7200_2:Sample 9
1 9 SAMEA5605512 B73 wild type genotype ERS3409709 E-MTAB-7200_2:Sample 8
2 9 SAMEA5605511 B73 wild type genotype ERS3409708 E-MTAB-7200_2:Sample 7
3 9 SAMEA5605510 B73 wild type genotype ERS3409707 E-MTAB-7200_2:Sample 6
4 9 SAMEA5605509 B73 wild type genotype ERS3409706 E-MTAB-7200_2:Sample 5
5 9 SAMEA5605508 B73 wild type genotype ERS3409705 E-MTAB-7200_2:Sample 4
6 9 SAMEA5605507 B73 wild type genotype ERS3409704 E-MTAB-7200_2:Sample 3
7 9 SAMEA5605506 B73 wild type genotype ERS3409703 E-MTAB-7200_2:Sample 2
8 9 SAMEA5605505 B73 wild type genotype ERS3409702 E-MTAB-7200_2:Sample 10
9 9 SAMEA5605504 B73 wild type genotype ERS3409701 E-MTAB-7200_2:Sample 1
10 SAMEA5229560 ERS3037001 B73
11 7 days SAMN03217742 Zea mays B73 In this study, temporal changes of gene and mi... SRS750388 leaf Greening process of maize and rice leaves
12 13 days after sowing SAMN03100283 B73 SRS718868 etiolated tissue Zea mays subsp. mays: B73 DNA
13 SAMN03031762 B73 lg1-R SRS700189 P6 leaf primordia lg1-3
14 SAMN03031761 B73 lg1-R SRS700188 P6 leaf primordia lg1-2
15 SAMN03031760 B73 lg1-R SRS700187 P6 leaf primordia lg1-1
16 SAMN03031759 B73 wild type SRS700186 P6 leaf primordia wtL-3
17 SAMN03031758 B73 wild type SRS700185 P6 leaf primordia wtL-2
18 SAMN03031757 B73 wild type SRS700184 P6 leaf primordia wtL-1
19 2 weeks old SAMN02996429 B73 SRS685657 shoot meristem Plant sample from Zea mays subsp. mays
20 2 weeks old SAMN02996428 B73 SRS685656 Shoot meristem Plant sample from Zea mays subsp. mays
21 2 weeks old SAMN02996427 B73 SRS685655 laser microdissected shoot meristem Plant sample from Zea mays subsp. mays
22 2 weeks old SAMN02996426 B73 SRS685654 Whole seedling Plant sample from Zea mays subsp. mays
23 2 weeks SAMN02996425 B73 SRS685653 Whole seedling Plant sample from Zea mays subsp. mays
24 SAMN02401801 SRS502117 PARE library from Maize B73, pollen
25 SAMN02351542 inbred line Mo17 B73-stele RNA isolated from individual tissues of young ... SRS478255 Zea mays, Mo17 4-day-old primary roots from st...
26 SAMN02203474 SRS444558 Zea Mays, B73 x Mo17 BSA tall
27 SAMN02203473 SRS444557 Zea Mays, B73 x Mo17 BSA short
28 SAMN02203472 SRS444556 Zea Mays, B73 x Mo17 BSA late
29 SAMN02203471 SRS444555 Zea Mays, B73 x Mo17 BSA early
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3858 36 SAMN12138014 B73 ZM2 SRS5023549 tassel primordia Plant sample from Zea mays subsp. mays
3859 36 SAMN12138013 B73 ZM3 SRS5023547 tassel primordia Plant sample from Zea mays subsp. mays
3860 38 SAMN12138012 B73 ZM3 SRS5023546 tassel primordia Plant sample from Zea mays subsp. mays
3861 38 SAMN12138011 B73 ZM3 SRS5023545 tassel primordia Plant sample from Zea mays subsp. mays
3862 38 SAMN12138010 B73 ZM4 SRS5023544 tassel primordia Plant sample from Zea mays subsp. mays
3863 38 SAMN12138009 B73 ZM3 SRS5023543 tassel primordia Plant sample from Zea mays subsp. mays
3864 38 SAMN12138008 B73 ZM3 SRS5023542 tassel primordia Plant sample from Zea mays subsp. mays
3865 38 SAMN12138007 B73 ZM2 SRS5023541 tassel primordia Plant sample from Zea mays subsp. mays
3866 39 SAMN12138006 B73 ZM3 SRS5023539 tassel primordia Plant sample from Zea mays subsp. mays
3867 39 SAMN12138005 B73 ZM4 SRS5023540 tassel primordia Plant sample from Zea mays subsp. mays
3868 39 SAMN12138004 B73 ZM3 SRS5023550 tassel primordia Plant sample from Zea mays subsp. mays
3869 39 SAMN12138003 B73 ZM4 SRS5023551 tassel primordia Plant sample from Zea mays subsp. mays
3870 40 SAMN12138002 B73 ZM4 SRS5023563 tassel primordia Plant sample from Zea mays subsp. mays
3871 40 SAMN12138001 B73 ZM3 SRS5023565 tassel primordia Plant sample from Zea mays subsp. mays
3872 40 SAMN12138000 B73 ZM4 SRS5023562 tassel primordia Plant sample from Zea mays subsp. mays
3873 41 SAMN12137999 B73 ZM5 SRS5023564 tassel primordia Plant sample from Zea mays subsp. mays
3874 41 SAMN12137998 B73 ZM5 SRS5023560 tassel primordia Plant sample from Zea mays subsp. mays
3875 41 SAMN12137997 B73 ZM4 SRS5023561 tassel primordia Plant sample from Zea mays subsp. mays
3876 41 SAMN12137996 B73 ZM4 SRS5023558 tassel primordia Plant sample from Zea mays subsp. mays
3877 42 SAMN12137995 B73 ZM5 SRS5023559 tassel primordia Plant sample from Zea mays subsp. mays
3878 42 SAMN12137994 B73 ZM5 SRS5023576 tassel primordia Plant sample from Zea mays subsp. mays
3879 42 SAMN12137993 B73 ZM5 SRS5023577 tassel primordia Plant sample from Zea mays subsp. mays
3880 42 SAMN12137992 B73 ZM5 SRS5023574 tassel primordia Plant sample from Zea mays subsp. mays
3881 42 SAMN12137991 B73 ZM4 SRS5023575 tassel primordia Plant sample from Zea mays subsp. mays
3882 42 SAMN12137990 B73 ZM4 SRS5023572 tassel primordia Plant sample from Zea mays subsp. mays
3883 42 SAMN12137989 B73 ZM4 SRS5023573 tassel primordia Plant sample from Zea mays subsp. mays
3884 42 SAMN12137988 B73 ZM3 SRS5023570 tassel primordia Plant sample from Zea mays subsp. mays
3885 45 SAMN12137987 B73 ZM5 SRS5023571 tassel primordia Plant sample from Zea mays subsp. mays
3886 45 SAMN12137986 B73 ZM5 SRS5023568 tassel primordia Plant sample from Zea mays subsp. mays
3887 45 SAMN12137985 B73 ZM5 SRS5023569 tassel primordia Plant sample from Zea mays subsp. mays

3888 rows × 15 columns


In [8]:
# Write the table to 'biosample_result.csv' in the working directory. With the
# default arguments the row index is written as an unnamed first column.
samples_table.to_csv('biosample_result.csv')

In [ ]: