Load libraries.
In [1]:
using LightXML
using DataArrays
using DataFrames
In [2]:
png_xdoc = parse_file("PNG.gbc.xml");
I start by identifying the root element.
In [3]:
png_xroot = root(png_xdoc);
I extract all the sequences and accession numbers as lists, the latter using a comprehension.
In [4]:
png_sequences = get_elements_by_tagname(png_xroot, "INSDSeq")
png_accessions = [content(find_element(s,"INSDSeq_primary-accession")) for s in png_sequences];
In [5]:
png_numseq=length(png_sequences)
Out[5]:
To extract all the information that is held in the list of INSDQualifiers, I loop through all the sequences and generate a dictionary with accession as the key and a dictionary of qualifiers as the value.
I start by initialising an empty dictionary, with strings as both the key and the value.
In [6]:
png_seq_dict=Dict{ASCIIString,Dict{ASCIIString,ASCIIString}}()
Out[6]:
Extracting the information is a mixture of find_element and find_elements_by_tagname to search for the right elements, get_elements_by_tagname, and finally using content to extract the contents of the qualifiers.
In [7]:
for i in 1:png_numseq
s=png_sequences[i]
accession=content(find_element(s, "INSDSeq_primary-accession"))
feature_table=find_element(s,"INSDSeq_feature-table")
features=get_elements_by_tagname(feature_table,"INSDFeature")
feature_quals=get_elements_by_tagname(features[1], "INSDFeature_quals")
qualifiers=get_elements_by_tagname(feature_quals[1], "INSDQualifier")
qualifier_dict=Dict{ASCIIString,ASCIIString}()
for q in qualifiers
n=find_element(q,"INSDQualifier_name")
v=find_element(q,"INSDQualifier_value")
if v!=nothing
qualifier_dict[content(n)]=content(v)
end
end
png_seq_dict[accession]=qualifier_dict
end;
To flatten the dictionary, I first make a dictionary of all feature names, with the number of times the field is found.
In [8]:
png_fn_dict=Dict{ASCIIString,Int64}()
for acc in keys(png_seq_dict)
features=png_seq_dict[acc]
for k in keys(features)
current_count=get(png_fn_dict,k,0)
png_fn_dict[k]=current_count+1
end
end
png_fn_dict
Out[8]:
I extract the names of the qualifiers as a list, that will be used below to construct a DataFrame.
In [9]:
png_feature_names=collect(keys(png_fn_dict))
Out[9]:
I then loop through each feature name, for each sequence, determine whether the feature is present, and construct a DataArray, which is then added to a DataFrame.
In [11]:
png_df=DataFrame(accession=png_accessions)
png_numfeatures=length(png_feature_names)
for i in 1:png_numfeatures
key=png_feature_names[i]
dv=DataArray(ASCIIString[],Bool[])
for j in 1:png_numseq
acc=png_accessions[j]
f=png_seq_dict[acc]
val=get(f,key,NA) # NA is the default
push!(dv,val)
end
png_df[symbol(key)]=dv
end;
I now have a DataFrame that has the features in a flat format.
In [12]:
head(png_df)
Out[12]:
In [13]:
display(countmap(png_df[:isolate]))
The annotations can now be written to file as a table.
In [14]:
writetable("png_annotations.txt", png_df, separator = '\t', header = true)
In [15]:
png_seqstrings=[content(find_element(s,"INSDSeq_sequence")) for s in png_sequences];
There are various options to output these sequences as a FASTA file. Perhaps the most straightforward is to using the @printf macro, which if given a stream as the first argument, will print formatted text to the stream.
In [16]:
f=open("png_sequences.fas","w")
for i in 1:png_numseq
@printf(f,">%s\n%s\n",png_accessions[i],png_seqstrings[i])
end
close(f)