Display the development of the PDB over time

This is supplementary data for "The Life Cycle of Structural Biology Data", submitted to the Data Science Journal.

New Entries Released Each Year



In [3]:

    
import pandas

PDBE_SOLR_URL = "http://www.ebi.ac.uk/pdbe/search/pdb"                 
# or https://www.ebi.ac.uk/pdbe/search/pdb/select?rows=0&q=status:REL&wt=json

from mysolr import Solr
solr = Solr(PDBE_SOLR_URL, version=4)

response = solr.search(**{
        "rows" : 0, # "fl" : "deposition_year", 
        "q" : 'status:REL',
        "facet" : "true", "facet.limit" : 1000000, "facet.mincount" : 1,
        "facet.field" : "deposition_year",
    }
)
from matplotlib import pyplot as plt

# skip current year, which is not yet complete
years = sorted( response.facets['facet_fields']['deposition_year'] )[0:-1]
ticks = [1980, 1990, 2000, 2010]

fig = plt.figure()
plt.yscale('log')
plt.ylabel('PDB entries released')
plt.plot(years, [ response.facets['facet_fields']['deposition_year'][year] for year in years])
plt.xticks(ticks, [str(t) for t in ticks])
fig.autofmt_xdate()
plt.savefig('../../images/PDB.eps', format='eps', dpi=1000)
plt.show()

Complexes by year

If an entry referes to more than one macromolecule (i.e. is heteromeric) then the work is more challenging that research of a single protein.

There is another case. A project might target a homodimer, or more generally a homo-oligomer. For a homomeric protein structure, the PDB does not record the submitter's judgement of whether the contacts between the molecules are complexation of biological relevance, or merely crystal contacts. So we cannot distinguish targeted work on homo-complexes from other work on single species. We therefore report simply the proportion of projects that aim at one rather than many macromolecular species.



In [22]:

    
import collections

# returns all values, in increasing order of frequency
def get_values(field):
    response = solr.search(**{
        "rows" : 0, "fl" : "deposition_year", 
        "q" : 'status:REL',
        "facet" : "true", "facet.limit" : 10000000, "facet.mincount" : 1,
        "facet.field" : field,
        "group" : "true", "group.facet" : "true",
        "group.field" : "pdb_id",
    })
    d = response.facets['facet_fields'][field]
    return  collections.OrderedDict(reversed(list(d.items())))
values = get_values('assembly_form')
values









    Out[22]:





OrderedDict([('hetero', 31116), ('homo', 110874)])



In [36]:

    
from string import Template

def get_timeline(field):
    response = solr.search(**{
            "rows" : 0, "fl" : "deposition_year", 
            "q" : 'status:REL',
            'json.facet': Template("""{
                deposition_year:{
                    type:range,start:1971,end:2018,gap:1,field:deposition_year,limit:20,
                    facet:{
                        facet1:{
                            type:terms, field: $field,
                            facet:{
                                grouped_facet_count:\"unique(pdb_id)\"
                            }
                        },
                        grouped_depositionyear_count:\"unique(pdb_id)\"
                    }
                }
            }""").substitute(field=field)
        }
    )
    return response.raw_content['facets']['deposition_year']['buckets']

timeline = get_timeline('assembly_form')

# see an example of an annual report
timeline[-4]









    Out[36]:





{'val': 2014,
 'count': 19875,
 'grouped_depositionyear_count': 9365,
 'facet1': {'buckets': [{'val': 'hetero',
    'count': 12649,
    'grouped_facet_count': 2244},
   {'val': 'homo', 'count': 7121, 'grouped_facet_count': 7121}]}}



In [38]:

    
def get_df(synonyms, timeline):
    counts0 = {'unknown':0}
    for value in synonyms:
        counts0[ synonyms[value] ] = 0
    rows = []
    for bucket in timeline:
        year = int(bucket['val'])
        counts = dict(counts0)
        if bucket['count'] > 0:
            total = bucket['grouped_depositionyear_count']
            for bbucket in bucket['facet1']['buckets']:
                synonym = synonyms[ bbucket['val'] ]
                counts[ synonym ] = counts.get(synonym, 0)+bbucket['grouped_facet_count']
                total = total - bbucket['grouped_facet_count']
            counts['unknown'] = total
        rows.append([year]+[ counts[label] for label in counts0])  

    df = pandas.DataFrame(rows, columns=['year']+ list(counts0.keys()) )
    df = df[df.year>1977]
    return df

df = get_df( {'homo':'homomeric', 'hetero':'heteromeric'}, timeline)
df.head()









    Out[38]:







  
    
      
      year
      unknown
      homomeric
      heteromeric
    
  
  
    
      7
      1978
      0
      4
      0
    
    
      8
      1979
      0
      9
      3
    
    
      9
      1980
      0
      7
      1
    
    
      10
      1981
      0
      23
      2
    
    
      11
      1982
      0
      33
      9



In [40]:

    
def plot_df(df, values):    
    df['total'] = pandas.Series([0]*len(df), index=df.index)
    for value in values:
        df['total'] = df['total'] + df[value]
    running_total = pandas.Series([0]*len(df), index=df.index)
    for value in values:
        series = df[value] / df['total']
        plt.fill_between(df.year, running_total, running_total+series, label=values[value])
        running_total = running_total + series
    plt.fill_between(df.year, running_total, [1]*len(df), color='black', label='unknown' )

    
plt.figure()
plot_df(df, {'heteromeric': 'heteromeric', 'homomeric': 'homomeric'})
plt.legend()
plt.title('Fraction of new structures that contain more than one macromolecular species')
plt.ylim([0, 1])
plt.savefig('../../images/complexes.eps', format='eps', dpi=1000)
plt.show()

New entries, by superkingdom



In [41]:

    
values = get_values('superkingdom')

timeline = get_timeline('superkingdom')


df = get_df({value:value for value in values}, timeline)

plt.figure()
plot_df(df, {value: value for value in values})
plt.legend()
plt.show()

Count Entries by cellular location



In [44]:

    
values = get_values("biological_cell_component")

d = {value : ('membrane' if 'membrane' in value else 'other') for value in values}


timeline = get_timeline("biological_cell_component")
df = get_df(d, timeline)
plt.figure()
plot_df(df, {'membrane':'membrane', 'other':'other'})
plt.legend()
plt.show()

Count entries by experiment type



In [63]:

    
values = get_values("experimental_method")
d = {}
for value in values:
    d[value] = value if values[value]>120 else 'other'

timeline = get_timeline("experimental_method")
df = get_df(d, timeline)

plt.figure()
plot_df(df, {d[value]:d[value] for value in d})
plt.legend()
plt.show()