In [20]:
import rdflib
import pandas as pd
import numpy as np

g=rdflib.Graph()

# all issued and modified are actually from dcterms and only of date
# dcterms:issued "1999-07-02"^^<http://www.w3.org/2001/XMLSchema#date> ;
# dcterms:modified "2012-06-14"^^<http://www.w3.org/2001/XMLSchema#date> ;

def reformat(predicate):
    """
    # replace http://purl.org/dc/elements/1.1/type with dc:type ...first!
    # "dc:description", "dc:source", "dc:title", "dc:type", "dc:publisher",

    # replace http://purl.org/dc/elements/1.1/ rest with dcterms:
    # replace http://purl.org/dc/terms/ terms of course with dcterms:

    # replace http://www.w3.org/1999/02/22-rdf-syntax-ns# with rdf:
    # replace http://www.w3.org/2004/02/skos/core# with skos:

    # filter only for http://vocab.smart-project.info/papawai/term/ and remove
    # http://www.w3.org/2004/02/skos/core#inCollection skos:inCollection
    """
    if "http://purl.org/dc/elements/1.1/type" in predicate:
        return "dc:type"
    elif "http://purl.org/dc/elements/1.1/description" in predicate:
        return "dc:description"
    elif "http://purl.org/dc/elements/1.1/source" in predicate:
        return "dc:source"
    elif "http://purl.org/dc/elements/1.1/title" in predicate:
        return "dc:title"
    elif "http://purl.org/dc/elements/1.1/publisher" in predicate:
        return "dc:publisher"
    elif "http://purl.org/dc/elements/1.1/" in predicate:
        return predicate.replace("http://purl.org/dc/elements/1.1/", "dcterms:")
    elif "http://purl.org/dc/terms/" in predicate:
        return predicate.replace("http://purl.org/dc/terms/", "dcterms:")
    elif "http://www.w3.org/1999/02/22-rdf-syntax-ns#" in predicate:
        return predicate.replace("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "rdf:")
    elif "http://www.w3.org/2004/02/skos/core#" in predicate:
        return predicate.replace("http://www.w3.org/2004/02/skos/core#", "skos:")

def generate(graph, col_id, hier):
    for s,p,o in graph:
        term_short = "http://vocab.smart-project.info/{}/{}/".format(col_id, hier)
        if term_short in s:
            newPredicate = reformat(p)
            if "skos:inCollection" not in newPredicate and "rdf:type" not in newPredicate:
                term_id = s.replace(term_short, "")
                # print(term_id,newPredicate,o)
                yield {'term': term_id, 'fields': newPredicate, 'data': o}

# g.parse('http://vocab.smart-project.info/spq-papawai/data', format="n3")
# collection = "papawai"
# hierarchy = "term"

# g.parse('http://vocab.smart-project.info/spq-ngmp/data', format="n3")
# collection = "ngmp"
# hierarchy = "phenomenon"

# g.parse('http://vocab.smart-project.info/spq-glossary/data', format="n3")
# collection = "glossary"
# hierarchy = "term"

g.parse('http://vocab.smart-project.info/spq-awahou/data', format="n3")
collection = "awahou"
hierarchy = "term"

rows = list(generate(g, collection, hierarchy))
df = pd.DataFrame(rows)
df.head(20)


Out[20]:
data fields term
0 GNS SR 2016/13 dcterms:bibliographicCitation 30
1 2016 dcterms:available 50
2 AL dc:publisher 47
3 Tephra skos:label 73
4 2016 dcterms:available 5
5 The portion of stream flow that is not runoff ... dc:description 7
6 AL dc:publisher 56
7 Vapour phase alteration dc:title 64
8 An accumulation of groundwater that is above t... dc:description 55
9 Perched skos:prefLabel 55
10 Water located below the ground surface, e.g., ... skos:definition 25
11 Material that is eroded or is reworked and dep... dc:description 2
12 Taupo Volcanic Zone (TVZ) skos:prefLabel 70
13 Tritium (3H) skos:prefLabel 61
14 MAV skos:label 41
15 Total dissolved solids (TDS) skos:prefLabel 72
16 The sum of all phosphorus compounds (reactive,... skos:definition 76
17 Ignimbrite skos:prefLabel 27
18 The most probable outcome based on a set of fa... dc:description 10
19 AL dc:publisher 54

In [21]:
df['fields'].unique()


Out[21]:
array(['dcterms:bibliographicCitation', 'dcterms:available',
       'dc:publisher', 'skos:label', 'dc:description', 'dc:title',
       'skos:prefLabel', 'skos:definition', 'dcterms:modified'],
      dtype=object)

In [22]:
from rdflib import Literal


max_field_list = []
    
def new_field_name(row):
    dat = row['data']
    field = row['fields']
    if isinstance(dat, Literal):
            if dat.language is not None:
                field = field + "@" + dat.language
    max_field_list.append(field)
    return field

df['fields'] = df.apply(new_field_name, axis=1)

df.head()


Out[22]:
data fields term
0 GNS SR 2016/13 dcterms:bibliographicCitation 30
1 2016 dcterms:available 50
2 AL dc:publisher 47
3 Tephra skos:label 73
4 2016 dcterms:available 5

In [23]:
grouped = df.groupby('term')

# elem = grouped.get_group('12')
# len(elem['fields'].tolist())
# display(elem)

df_list = []
uniq_set = set(max_field_list)

for name, group in grouped:
    term_id = name
    dc_fields = group['fields'].tolist()
    dc_set = set(dc_fields)
    missing_cols = uniq_set.difference(dc_set)
    data_list = group['data'].tolist()
    data_tuple = [(x) for x in data_list]
    # display(data_tuple)
    this_df_data = [data_tuple] 
    this_df = pd.DataFrame.from_records(this_df_data, columns=dc_fields)
    this_df['term_id'] = int(name)
    for label in missing_cols:
        this_df[label] = np.nan
    this_df.reindex(index=this_df['term_id'])
    df_list.append(this_df)
    # with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    #     display(this_df.head())

display(len(df_list))

cols = list(uniq_set)
cols.append('term_id')
start_df = pd.DataFrame.from_records([], columns=cols)
display('start shape: ' + str(start_df.shape))
display(start_df)

for gdf in df_list:
    start_df = pd.concat([start_df, gdf], join='outer', axis=0, ignore_index=True)
    # display(gdf.head())
    # start_df.append(gdf)

# full_df = pd.concat(df_list, axis=1 join='inner')
full_df = start_df.copy()

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(full_df.head(50))


82
'start shape: (0, 10)'
dcterms:bibliographicCitation dcterms:available dc:publisher dcterms:modified dc:description skos:prefLabel@en dc:title skos:label skos:definition@en term_id
dc:description dc:publisher dc:title dcterms:available dcterms:bibliographicCitation dcterms:modified skos:definition@en skos:label skos:prefLabel@en term_id
0 Material that is eroded or is reworked and dep... AL Alluvial sediment 2016 GNS SR 2016/13 2016-09-05T05:08:26.478Z Material that is eroded or is reworked and dep... Alluvial sediment Alluvial sediment 1
1 The most probable outcome based on a set of fa... AL Best-estimate 2016 GNS SR 2016/13 2016-09-05T05:08:26.658Z The most probable outcome based on a set of fa... Best-estimate Best-estimate 10
2 An area of land where surface water from rain,... AL Catchment 2016 GNS SR 2016/13 2016-09-05T05:08:26.659Z An area of land where surface water from rain,... Catchment Catchment 11
3 A rock composed of broken fragments of mineral... AL Breccia 2016 GNS SR 2016/13 2016-09-05T05:08:26.663Z A rock composed of broken fragments of mineral... Breccia Breccia 12
4 Organic compounds that contains carbon, chlori... AL Chlorofluorocarbons (CFCs) 2016 GNS SR 2016/13 2016-09-05T05:08:26.664Z Organic compounds that contains carbon, chlori... Chlorofluorocarbons (CFCs) Chlorofluorocarbons (CFCs) 13
5 A large crater at the top of a volcano formed ... AL Caldera Basin 2016 GNS SR 2016/13 2016-09-05T05:08:26.664Z A large crater at the top of a volcano formed ... Caldera Basin Caldera Basin 14
6 The lowest quantity of a substance that can be... AL Detection limit 2016 GNS SR 2016/13 2016-09-05T05:08:26.670Z The lowest quantity of a substance that can be... Detection limit Detection limit 15
7 An area of land where surface water from rain,... AL Catchment 2016 GNS SR 2016/13 2016-09-05T05:08:26.671Z An area of land where surface water from rain,... Catchment Catchment 16
8 Oxygen that is dissolved in water. It is an im... AL Dissolved Oxygen (DO) 2016 GNS SR 2016/13 2016-09-05T05:08:26.673Z Oxygen that is dissolved in water. It is an im... Dissolved Oxygen (DO) Dissolved Oxygen (DO) 17
9 Organic compounds that contains carbon, chlori... AL Chlorofluorocarbons (CFCs) 2016 GNS SR 2016/13 2016-09-05T05:08:26.674Z Organic compounds that contains carbon, chlori... Chlorofluorocarbons (CFCs) Chlorofluorocarbons (CFCs) 18
10 A measure of the dissolved (soluble) phosphoru... AL Dissolved Reactive Phosphorus (DRP) 2016 GNS SR 2016/13 2016-09-05T05:08:26.677Z A measure of the dissolved (soluble) phosphoru... Dissolved Reactive Phosphorus (DRP) Dissolved Reactive Phosphorus (DRP) 19
11 Material that is eroded or is reworked and dep... AL Alluvial sediment 2016 GNS SR 2016/13 2016-09-05T05:08:26.487Z Material that is eroded or is reworked and dep... Alluvial sediment Alluvial sediment 2
12 The lowest quantity of a substance that can be... AL Detection limit 2016 GNS SR 2016/13 2016-09-05T05:08:26.677Z The lowest quantity of a substance that can be... Detection limit Detection limit 20
13 Water located below the ground surface, e.g., ... AL Groundwater 2016 GNS SR 2016/13 2016-09-05T05:08:26.691Z Water located below the ground surface, e.g., ... Groundwater Groundwater 21
14 A part of a rock body that can be differentiat... v Facies 2016 GNS SR 2016/13 2016-09-05T05:08:26.697Z A part of a rock body that can be differentiat... Facies Facies 22
15 A method used to estimate the discharge (volum... AL Gauging 2016 GNS SR 2016/13 2016-09-05T05:08:26.705Z A method used to estimate the discharge (volum... Gauging Gauging 23
16 A hydrologic process where water moves downwar... AL Groundwater recharge 2016 GNS SR 2016/13 2016-09-05T05:08:26.706Z A hydrologic process where water moves downwar... Groundwater recharge Groundwater recharge 24
17 Water located below the ground surface, e.g., ... AL Groundwater 2016 GNS SR 2016/13 2016-09-05T05:08:26.707Z Water located below the ground surface, e.g., ... Groundwater Groundwater 25
18 A hydrologic process where water moves downwar... AL Groundwater recharge 2016 GNS SR 2016/13 2016-09-05T05:08:26.707Z A hydrologic process where water moves downwar... Groundwater recharge Groundwater recharge 26
19 A pumice-dominated pyroclastic flow deposit fo... AL Ignimbrite 2016 GNS SR 2016/13 2016-09-05T05:08:26.708Z A pumice-dominated pyroclastic flow deposit fo... Ignimbrite Ignimbrite 27
20 A pumice-dominated pyroclastic flow deposit fo... AL Ignimbrite 2016 GNS SR 2016/13 2016-09-05T05:08:26.709Z A pumice-dominated pyroclastic flow deposit fo... Ignimbrite Ignimbrite 28
21 Sediments deposited in lakes. AL Lacustrine sediments 2016 GNS SR 2016/13 2016-09-05T05:08:26.711Z Sediments deposited in lakes. Lacustrine sediments Lacustrine sediments 29
22 A measure for the amount of ammonia, a polluta... AL Ammoniacal-nitrogen (NH3-N) 2016 GNS SR 2016/13 2016-09-05T05:08:26.601Z A measure for the amount of ammonia, a polluta... Ammoniacal-nitrogen (NH3-N) Ammoniacal-nitrogen (NH3-N) 3
23 Median Absolute Deviation, used in statistical... AL MAD 2016 GNS SR 2016/13 2016-09-05T05:08:26.713Z Median Absolute Deviation, used in statistical... MAD MAD 30
24 Sediments deposited in lakes. AL Lacustrine sediments 2016 GNS SR 2016/13 2016-09-05T05:08:26.714Z Sediments deposited in lakes. Lacustrine sediments Lacustrine sediments 31
25 Median Absolute Deviation, used in statistical... AL MAD 2016 GNS SR 2016/13 2016-09-05T05:08:26.715Z Median Absolute Deviation, used in statistical... MAD MAD 32
26 Oxygen that is dissolved in water. It is an im... AL Dissolved Oxygen (DO) 2016 GNS SR 2016/13 2016-09-05T05:08:26.683Z Oxygen that is dissolved in water. It is an im... Dissolved Oxygen (DO) Dissolved Oxygen (DO) 33
27 A measure of a material's ability to conduct e... v Electrical conductivity (EC) 2016 GNS SR 2016/13 2016-09-05T05:08:26.684Z A measure of a material's ability to conduct e... Electrical conductivity (EC) Electrical conductivity (EC) 34
28 The sum of evaporation and plant transpiration... AL Evapotranspiration 2016 GNS SR 2016/13 2016-09-05T05:08:26.687Z The sum of evaporation and plant transpiration... Evapotranspiration Evapotranspiration 35
29 A measure of the dissolved (soluble) phosphoru... AL Dissolved Reactive Phosphorus (DRP) 2016 GNS SR 2016/13 2016-09-05T05:08:26.688Z A measure of the dissolved (soluble) phosphoru... Dissolved Reactive Phosphorus (DRP) Dissolved Reactive Phosphorus (DRP) 36
30 A part of a rock body that can be differentiat... v Facies 2016 GNS SR 2016/13 2016-09-05T05:08:26.689Z A part of a rock body that can be differentiat... Facies Facies 37
31 A measure of a material's ability to conduct e... v Electrical conductivity (EC) 2016 GNS SR 2016/13 2016-09-05T05:08:26.690Z A measure of a material's ability to conduct e... Electrical conductivity (EC) Electrical conductivity (EC) 38
32 A method used to estimate the discharge (volum... AL Gauging 2016 GNS SR 2016/13 2016-09-05T05:08:26.690Z A method used to estimate the discharge (volum... Gauging Gauging 39
33 A measure for the amount of ammonia, a polluta... AL Ammoniacal-nitrogen (NH3-N) 2016 GNS SR 2016/13 2016-09-05T05:08:26.611Z A measure for the amount of ammonia, a polluta... Ammoniacal-nitrogen (NH3-N) Ammoniacal-nitrogen (NH3-N) 4
34 The sum of evaporation and plant transpiration... AL Evapotranspiration 2016 GNS SR 2016/13 2016-09-05T05:08:26.691Z The sum of evaporation and plant transpiration... Evapotranspiration Evapotranspiration 40
35 Maximum Acceptable Value for particular water ... AL MAV 2016 GNS SR 2016/13 2016-09-05T05:08:26.716Z Maximum Acceptable Value for particular water ... MAV MAV 41
36 Maximum Acceptable Value for particular water ... AL MAV 2016 GNS SR 2016/13 2016-09-05T05:08:26.722Z Maximum Acceptable Value for particular water ... MAV MAV 42
37 The average amount of time that a water molecu... AL Mean Residence Time (MRT) 2016 GNS SR 2016/13 2016-09-05T05:08:26.725Z The average amount of time that a water molecu... Mean Residence Time (MRT) Mean Residence Time (MRT) 43
38 The average amount of time that a water molecu... AL Mean Residence Time (MRT) 2016 GNS SR 2016/13 2016-09-05T05:08:26.725Z The average amount of time that a water molecu... Mean Residence Time (MRT) Mean Residence Time (MRT) 44
39 Name of the geological period that occurred 25... AL Mesozoic 2016 GNS SR 2016/13 2016-09-05T05:08:26.731Z Name of the geological period that occurred 25... Mesozoic Mesozoic 45
40 Name of the geological period that occurred 25... AL Mesozoic 2016 GNS SR 2016/13 2016-09-05T05:08:26.731Z Name of the geological period that occurred 25... Mesozoic Mesozoic 46
41 National Groundwater Monitoring Programme, inv... AL NGMP 2016 GNS SR 2016/13 2016-09-05T05:08:26.732Z National Groundwater Monitoring Programme, inv... NGMP NGMP 47
42 National Groundwater Monitoring Programme, inv... AL NGMP 2016 GNS SR 2016/13 2016-09-05T05:08:26.733Z National Groundwater Monitoring Programme, inv... NGMP NGMP 48
43 A nutrient in water that is associated with la... AL Nitrate-nitrogen (NO3-N) 2016 GNS SR 2016/13 2016-09-05T05:08:26.734Z A nutrient in water that is associated with la... Nitrate-nitrogen (NO3-N) Nitrate-nitrogen (NO3-N) 49
44 The portion of stream flow that is not runoff ... AL Baseflow 2016 GNS SR 2016/13 2016-09-05T05:08:26.617Z The portion of stream flow that is not runoff ... Baseflow Baseflow 5
45 Nephelometric Turbidity Units, a unit used to ... AL NTU 2016 GNS SR 2016/13 2016-09-05T05:08:26.734Z Nephelometric Turbidity Units, a unit used to ... NTU NTU 50
46 A nutrient in water that is associated with la... AL Nitrate-nitrogen (NO3-N) 2016 GNS SR 2016/13 2016-09-05T05:08:26.736Z A nutrient in water that is associated with la... Nitrate-nitrogen (NO3-N) Nitrate-nitrogen (NO3-N) 51
47 An area in the North Island, within the TVZ, t... AL Okataina Volcanic Complex (OVC) 2016 GNS SR 2016/13 2016-09-05T05:08:26.737Z An area in the North Island, within the TVZ, t... Okataina Volcanic Complex (OVC) Okataina Volcanic Complex (OVC) 52
48 Nephelometric Turbidity Units, a unit used to ... AL NTU 2016 GNS SR 2016/13 2016-09-05T05:08:26.738Z Nephelometric Turbidity Units, a unit used to ... NTU NTU 53
49 An area in the North Island, within the TVZ, t... AL Okataina Volcanic Complex (OVC) 2016 GNS SR 2016/13 2016-09-05T05:08:26.741Z An area in the North Island, within the TVZ, t... Okataina Volcanic Complex (OVC) Okataina Volcanic Complex (OVC) 54

In [24]:
fields = []
for t in cols:
    if "dc:" in t:
        ln = t.replace("dc:", "http://purl.org/dc/elements/1.1/")
        ln = ln.replace("@en", "")
        ln = ln.replace("@mi", "")
        fields.append({'element': t, 'description': ln})
    elif "dcterms:" in t:
        ln = t.replace("dcterms:", "http://purl.org/dc/terms/")
        ln = ln.replace("@en", "")
        ln = ln.replace("@mi", "")
        fields.append({'element': t, 'description': ln})
    elif "skos:" in t:
        ln = t.replace("skos:", "http://www.w3.org/2004/02/skos/core#")
        ln = ln.replace("@en", "")
        ln = ln.replace("@mi", "")
        fields.append({'element': t, 'description': ln})
        
df2 = pd.DataFrame.from_dict(fields)
df2.head()


Out[24]:
description element
0 http://purl.org/dc/terms/bibliographicCitation dcterms:bibliographicCitation
1 http://purl.org/dc/terms/available dcterms:available
2 http://purl.org/dc/elements/1.1/publisher dc:publisher
3 http://purl.org/dc/terms/modified dcterms:modified
4 http://purl.org/dc/elements/1.1/description dc:description

In [25]:
writer = pd.ExcelWriter(collection + '.xlsx')
full_df.to_excel(writer,'Terms')
df2.to_excel(writer,'TermsMeaning')
writer.save()
writer.close()

In [ ]: