Skip to content

Instantly share code, notes, and snippets.

@pebbie
Created October 9, 2019 14:59
Show Gist options
  • Save pebbie/a3c9795a20510f2765c3d707eaff4e5d to your computer and use it in GitHub Desktop.
Save pebbie/a3c9795a20510f2765c3d707eaff4e5d to your computer and use it in GitHub Desktop.

Revisions

  1. pebbie created this gist Oct 9, 2019.
    66 changes: 66 additions & 0 deletions rdf2pandas.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,66 @@
    """
    file: rdf2pandas.py
    auth: Peb Ruswono Aryan
    desc: import data from an RDF Data Cube (assumed to follow a particular shape) held in an rdflib Graph into a pandas DataFrame
    """
    from rdflib import Graph, Namespace, RDF, RDFS
    import pandas as pd

    # RDF Data Cube vocabulary namespace (qb:DataSet, qb:structure, qb:component, ...).
    QB = Namespace('http://purl.org/linked-data/cube#')
    # Dublin Core terms namespace; only DCT.title is used in this file, as a label fallback.
    DCT = Namespace('http://purl.org/dc/terms/')

    def short_name(uristr: str) -> str:
        """Return the local name of a URI: the text after its last '#' or '/'.

        Trailing '#' and '/' characters are ignored, so a URI such as
        ``http://example.org/dim/`` yields ``dim`` instead of an empty string.
        A string containing neither separator is returned whole; a string made
        up only of separators (or an empty string) is returned unchanged.
        """
        trimmed = uristr.rstrip('#/')
        if not trimmed:
            return uristr
        # str.rfind gives -1 when the separator is absent, so the slice below
        # starts at 0 and keeps the whole string in that case.
        cut = max(trimmed.rfind('#'), trimmed.rfind('/'))
        return trimmed[cut + 1:]

    def from_graph(g: Graph) -> pd.DataFrame:
        """
        Import a DataFrame from an rdflib.Graph.

        expects shape :
        ?ds RDF.type QB.DataSet
        ?ds QB.structure ?dsd
        ?dsd QB.component [QB.dimension ?dim] || ?dsd QB.component [QB.measure ?mea]
        ?obs QB.dataSet ?ds
        ?obs ?dim ?dimval || ?obs ?mea ?meaval

        Only the first dataset returned by the graph, and the first structure
        attached to it, are read. Columns are all dimensions followed by all
        measures. Each column is named after the component's rdfs:label, else
        its dct:title, else the local name of its URI; the name is always a
        plain ``str``. Every observation of the dataset becomes one row whose
        cells are whatever ``g.value(obs, component)`` returns (``None`` when
        the observation has no value for that component).

        Returns an empty DataFrame when the graph holds no dataset, or the
        dataset has no structure.
        """
        ds = next(iter(g.subjects(RDF.type, QB.DataSet)), None)
        if ds is None:
            return pd.DataFrame([], columns=[])

        dsd = next(iter(g.objects(ds, QB.structure)), None)
        if dsd is None:
            return pd.DataFrame([], columns=[])

        dims = []
        meas = []
        for comp in g.objects(dsd, QB.component):
            dim = g.value(comp, QB.dimension)
            mea = g.value(comp, QB.measure)
            # A component declaring both is treated as a dimension.
            if dim is not None:
                dims.append(dim)
            elif mea is not None:
                meas.append(mea)
        comps = dims + meas

        columns = []
        for comp in comps:
            # Prefer a label annotation from the graph, falling back to the
            # last segment of the component URI.
            for pred in (RDFS.label, DCT.title):
                lbl = g.value(comp, pred)
                if lbl is not None:
                    break
            else:
                lbl = short_name(str(comp))
            # g.value hands back an rdflib term; cast so every column name is
            # a plain str, the same type as the short_name fallback.
            columns.append(str(lbl))

        data = [
            [g.value(obs, comp) for comp in comps]
            for obs in g.subjects(QB.dataSet, ds)
        ]

        return pd.DataFrame(data, columns=columns)