# -*- coding: utf-8 -*-
"""
A collection of utilities for canonicalizing and inspecting graphs.
Among other things, they solve of the problem of deterministic bnode
comparisons.
Warning: the time to canonicalize bnodes may increase exponentially on larger
graphs. Use with care!
Example of comparing two graphs::
>>> g1 = Graph().parse(format='n3', data='''
... @prefix : <http://example.org/ns#> .
... <http://example.org> :rel
... <http://example.org/same>,
... [ :label "Same" ],
... <http://example.org/a>,
... [ :label "A" ] .
... ''')
>>> g2 = Graph().parse(format='n3', data='''
... @prefix : <http://example.org/ns#> .
... <http://example.org> :rel
... <http://example.org/same>,
... [ :label "Same" ],
... <http://example.org/b>,
... [ :label "B" ] .
... ''')
>>>
>>> iso1 = to_isomorphic(g1)
>>> iso2 = to_isomorphic(g2)
These are not isomorphic::
>>> iso1 == iso2
False
Diff the two graphs::
>>> in_both, in_first, in_second = graph_diff(iso1, iso2)
Present in both::
>>> def dump_nt_sorted(g):
... for l in sorted(g.serialize(format='nt').splitlines()):
... if l: print(l.decode('ascii'))
>>> dump_nt_sorted(in_both) #doctest: +SKIP
<http://example.org>
<http://example.org/ns#rel> <http://example.org/same> .
<http://example.org>
<http://example.org/ns#rel> _:cbcaabaaba17fecbc304a64f8edee4335e .
_:cbcaabaaba17fecbc304a64f8edee4335e
<http://example.org/ns#label> "Same" .
Only in first::
>>> dump_nt_sorted(in_first) #doctest: +SKIP
<http://example.org>
<http://example.org/ns#rel> <http://example.org/a> .
<http://example.org>
<http://example.org/ns#rel> _:cb124e4c6da0579f810c0ffe4eff485bd9 .
_:cb124e4c6da0579f810c0ffe4eff485bd9
<http://example.org/ns#label> "A" .
Only in second::
>>> dump_nt_sorted(in_second) #doctest: +SKIP
<http://example.org>
<http://example.org/ns#rel> <http://example.org/b> .
<http://example.org>
<http://example.org/ns#rel> _:cb558f30e21ddfc05ca53108348338ade8 .
_:cb558f30e21ddfc05ca53108348338ade8
<http://example.org/ns#label> "B" .
"""
# TODO:
# - Doesn't handle quads.
# - Add warning and/or safety mechanism before working on large graphs?
# - use this in existing Graph.isomorphic?
__all__ = ['IsomorphicGraph', 'to_isomorphic', 'isomorphic',
'to_canonical_graph', 'graph_diff', 'similar']
from rdflib.graph import Graph, ConjunctiveGraph, ReadOnlyGraphAggregate
from rdflib.term import BNode
try:
import hashlib
md = hashlib.md5
except ImportError:
# for Python << 2.5
import md5
md = md5.new
[docs]class IsomorphicGraph(ConjunctiveGraph):
"""
Ported from
<http://www.w3.org/2001/sw/DataAccess/proto-tests/tools/rdfdiff.py>
(Sean B Palmer's RDF Graph Isomorphism Tester).
"""
[docs] def __init__(self, **kwargs):
super(IsomorphicGraph, self).__init__(**kwargs)
[docs] def __eq__(self, other):
"""Graph isomorphism testing."""
if not isinstance(other, IsomorphicGraph):
return False
elif len(self) != len(other):
return False
elif list(self) == list(other):
return True # TODO: really generally cheaper?
return self.internal_hash() == other.internal_hash()
[docs] def __ne__(self, other):
"""Negative graph isomorphism testing."""
return not self.__eq__(other)
[docs] def internal_hash(self):
"""
This is defined instead of __hash__ to avoid a circular recursion
scenario with the Memory store for rdflib which requires a hash lookup
in order to return a generator of triples.
"""
return _TripleCanonicalizer(self).to_hash()
class _TripleCanonicalizer(object):
def __init__(self, graph, hashfunc=hash):
self.graph = graph
self.hashfunc = hashfunc
def to_hash(self):
return self.hashfunc(tuple(sorted(
map(self.hashfunc, self.canonical_triples()))))
def canonical_triples(self):
for triple in self.graph:
yield tuple(self._canonicalize_bnodes(triple))
def _canonicalize_bnodes(self, triple):
for term in triple:
if isinstance(term, BNode):
yield BNode(value="cb%s" % self._canonicalize(term))
else:
yield term
def _canonicalize(self, term, done=False):
return self.hashfunc(tuple(sorted(self._vhashtriples(term, done),
key=_hetero_tuple_key)))
def _vhashtriples(self, term, done):
for triple in self.graph:
if term in triple:
yield tuple(self._vhashtriple(triple, term, done))
def _vhashtriple(self, triple, target_term, done):
for i, term in enumerate(triple):
if not isinstance(term, BNode):
yield term
elif done or (term == target_term):
yield i
else:
yield self._canonicalize(term, done=True)
def _hetero_tuple_key(x):
"Sort like Python 2 - by name of type, then by value. Expects tuples."
return tuple((type(a).__name__, a) for a in x)
[docs]def to_isomorphic(graph):
if isinstance(graph, IsomorphicGraph):
return graph
return IsomorphicGraph(store=graph.store)
[docs]def isomorphic(graph1, graph2):
"""
Compare graph for equality. Uses an algorithm to compute unique hashes
which takes bnodes into account.
Examples::
>>> g1 = Graph().parse(format='n3', data='''
... @prefix : <http://example.org/ns#> .
... <http://example.org> :rel <http://example.org/a> .
... <http://example.org> :rel <http://example.org/b> .
... <http://example.org> :rel [ :label "A bnode." ] .
... ''')
>>> g2 = Graph().parse(format='n3', data='''
... @prefix ns: <http://example.org/ns#> .
... <http://example.org> ns:rel [ ns:label "A bnode." ] .
... <http://example.org> ns:rel <http://example.org/b>,
... <http://example.org/a> .
... ''')
>>> isomorphic(g1, g2)
True
>>> g3 = Graph().parse(format='n3', data='''
... @prefix : <http://example.org/ns#> .
... <http://example.org> :rel <http://example.org/a> .
... <http://example.org> :rel <http://example.org/b> .
... <http://example.org> :rel <http://example.org/c> .
... ''')
>>> isomorphic(g1, g3)
False
"""
return _TripleCanonicalizer(graph1).to_hash() == \
_TripleCanonicalizer(graph2).to_hash()
[docs]def to_canonical_graph(g1):
"""
Creates a canonical, read-only graph where all bnode id:s are based on
deterministical MD5 checksums, correlated with the graph contents.
"""
graph = Graph()
graph += _TripleCanonicalizer(g1, _md5_hash).canonical_triples()
return ReadOnlyGraphAggregate([graph])
[docs]def graph_diff(g1, g2):
"""
Returns three sets of triples: "in both", "in first" and "in second".
"""
# bnodes have deterministic values in canonical graphs:
cg1 = to_canonical_graph(g1)
cg2 = to_canonical_graph(g2)
in_both = cg1 * cg2
in_first = cg1 - cg2
in_second = cg2 - cg1
return (in_both, in_first, in_second)
def _md5_hash(t):
h = md()
for i in t:
if isinstance(i, tuple):
h.update(_md5_hash(i).encode('ascii'))
else:
h.update(unicode(i).encode("utf8"))
return h.hexdigest()
_MOCK_BNODE = BNode()
[docs]def similar(g1, g2):
"""
Checks if the two graphs are "similar", by comparing sorted triples where
all bnodes have been replaced by a singular mock bnode (the
``_MOCK_BNODE``).
This is a much cheaper, but less reliable, alternative to the comparison
algorithm in ``isomorphic``.
"""
return all(t1 == t2 for (t1, t2) in _squashed_graphs_triples(g1, g2))
def _squashed_graphs_triples(g1, g2):
for (t1, t2) in zip(sorted(_squash_graph(g1)), sorted(_squash_graph(g2))):
yield t1, t2
def _squash_graph(graph):
return (_squash_bnodes(triple) for triple in graph)
def _squash_bnodes(triple):
return tuple((isinstance(t, BNode) and _MOCK_BNODE) or t for t in triple)