Source code for abydos.distance._gotoh

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._gotoh.

Gotoh score
"""
from typing import Any, Callable, Optional, cast

from numpy import float_ as np_float
from numpy import zeros as np_zeros

from ._needleman_wunsch import NeedlemanWunsch

__all__ = ['Gotoh']


[docs] class Gotoh(NeedlemanWunsch): """Gotoh score. The Gotoh score :cite:`Gotoh:1982` is essentially Needleman-Wunsch with affine gap penalties. .. versionadded:: 0.3.6 """ def __init__( self, gap_open: float = 1, gap_ext: float = 0.4, sim_func: Optional[Callable[[str, str], float]] = None, **kwargs: Any ) -> None: """Initialize Gotoh instance. Parameters ---------- gap_open : float The cost of an open alignment gap (1 by default) gap_ext : float The cost of an alignment gap extension (0.4 by default) sim_func : function A function that returns the similarity of two characters (identity similarity by default) **kwargs Arbitrary keyword arguments .. versionadded:: 0.4.0 """ super(Gotoh, self).__init__(**kwargs) self._gap_open = gap_open self._gap_ext = gap_ext self._sim_func = cast( Callable[[str, str], float], NeedlemanWunsch.sim_matrix if sim_func is None else sim_func, ) # type: Callable[[str, str], float]
[docs] def sim_score(self, src: str, tar: str) -> float: """Return the Gotoh score of two strings. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison Returns ------- float Gotoh score Examples -------- >>> cmp = Gotoh() >>> cmp.sim_score('cat', 'hat') 2.0 >>> cmp.sim_score('Niall', 'Neil') 1.0 >>> round(cmp.sim_score('aluminum', 'Catalan'), 12) -0.4 >>> cmp.sim_score('cat', 'hat') 2.0 .. versionadded:: 0.1.0 .. versionchanged:: 0.3.6 Encapsulated in class """ d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float) p_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float) q_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float) d_mat[0, 0] = 0 p_mat[0, 0] = float('-inf') q_mat[0, 0] = float('-inf') for i in range(1, len(src) + 1): d_mat[i, 0] = float('-inf') p_mat[i, 0] = -self._gap_open - self._gap_ext * (i - 1) q_mat[i, 0] = float('-inf') if len(tar) > 1: q_mat[i, 1] = -self._gap_open for j in range(1, len(tar) + 1): d_mat[0, j] = float('-inf') p_mat[0, j] = float('-inf') if len(src) > 1: p_mat[1, j] = -self._gap_open q_mat[0, j] = -self._gap_open - self._gap_ext * (j - 1) for i in range(1, len(src) + 1): for j in range(1, len(tar) + 1): sim_val = self._sim_func(src[i - 1], tar[j - 1]) d_mat[i, j] = max( d_mat[i - 1, j - 1] + sim_val, p_mat[i - 1, j - 1] + sim_val, q_mat[i - 1, j - 1] + sim_val, ) p_mat[i, j] = max( d_mat[i - 1, j] - self._gap_open, p_mat[i - 1, j] - self._gap_ext, ) q_mat[i, j] = max( d_mat[i, j - 1] - self._gap_open, q_mat[i, j - 1] - self._gap_ext, ) i, j = (n - 1 for n in d_mat.shape) return cast(float, max(d_mat[i, j], p_mat[i, j], q_mat[i, j]))
[docs] def sim(self, src: str, tar: str) -> float: """Return the normalized Gotoh score of two strings. Parameters ---------- src : str Source string for comparison tar : str Target string for comparison Returns ------- float Normalized Gotoh score Examples -------- >>> cmp = Gotoh() >>> cmp.sim('cat', 'hat') 0.6666666666666667 >>> cmp.sim('Niall', 'Neil') 0.22360679774997896 >>> round(cmp.sim('aluminum', 'Catalan'), 12) 0.0 >>> cmp.sim('cat', 'hat') 0.6666666666666667 .. versionadded:: 0.4.1 """ if src == tar: return 1.0 return max(0.0, self.sim_score(src, tar)) / ( self.sim_score(src, src) ** 0.5 * self.sim_score(tar, tar) ** 0.5 )
if __name__ == '__main__': import doctest doctest.testmod()