google-research
223 lines · 7.4 KB
1# coding=utf-8
2# Copyright 2024 The Google Research Authors.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16"""Smith-Waterman functions for protein alignment in NumPy."""
17
18from typing import Optional19
20import numpy as np21from scipy import special22
23
24def _alignment_matrices(len_1, len_2, i = 0, j = 0,25curr = "M", prev = "S",26alignment = None):27"""Helper function for alignment_matrices."""28if alignment is None:29alignment = np.zeros((len_1, len_2, 9))30
31# M=match, X=gap_x, Y=gap_y32lookup = {33("S", "M"): 0,34("M", "M"): 1,35("X", "M"): 2,36("Y", "M"): 3,37("M", "X"): 4,38("X", "X"): 5,39("M", "Y"): 6,40("X", "Y"): 7,41("Y", "Y"): 8,42}43
44alignment[i, j, lookup[(prev, curr)]] = 145if curr == "M":46yield alignment47
48if i < len_1 - 1:49# We can go down.50yield from _alignment_matrices(len_1, len_2, i=i+1, j=j, curr="Y",51prev=curr, alignment=alignment.copy())52
53if i < len_1 - 1 and j < len_2 - 1:54# We can go in diagonal.55yield from _alignment_matrices(len_1, len_2, i=i+1, j=j+1, curr="M",56prev=curr, alignment=alignment.copy())57if j < len_2 - 1 and curr != "Y":58# We can go right.59yield from _alignment_matrices(len_1, len_2, i=i, j=j+1,60curr="X", prev=curr,61alignment=alignment.copy())62
63
def alignment_matrices(len_1, len_2):
  """Generator of all alignment matrices of shape (len_1, len_2, 9).

  Args:
    len_1: length of first sequence.
    len_2: length of second sequence.

  Yields:
    All alignment matrices of shape (len_1, len_2, 9)
  """
  # A local alignment may begin at any cell, so enumerate every start.
  for start_i in range(len_1):
    for start_j in range(len_2):
      yield from _alignment_matrices(len_1, len_2, i=start_i, j=start_j)
79
80def _make_op(temperature=1.0):81"""Make softmax + softargmax operator."""82def op(*args):83if len(args) == 1: # op(arr)84arr = np.array(args[0])85else: # lse_op(ele1, ele2, ...)86arr = np.array(args)87if temperature > 0:88return (temperature * special.logsumexp(arr / temperature),89special.softmax(arr / temperature))90else:91ret = np.zeros_like(arr)92ret[np.argmax(arr)] = 193return np.max(arr), ret94return op95
96
97def _soft_sw_affine(sim_mat,98gap_open,99gap_extend,100temperature = 1.0,101ret_grads = False):102"""Computes soft Smith-Waterman with affine gaps.103
104Args:
105sim_mat: a np.ndarray<float>[len1, len2] the substitution
106values for pairs of sequences.
107gap_open: float penalty in the substitution values for opening a gap.
108gap_extend: float of penalty in the substitution values for extending a gap.
109temperature: float controlling the regularization strength.
110ret_grads: whether to return gradients or not.
111
112Returns:
113value if ret_grads is False
114value, g_sim_mat, g_gap_open, g_gap_extend if ret_grads is True
115
116value = float of softmax values
117g_sim_mat = np.ndarray<float>[len_1, len_2]
118g_gap_open = float
119g_gap_extend = float
120"""
121len_1, len_2 = sim_mat.shape122
123match = np.zeros((len_1 + 1, len_2 + 1))124match_p = np.zeros((len_1 + 2, len_2 + 2, 4))125
126gap_x = np.zeros((len_1 + 1, len_2 + 1))127gap_x_p = np.zeros((len_1 + 2, len_2 + 2, 2))128
129gap_y = np.zeros((len_1 + 1, len_2 + 1))130gap_y_p = np.zeros((len_1 + 2, len_2 + 2, 3))131
132float_max = np.finfo(np.float32).max133
134if temperature > 0:135for mat in (match, gap_x, gap_y):136mat[0, :] = mat[:, 0] = -float_max137
138op = _make_op(temperature=temperature)139
140for i in range(1, len_1 + 1):141for j in range(1, len_2 + 1):142match[i, j], match_p[i, j] = op(0, match[i-1, j-1],143gap_x[i-1, j-1], gap_y[i-1, j-1])144match[i, j] += sim_mat[i-1, j-1]145gap_x[i, j], gap_x_p[i, j] = op(match[i, j-1] - gap_open,146gap_x[i, j-1] - gap_extend)147gap_y[i, j], gap_y_p[i, j] = op(match[i-1, j] - gap_open,148gap_x[i-1, j] - gap_open,149gap_y[i-1, j] - gap_extend)150
151value, probas = op(match.ravel())152probas = probas.reshape(match.shape)153
154if not ret_grads:155return value156
157match_e = np.zeros((len_1 + 2, len_2 + 2))158gap_x_e = np.zeros((len_1 + 2, len_2 + 2))159gap_y_e = np.zeros((len_1 + 2, len_2 + 2))160
161for j in reversed(range(1, len_2 + 1)):162for i in reversed(range(1, len_1 + 1)):163gap_y_e[i, j] = (match_e[i+1, j+1] * match_p[i+1, j+1, 3] +164gap_y_e[i+1, j] * gap_y_p[i+1, j, 2])165
166gap_x_e[i, j] = (match_e[i+1, j+1] * match_p[i+1, j+1, 2] +167gap_x_e[i, j+1] * gap_x_p[i, j+1, 1] +168gap_y_e[i+1, j] * gap_y_p[i+1, j, 1])169
170match_e[i, j] = (match_e[i+1, j+1] * match_p[i+1, j+1, 1] +171gap_x_e[i, j+1] * gap_x_p[i, j+1, 0] +172gap_y_e[i+1, j] * gap_y_p[i+1, j, 0] +173probas[i, j])174
175g_sim_mat = np.zeros_like(sim_mat)176g_gap_open = np.zeros_like(sim_mat)177g_gap_extend = np.zeros_like(sim_mat)178for i in range(1, len_1 + 1):179for j in range(1, len_2 + 1):180g_sim_mat[i-1, j-1] = match_e[i, j]181g_gap_open[i-1, j-1] = (gap_x_e[i, j+1] * (-gap_x_p[i, j+1, 0]) +182gap_y_e[i+1, j] * (-gap_y_p[i+1, j, 0] -183gap_y_p[i+1, j, 1]))184g_gap_extend[i-1, j-1] = (gap_x_e[i, j+1] * (-gap_x_p[i, j+1, 1]) +185gap_y_e[i+1, j] * (-gap_y_p[i+1, j, 2]))186
187return value, g_sim_mat, np.sum(g_gap_open), np.sum(g_gap_extend)188
189
def soft_sw_affine(sim_mat,
                   gap_open,
                   gap_extend,
                   temperature = 1.0,
                   ret_grads = False):
  """Computes soft Smith-Waterman with affine gaps.

  Args:
    sim_mat: a np.ndarray<float>[batch, len1, len2] the substitution
      values for pairs of sequences.
    gap_open: a np.ndarray<float>[batch] of penalty in the substitution values
      for opening a gap.
    gap_extend: a np.ndarray<float>[batch] of penalty in the substitution
      values for extending a gap.
    temperature: float controlling the regularization strength.
    ret_grads: whether to return gradients or not.

  Returns:
    values if ret_grads is False
    values, g_sim_mat, g_gap_open, g_gap_extend if ret_grads is True

    values = np.ndarray<float>[batch] of softmax values
    g_sim_mat = np.ndarray<float>[batch, len_1, len_2]
    g_gap_open = np.ndarray<float>[batch]
    g_gap_extend = np.ndarray<float>[batch]
  """
  # TODO(mblondel): avoid naive for loop.
  per_example = [
      _soft_sw_affine(sim_mat[k], gap_open[k], gap_extend[k],
                      temperature=temperature, ret_grads=ret_grads)
      for k in range(sim_mat.shape[0])
  ]
  if not ret_grads:
    return np.array(per_example)
  # Transpose [(v, gs, go, ge), ...] into (values, g_sims, g_gos, g_ges).
  return tuple(np.array(group) for group in zip(*per_example))