google-research
223 lines · 7.4 KB
1# coding=utf-8
2# Copyright 2024 The Google Research Authors.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16"""Smith-Waterman functions for protein alignment in NumPy."""
17
18from typing import Optional19
20import numpy as np21from scipy import special22
23
24def _alignment_matrices(len_1, len_2, i = 0, j = 0,25curr = "M", prev = "S",26alignment = None):27"""Helper function for alignment_matrices."""28if alignment is None:29alignment = np.zeros((len_1, len_2, 9))30
31# M=match, X=gap_x, Y=gap_y32lookup = {33("S", "M"): 0,34("M", "M"): 1,35("X", "M"): 2,36("Y", "M"): 3,37("M", "X"): 4,38("X", "X"): 5,39("M", "Y"): 6,40("X", "Y"): 7,41("Y", "Y"): 8,42}43
44alignment[i, j, lookup[(prev, curr)]] = 145if curr == "M":46yield alignment47
48if i < len_1 - 1:49# We can go down.50yield from _alignment_matrices(len_1, len_2, i=i+1, j=j, curr="Y",51prev=curr, alignment=alignment.copy())52
53if i < len_1 - 1 and j < len_2 - 1:54# We can go in diagonal.55yield from _alignment_matrices(len_1, len_2, i=i+1, j=j+1, curr="M",56prev=curr, alignment=alignment.copy())57if j < len_2 - 1 and curr != "Y":58# We can go right.59yield from _alignment_matrices(len_1, len_2, i=i, j=j+1,60curr="X", prev=curr,61alignment=alignment.copy())62
63
def alignment_matrices(len_1, len_2):
  """Generator of all alignment matrices of shape (len_1, len_2, 9).

  Args:
    len_1: length of first sequence.
    len_2: length of second sequence.

  Yields:
    All alignment matrices of shape (len_1, len_2, 9)
  """
  # A local alignment may begin at any cell, so enumerate every start.
  for start_i in range(len_1):
    for start_j in range(len_2):
      yield from _alignment_matrices(len_1, len_2, i=start_i, j=start_j)
79
80def _make_op(temperature=1.0):81"""Make softmax + softargmax operator."""82def op(*args):83if len(args) == 1: # op(arr)84arr = np.array(args[0])85else: # lse_op(ele1, ele2, ...)86arr = np.array(args)87if temperature > 0:88return (temperature * special.logsumexp(arr / temperature),89special.softmax(arr / temperature))90else:91ret = np.zeros_like(arr)92ret[np.argmax(arr)] = 193return np.max(arr), ret94return op95
96
97def _soft_sw_affine(sim_mat,98gap_open,99gap_extend,100temperature = 1.0,101ret_grads = False):102"""Computes soft Smith-Waterman with affine gaps.103
104Args:
105sim_mat: a np.ndarray<float>[len1, len2] the substitution
106values for pairs of sequences.
107gap_open: float penalty in the substitution values for opening a gap.
108gap_extend: float of penalty in the substitution values for extending a gap.
109temperature: float controlling the regularization strength.
110ret_grads: whether to return gradients or not.
111
112Returns:
113value if ret_grads is False
114value, g_sim_mat, g_gap_open, g_gap_extend if ret_grads is True
115
116value = float of softmax values
117g_sim_mat = np.ndarray<float>[len_1, len_2]
118g_gap_open = float
119g_gap_extend = float
120"""
121len_1, len_2 = sim_mat.shape122
123match = np.zeros((len_1 + 1, len_2 + 1))124match_p = np.zeros((len_1 + 2, len_2 + 2, 4))125
126gap_x = np.zeros((len_1 + 1, len_2 + 1))127gap_x_p = np.zeros((len_1 + 2, len_2 + 2, 2))128
129gap_y = np.zeros((len_1 + 1, len_2 + 1))130gap_y_p = np.zeros((len_1 + 2, len_2 + 2, 3))131
132float_max = np.finfo(np.float32).max133
134if temperature > 0:135for mat in (match, gap_x, gap_y):136mat[0, :] = mat[:, 0] = -float_max137
138op = _make_op(temperature=temperature)139
140for i in range(1, len_1 + 1):141for j in range(1, len_2 + 1):142match[i, j], match_p[i, j] = op(0, match[i-1, j-1],143gap_x[i-1, j-1], gap_y[i-1, j-1])144match[i, j] += sim_mat[i-1, j-1]145gap_x[i, j], gap_x_p[i, j] = op(match[i, j-1] - gap_open,146gap_x[i, j-1] - gap_extend)147gap_y[i, j], gap_y_p[i, j] = op(match[i-1, j] - gap_open,148gap_x[i-1, j] - gap_open,149gap_y[i-1, j] - gap_extend)150
151value, probas = op(match.ravel())152probas = probas.reshape(match.shape)153
154if not ret_grads:155return value156
157match_e = np.zeros((len_1 + 2, len_2 + 2))158gap_x_e = np.zeros((len_1 + 2, len_2 + 2))159gap_y_e = np.zeros((len_1 + 2, len_2 + 2))160
161for j in reversed(range(1, len_2 + 1)):162for i in reversed(range(1, len_1 + 1)):163gap_y_e[i, j] = (match_e[i+1, j+1] * match_p[i+1, j+1, 3] +164gap_y_e[i+1, j] * gap_y_p[i+1, j, 2])165
166gap_x_e[i, j] = (match_e[i+1, j+1] * match_p[i+1, j+1, 2] +167gap_x_e[i, j+1] * gap_x_p[i, j+1, 1] +168gap_y_e[i+1, j] * gap_y_p[i+1, j, 1])169
170match_e[i, j] = (match_e[i+1, j+1] * match_p[i+1, j+1, 1] +171gap_x_e[i, j+1] * gap_x_p[i, j+1, 0] +172gap_y_e[i+1, j] * gap_y_p[i+1, j, 0] +173probas[i, j])174
175g_sim_mat = np.zeros_like(sim_mat)176g_gap_open = np.zeros_like(sim_mat)177g_gap_extend = np.zeros_like(sim_mat)178for i in range(1, len_1 + 1):179for j in range(1, len_2 + 1):180g_sim_mat[i-1, j-1] = match_e[i, j]181g_gap_open[i-1, j-1] = (gap_x_e[i, j+1] * (-gap_x_p[i, j+1, 0]) +182gap_y_e[i+1, j] * (-gap_y_p[i+1, j, 0] -183gap_y_p[i+1, j, 1]))184g_gap_extend[i-1, j-1] = (gap_x_e[i, j+1] * (-gap_x_p[i, j+1, 1]) +185gap_y_e[i+1, j] * (-gap_y_p[i+1, j, 2]))186
187return value, g_sim_mat, np.sum(g_gap_open), np.sum(g_gap_extend)188
189
def soft_sw_affine(sim_mat,
                   gap_open,
                   gap_extend,
                   temperature = 1.0,
                   ret_grads = False):
  """Computes soft Smith-Waterman with affine gaps.

  Args:
    sim_mat: a np.ndarray<float>[batch, len1, len2] the substitution
      values for pairs of sequences.
    gap_open: a np.ndarray<float>[batch] of penalty in the substitution values
      for opening a gap.
    gap_extend: a np.ndarray<float>[batch] of penalty in the substitution
      values for extending a gap.
    temperature: float controlling the regularization strength.
    ret_grads: whether to return gradients or not.

  Returns:
    values if ret_grads is False
    values, g_sim_mat, g_gap_open, g_gap_extend if ret_grads is True

    values = np.ndarray<float>[batch] of softmax values
    g_sim_mat = np.ndarray<float>[batch, len_1, len_2]
    g_gap_open = np.ndarray<float>[batch]
    g_gap_extend = np.ndarray<float>[batch]
  """
  # TODO(mblondel): avoid naive for loop.
  per_example = [
      _soft_sw_affine(sim_mat[k], gap_open[k], gap_extend[k],
                      temperature=temperature, ret_grads=ret_grads)
      for k in range(sim_mat.shape[0])
  ]
  if not ret_grads:
    return np.array(per_example)
  # Transpose [(v, gs, go, ge), ...] into (values, g_sims, g_gos, g_ges).
  return tuple(np.array(group) for group in zip(*per_example))