google-research

alignment.py
136 строк · 4.2 Кб
Перенос по словам
1
# coding=utf-8
2
# Copyright 2024 The Google Research Authors.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15

16
"""Utilities to manipulate and represent ground-truth alignments as strings."""
17

18
import re
19
from typing import List, Tuple
20

21

22
def alignment_from_gapped_sequences(
23
    gapped_sequence_x,
24
    gapped_sequence_y,
25
):
26
  """Extracts positions of match states in gapped sequences `x` and `y`."""
27
  matches = []
28
  ptr_x, ptr_y = 1, 1  # One-based indexing.
29
  for c_x, c_y in zip(gapped_sequence_x, gapped_sequence_y):
30
    if c_x.isupper() and c_y.isupper():
31
      matches.append((ptr_x, ptr_y))
32
    ptr_x += c_x.isalpha()
33
    ptr_y += c_y.isalpha()
34

35
  # Retrieves (one-based) starting position of the alignment.
36
  ali_start_x = matches[0][0] if matches else 0
37
  ali_start_y = matches[0][1] if matches else 0
38
  # Normalizes the indices of matching positions relative to the start of the
39
  # alignment.
40
  matches = [(x - ali_start_x, y - ali_start_y) for x, y in matches]
41

42
  return matches, (ali_start_x, ali_start_y)
43

44

45
def states_from_matches(
46
    matches,
47
    bos = 'S',
48
):
49
  """Generates (uncompressed) alignment `states` string from `matches`."""
50
  states = []
51
  for match, next_match in zip(matches[:-1], matches[1:]):
52
    states.append('M')
53
    num_gap_in_x = next_match[1] - match[1] - 1
54
    while num_gap_in_x:
55
      states.append('X')
56
      num_gap_in_x -= 1
57
    num_gap_in_y = next_match[0] - match[0] - 1
58
    while num_gap_in_y:
59
      states.append('Y')
60
      num_gap_in_y -= 1
61
  if matches:  # Adds last match of alignment, if non-empty.
62
    states.append('M')
63
  return ''.join([bos] + states)
64

65

66
def compress_states(states):
67
  """Run-length encoding compression of alignment `states` string."""
68
  chunks = []
69
  last_state = states[0]
70
  num_occurrences = 1
71
  for state in states[1:]:
72
    if state != last_state:
73
      chunks.append(f'{num_occurrences}{last_state}')
74
      last_state = state
75
      num_occurrences = 0
76
    num_occurrences += 1
77
  # Adds chunk corresponding to last stretch of identical chars.
78
  return ''.join(chunks + [f'{num_occurrences}{last_state}'])
79

80

81
def pid_from_matches(
82
    sequence_x,
83
    sequence_y,
84
    matches,
85
    ali_start_x = 1,
86
    ali_start_y = 1,
87
):
88
  """Computes the PID of sequences `x` and `y` from their alignment."""
89
  num_identical = 0
90
  alignment_length = 0
91
  for match, next_match in zip(matches[:-1], matches[1:]):
92
    c_x = sequence_x[ali_start_x + match[0] - 1]
93
    c_y = sequence_y[ali_start_y + match[1] - 1]
94
    num_identical += c_x == c_y
95
    alignment_length += 1
96
    alignment_length += next_match[0] - match[0] - 1
97
    alignment_length += next_match[1] - match[1] - 1
98
  if matches:  # Processes last match of alignment, if non-empty.
99
    c_x = sequence_x[ali_start_x + matches[-1][0] - 1]
100
    c_y = sequence_y[ali_start_y + matches[-1][1] - 1]
101
    num_identical += c_x == c_y
102
    alignment_length += 1
103
  return num_identical / max(1, alignment_length)
104

105

106
def gapped_sequence_from_cigar(
107
    sequence,
108
    cigar,
109
    seq_start = 1,
110
    gap = '.',
111
    is_reference = True,
112
):
113
  """Adds gaps to `sequence` according to input `cigar` string."""
114
  insert = 'D' if is_reference else 'I'
115
  delete = 'I' if is_reference else 'D'
116

117
  start_pos = seq_start - 1  # One-based indexing.
118
  chunk_len = None
119

120
  output_chunks = []
121
  for field in re.split(r'([DIM])', cigar):
122
    if field.isnumeric():
123
      chunk_len = int(field)
124
    elif field:  # skips empty strings.
125
      chunk_len = chunk_len or 1
126
      if field == delete:
127
        output_chunks.append(chunk_len * gap)
128
      elif field in ('M', insert):
129
        end_pos = start_pos + chunk_len
130
        output_chunks.append(sequence[start_pos:end_pos])
131
        start_pos = end_pos
132
      else:
133
        raise ValueError(f'CIGAR string {cigar} is invalid.')
134
      chunk_len = None
135

136
  return ''.join(output_chunks)
137
google-research

Использование cookies