google-research
136 строк · 4.2 Кб
1# coding=utf-8
2# Copyright 2024 The Google Research Authors.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16"""Utilities to manipulate and represent ground-truth alignments as strings."""
17
18import re19from typing import List, Tuple20
21
22def alignment_from_gapped_sequences(23gapped_sequence_x,24gapped_sequence_y,25):26"""Extracts positions of match states in gapped sequences `x` and `y`."""27matches = []28ptr_x, ptr_y = 1, 1 # One-based indexing.29for c_x, c_y in zip(gapped_sequence_x, gapped_sequence_y):30if c_x.isupper() and c_y.isupper():31matches.append((ptr_x, ptr_y))32ptr_x += c_x.isalpha()33ptr_y += c_y.isalpha()34
35# Retrieves (one-based) starting position of the alignment.36ali_start_x = matches[0][0] if matches else 037ali_start_y = matches[0][1] if matches else 038# Normalizes the indices of matching positions relative to the start of the39# alignment.40matches = [(x - ali_start_x, y - ali_start_y) for x, y in matches]41
42return matches, (ali_start_x, ali_start_y)43
44
45def states_from_matches(46matches,47bos = 'S',48):49"""Generates (uncompressed) alignment `states` string from `matches`."""50states = []51for match, next_match in zip(matches[:-1], matches[1:]):52states.append('M')53num_gap_in_x = next_match[1] - match[1] - 154while num_gap_in_x:55states.append('X')56num_gap_in_x -= 157num_gap_in_y = next_match[0] - match[0] - 158while num_gap_in_y:59states.append('Y')60num_gap_in_y -= 161if matches: # Adds last match of alignment, if non-empty.62states.append('M')63return ''.join([bos] + states)64
65
66def compress_states(states):67"""Run-length encoding compression of alignment `states` string."""68chunks = []69last_state = states[0]70num_occurrences = 171for state in states[1:]:72if state != last_state:73chunks.append(f'{num_occurrences}{last_state}')74last_state = state75num_occurrences = 076num_occurrences += 177# Adds chunk corresponding to last stretch of identical chars.78return ''.join(chunks + [f'{num_occurrences}{last_state}'])79
80
81def pid_from_matches(82sequence_x,83sequence_y,84matches,85ali_start_x = 1,86ali_start_y = 1,87):88"""Computes the PID of sequences `x` and `y` from their alignment."""89num_identical = 090alignment_length = 091for match, next_match in zip(matches[:-1], matches[1:]):92c_x = sequence_x[ali_start_x + match[0] - 1]93c_y = sequence_y[ali_start_y + match[1] - 1]94num_identical += c_x == c_y95alignment_length += 196alignment_length += next_match[0] - match[0] - 197alignment_length += next_match[1] - match[1] - 198if matches: # Processes last match of alignment, if non-empty.99c_x = sequence_x[ali_start_x + matches[-1][0] - 1]100c_y = sequence_y[ali_start_y + matches[-1][1] - 1]101num_identical += c_x == c_y102alignment_length += 1103return num_identical / max(1, alignment_length)104
105
106def gapped_sequence_from_cigar(107sequence,108cigar,109seq_start = 1,110gap = '.',111is_reference = True,112):113"""Adds gaps to `sequence` according to input `cigar` string."""114insert = 'D' if is_reference else 'I'115delete = 'I' if is_reference else 'D'116
117start_pos = seq_start - 1 # One-based indexing.118chunk_len = None119
120output_chunks = []121for field in re.split(r'([DIM])', cigar):122if field.isnumeric():123chunk_len = int(field)124elif field: # skips empty strings.125chunk_len = chunk_len or 1126if field == delete:127output_chunks.append(chunk_len * gap)128elif field in ('M', insert):129end_pos = start_pos + chunk_len130output_chunks.append(sequence[start_pos:end_pos])131start_pos = end_pos132else:133raise ValueError(f'CIGAR string {cigar} is invalid.')134chunk_len = None135
136return ''.join(output_chunks)137