google-research
122 строки · 4.4 Кб
1# coding=utf-8
2# Copyright 2024 The Google Research Authors.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
"""Pipeline helper functions for picking aptamers using trained models.
"""
18
19import os20
21# Google internal
22import gfile23
24
# Fixed seed so that the selection can be re-run and produce the same results.
# NOTE(review): not referenced by the functions visible in this file;
# presumably consumed by callers -- confirm.
RANDOM_SEED = 12152007
28
class Error(Exception):
  """Base exception raised by the helpers in this module."""
32
def seq_to_array_seq(seq, array_len=60):
  """Pads a sequence with trailing T's so it fills a microarray spot.

  Sequences are given 5' to 3' but are printed with the 3' end attached to
  the array. A sequence shorter than the maximum printable length (60 for
  the standard Agilent microarrays) is extended to that length with T's,
  which raises the sequence up off the plate.

  Args:
    seq: String sequence to put on the microarray.
    array_len: Maximum sequence length to print on the microarray.

  Raises:
    Error: If seq is longer than array_len.

  Returns:
    String of exactly array_len characters (seq followed by T padding),
    ready to be put in an Agilent txt file for array ordering.
  """
  seq_len = len(seq)
  if seq_len > array_len:
    raise Error('Sequence is too long for the array. Max of %d but %s is %d.' %
                (array_len, seq, seq_len))

  # Spacer of T's appended after the sequence to reach the full array length.
  t_spacer = 'T' * (array_len - seq_len)
  return '%s%s' % (seq, t_spacer)
54
def collapse_and_write(choice_protos,
                       output_base,
                       array_name,
                       array_prefix,
                       copies_per_seq=10):
  """Writes the unique sequences of choice_protos to an array order file.

  Each distinct aptamer sequence is written copies_per_seq times, one spot
  per line, as '<probe_id>\t<padded sequence>'. Repeated sequences in
  choice_protos are written only once. A summary of the counts is printed.

  Args:
    choice_protos: List of Choice protos to save. Only the aptamer_sequence
      field of each proto is read.
    output_base: String base directory to save the files in.
    array_name: String name of the file to save the sequences. The file will
      be in the right format to be uploaded to the Agilent website.
    array_prefix: The string to put at the front of every spot name, used to
      keep probe ids unique on the Agilent website.
    copies_per_seq: Integer number of spots to put on the array for each
      sequence.

  Returns:
    Set of unique sequences in the choice_protos.
  """
  array_path = os.path.join(output_base, array_name)

  # Sequences already written; also the return value.
  unique_seqs = set()
  num_protos = 0
  num_spots = 0

  # NOTE(review): the file is opened in 'w' mode but is handed UTF-8 encoded
  # bytes; confirm that gfile accepts bytes in this mode.
  with gfile.GFile(array_path, 'w') as array_file:
    for proto in choice_protos:
      num_protos += 1
      seq = proto.aptamer_sequence
      if seq not in unique_seqs:
        unique_seqs.add(seq)
        # Probe ids are numbered 1..copies_per_seq for each sequence.
        for copy_number in range(1, copies_per_seq + 1):
          probe_id = '%s_%s_%d_of_%d' % (array_prefix, seq, copy_number,
                                         copies_per_seq)
          spot_line = '%s\t%s\n' % (probe_id, seq_to_array_seq(seq))
          array_file.write(spot_line.encode('utf-8'))
          num_spots += 1

  print(('There are %d protos with %d unique sequences, yielding %d array '
         'spots' % (num_protos, len(unique_seqs), num_spots)))

  return unique_seqs
98
def write_oligos(sequences, output_base, oligo_pool_name, fwd_primer,
                 rev_primer):
  """Writes primer-flanked sequences to a file for ordering an oligo pool.

  One line is written per element of sequences, in iteration order, and the
  number of lines written is printed afterwards.

  Args:
    sequences: An iterable collection of string sequences.
    output_base: String base directory to save the files in.
    oligo_pool_name: String name of the file to save the sequences to send
      to CustomArray to order an oligo pool. These sequences do not have
      duplicates printed and have forward and reverse primers.
    fwd_primer: String to be added to the front of the sequence for the
      oligo pool sequences.
    rev_primer: String to be added to the back of the sequence for the
      oligo pool sequences. Final oligo pool sequence for an aptamer sequence
      'seq' will be '%s%s%s' % (fwd_primer, seq, rev_primer).
  """
  pool_path = os.path.join(output_base, oligo_pool_name)

  # For CustomArray, the file is just a list of sequences with primers.
  # NOTE(review): the file is opened in 'w' mode but is handed UTF-8 encoded
  # bytes; confirm that gfile accepts bytes in this mode.
  num_oligos = 0
  with gfile.GFile(pool_path, 'w') as pool_file:
    for num_oligos, seq in enumerate(sequences, 1):
      oligo_line = '%s%s%s\n' % (fwd_primer, seq, rev_primer)
      pool_file.write(oligo_line.encode('utf-8'))
  print('Wrote %d oligos to a txt file.' % (num_oligos))