google-research

Форк
0
/
search_pipeline.py 
122 строки · 4.4 Кб
1
# coding=utf-8
2
# Copyright 2024 The Google Research Authors.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15

16
"""Functions for pipeline functions for picking aptamers using trained models.
17
"""
18

19
import os
20

21
# Google internal
22
import gfile
23

24

25
# Set a seed so we can re-run the selection and get the same results.
26
# NOTE(review): nothing in the visible code reads this value; presumably it
# is consumed by modules that import it -- confirm.
RANDOM_SEED = 12152007
27

28

29
class Error(Exception):
  """Exception type raised by this module, e.g. for over-long sequences."""
31

32

33
def seq_to_array_seq(seq, array_len=60):
  """Pads a sequence with trailing T's so that it fills a microarray spot.

  The input is written 5' to 3', while the array is printed with the 3' end
  attached. Sequences shorter than array_len (60 for the standard Agilent
  microarrays) are extended with T's at the end, which raises the sequence
  up off the plate.

  Args:
    seq: String sequence to put on the microarray.
    array_len: Maximum sequence length to print on the microarray.

  Returns:
    The sequence right-padded with 'T' to exactly array_len characters, ready
    to be put in an Agilent txt file for array ordering.

  Raises:
    Error: If seq is longer than array_len.
  """
  seq_len = len(seq)
  if seq_len <= array_len:
    # ljust leaves a full-length sequence unchanged and pads shorter ones.
    return seq.ljust(array_len, 'T')
  raise Error('Sequence is too long for the array. Max of %d but %s is %d.' %
              (array_len, seq, seq_len))
53

54

55
def collapse_and_write(choice_protos,
                       output_base,
                       array_name,
                       array_prefix,
                       copies_per_seq=10):
  """Writes the unique sequences in choice_protos to an array order file.

  Duplicate sequences are collapsed: each distinct aptamer_sequence is
  written copies_per_seq times, one tab-separated row per spot holding the
  probe id and the sequence padded by seq_to_array_seq. Only this collapsed
  file is written; the protos themselves are not saved here. A summary of
  the proto, unique sequence and spot counts is printed at the end.

  Args:
    choice_protos: List of Choice protos to read aptamer_sequence from.
    output_base: String base directory to save the file in.
    array_name: String name of the file to save the sequences. The file will
      be in the right format to be uploaded to the Agilent website.
    array_prefix: The string to append to the front of every spot name,
      used to keep probe ids unique on the Agilent website.
    copies_per_seq: Integer number of spots to put on the array for each
      sequence.

  Returns:
    Set of unique sequences in the choice_protos.

  Raises:
    Error: If a sequence is longer than the default array length accepted by
      seq_to_array_seq.
  """
  count_proto = 0
  count_spot = 0

  # There will be duplicate sequences in choice_protos. Write each seq once.
  previous_seqs = set()
  with gfile.GFile(os.path.join(output_base, array_name), 'w') as f:
    for count_proto, p in enumerate(choice_protos, 1):
      seq = p.aptamer_sequence
      if seq in previous_seqs:
        continue
      previous_seqs.add(seq)
      # The padded sequence is identical for every copy, so build (and
      # length-check) it once per unique sequence instead of once per spot.
      array_seq = seq_to_array_seq(seq)
      for i in range(copies_per_seq):
        count_spot += 1
        probe_id = '%s_%s_%d_of_%d' % (array_prefix, seq, i + 1, copies_per_seq)
        # NOTE(review): encoded bytes are written to a handle opened with
        # mode 'w'; confirm this gfile implementation accepts bytes there.
        f.write(('%s\t%s\n' % (probe_id, array_seq)).encode('utf-8'))
  # Every sequence added to previous_seqs was counted exactly once, so the
  # set size is the unique-sequence count.
  print(('There are %d protos with %d unique sequences, yielding %d array '
         'spots' % (count_proto, len(previous_seqs), count_spot)))

  return previous_seqs
97

98

99
def write_oligos(sequences, output_base, oligo_pool_name, fwd_primer,
                 rev_primer):
  """Writes primer-flanked sequences to a file for ordering an oligo pool.

  One line is written per element of sequences, in iteration order; this
  function performs no de-duplication of its own. The number of lines
  written is printed afterwards.

  Args:
    sequences: An iterable collection of string sequences.
    output_base: String base directory to save the file in.
    oligo_pool_name: String name of the file to save the sequences to send
      to CustomArray to order an oligo pool.
    fwd_primer: String to be put in front of each sequence in the oligo pool
      file.
    rev_primer: String to be put after each sequence in the oligo pool file.
      The line written for an aptamer sequence 'seq' is
      '%s%s%s' % (fwd_primer, seq, rev_primer).
  """
  pool_path = os.path.join(output_base, oligo_pool_name)
  # Stays 0 when sequences is empty; otherwise ends as the 1-based index of
  # the last sequence written.
  num_written = 0
  with gfile.GFile(pool_path, 'w') as pool_file:
    # CustomArray only needs a plain list of sequences, primers included.
    for num_written, aptamer in enumerate(sequences, 1):
      oligo_line = '%s%s%s\n' % (fwd_primer, aptamer, rev_primer)
      pool_file.write(oligo_line.encode('utf-8'))
  print('Wrote %d oligos to a txt file.' % num_written)
123

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.