# google-research (185 lines · 5.3 KB)
# coding=utf-8
# Copyright 2024 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
15
16"""This file holds constants we reuse across the train and eval scripts."""
17
18# The order of these bases is crucial in converting to/from the one hot array.
19# The order here must match the order for 'bases' in custom_ops.cc.
20ORDERED_BASES = 'ATGC'
21
22# Configuration for standard filtering of aptamer reads.
23EXPECTED_APTAMER_LENGTH = 40
24MIN_BASE_QUALITY = 20
25MAX_BAD_BASES = 5
26MIN_AVG_READ_QUALITY = 30.0
27MAX_PAIR_DISSIMILARITY = 5
28
29tr_report_name = 'tr_report.json'
30eval_report_name = 'eval_report.json'
31
32wetlab_experiment_train_name = 'wetlab_experiment_train.pbtxt'
33wetlab_experiment_val_name = 'wetlab_experiment_val.pbtxt'
34
35hparams_name = 'hparams.pbtxt'
36
37val_fold = 0
38num_folds = 5
39
40# TensorFlow experiment-related constants
41experiment_training_dir = 'eval-training'
42experiment_validation_dir = 'eval-validation'
43experiment_report_name = 'report.nc'
44experiment_best_report_name = 'best_model.nc'
45
46
def get_wetlab_experiment_train_pbtxt_path(fold, template=''):
  """Returns the basename of the training proto.

  Generalized to accept an optional `template`, for consistency with
  get_wetlab_experiment_val_pbtxt_path, get_example_sstable_path and
  get_hdf5_path. The default value '' produces exactly the original
  'experiment_fold_<fold>_train.pbtxt' name, so existing callers are
  unaffected.

  Args:
    fold: The integer fold of interest.
    template: An optional string to change the name of the proto used.

  Returns:
    The basename of the training proto.

  Raises:
    ValueError: The requested fold is invalid.
  """
  if not 0 <= fold < num_folds:
    raise ValueError('Invalid fold: %i' % fold)
  return 'experiment_%sfold_%i_train.pbtxt' % (template, fold)
62
63
def get_wetlab_experiment_val_pbtxt_path(fold, template=''):
  """Returns the basename of the validation proto.

  Args:
    fold: The integer fold of interest.
    template: An optional string to change the name of the proto used.

  Returns:
    The basename of the validation proto.

  Raises:
    ValueError: The requested fold is invalid.
  """
  # Guard against folds outside the configured cross-validation range.
  if fold < 0 or fold >= num_folds:
    raise ValueError('Invalid fold: %i' % fold)
  basename_pattern = 'experiment_%sfold_%i_test.pbtxt'
  return basename_pattern % (template, fold)
80
81
def get_example_sstable_path(fold, template=''):
  """Returns the basename of the SSTable fold.

  Args:
    fold: The integer fold of interest.
    template: An optional string to change the name of the SSTable used.

  Returns:
    The basename of the SSTable.

  Raises:
    ValueError: The requested fold is invalid.
  """
  fold_is_valid = 0 <= fold < num_folds
  if not fold_is_valid:
    raise ValueError('Invalid fold: %i' % fold)
  return 'examples_%sfold_%i.sstable' % (template, fold)
98
99
def get_hdf5_path(fold, template=''):
  """Returns the basename of the HDF5 representation of the fold data.

  Args:
    fold: The integer fold of interest.
    template: An optional string to change the name of the HDF5 file used.

  Returns:
    The basename of the HDF5 file.

  Raises:
    ValueError: The requested fold is invalid.
  """
  # Reject folds outside [0, num_folds) before building the name.
  if not (0 <= fold and fold < num_folds):
    raise ValueError('Invalid fold: %i' % fold)
  return 'table_%sfold_%i.h5' % (template, fold)
116
117
# Precomputed per-fold artifact basenames, indexed by fold number.
wetlab_experiment_train_pbtxt_path = [
    get_wetlab_experiment_train_pbtxt_path(fold) for fold in range(num_folds)
]

wetlab_experiment_val_pbtxt_path = [
    get_wetlab_experiment_val_pbtxt_path(fold) for fold in range(num_folds)
]

example_sstable_paths = [
    get_example_sstable_path(fold) for fold in range(num_folds)
]

hdf5_paths = [get_hdf5_path(fold) for fold in range(num_folds)]
129
# Directories where commonly-used input data reside.
# NOTE(review): the 'xxx' values are anonymized placeholders. _BASEDIR is
# concatenated with no '/' separator — confirm the real base dir ends in '/'.
_BASEDIR = 'xxx'
INPUT_DATA_DIRS = {
    'xxx':
        _BASEDIR + 'xxx/paired/low_quality/folds',
    'aptitude':
        _BASEDIR + 'aptitude/r=3/fastq/processed4/folds',
}
138
# Target counts used to compute affinity in the fully observed model for
# each dataset (see predict_affinity in FullyObserved output_layers).
# These maps aren't used in training but are used in inference when an affinity
# is desired.
#
# For each dataset, we define a dictionary where each key is a selection
# affinity molecule (e.g. the protein used in selection) and each value is a
# tuple of predicted target output names to use when calculating affinity.
#
# NOTE(review): the first dataset entry below repeats the anonymized key 'xxx'
# three times; Python keeps only the last duplicate, so two entries are
# silently dropped. Presumably the original keys were distinct — restore them.
DEFAULT_AFFINITY_TARGET_MAPS = {
    'xxx': {
        'xxx': ['round5_murine'],
        'xxx': ['round5'],
        'xxx': ['round5_igg']
    },
    'aptitude': {
        'target': [
            'round2_high_no_serum_positive', 'round2_medium_no_serum_positive'
        ],
        'serum': [
            'round2_high_with_serum_positive',
            'round2_medium_with_serum_positive'
        ]
    },
    'aptitude_binned': {
        'target': [
            'low_3bins',
            'med_3bins',
            'high_3bins',
        ],
    },
    'aptitude_super_binned': {
        'target': ['super_bin',],
    },
    'aptitudecluster': {
        'target': [
            'round2_high_no_serum_positive', 'round2_medium_no_serum_positive'
        ],
        'serum': [
            'round2_high_with_serum_positive',
            'round2_medium_with_serum_positive'
        ]
    },
    'for_testing': {
        'proteinA': ['round2_A_count'],
        'proteinB': ['round1_B_count', 'round2_B_count']
    },
}
186