# google-research (185 lines · 5.3 KB)
# coding=utf-8
# Copyright 2024 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
15
16"""This file holds constants we reuse across the train and eval scripts."""
17
18# The order of these bases is crucial in converting to/from the one hot array.
19# The order here must match the order for 'bases' in custom_ops.cc.
20ORDERED_BASES = 'ATGC'
21
22# Configuration for standard filtering of aptamer reads.
23EXPECTED_APTAMER_LENGTH = 40
24MIN_BASE_QUALITY = 20
25MAX_BAD_BASES = 5
26MIN_AVG_READ_QUALITY = 30.0
27MAX_PAIR_DISSIMILARITY = 5
28
29tr_report_name = 'tr_report.json'
30eval_report_name = 'eval_report.json'
31
32wetlab_experiment_train_name = 'wetlab_experiment_train.pbtxt'
33wetlab_experiment_val_name = 'wetlab_experiment_val.pbtxt'
34
35hparams_name = 'hparams.pbtxt'
36
37val_fold = 0
38num_folds = 5
39
40# TensorFlow experiment-related constants
41experiment_training_dir = 'eval-training'
42experiment_validation_dir = 'eval-validation'
43experiment_report_name = 'report.nc'
44experiment_best_report_name = 'best_model.nc'
45
46
def get_wetlab_experiment_train_pbtxt_path(fold, template=''):
  """Returns the basename of the training proto.

  Generalized to accept an optional `template`, for consistency with
  get_wetlab_experiment_val_pbtxt_path, get_example_sstable_path and
  get_hdf5_path. The default value '' produces exactly the original
  'experiment_fold_<fold>_train.pbtxt' name, so existing callers are
  unaffected.

  Args:
    fold: The integer fold of interest.
    template: An optional string to change the name of the proto used.

  Returns:
    The basename of the training proto.

  Raises:
    ValueError: The requested fold is invalid.
  """
  if not 0 <= fold < num_folds:
    raise ValueError('Invalid fold: %i' % fold)
  return 'experiment_%sfold_%i_train.pbtxt' % (template, fold)
62
63
def get_wetlab_experiment_val_pbtxt_path(fold, template=''):
  """Returns the basename of the validation proto.

  Args:
    fold: The integer fold of interest.
    template: An optional string to change the name of the proto used.

  Returns:
    The basename of the validation proto.

  Raises:
    ValueError: The requested fold is invalid.
  """
  # Guard against folds outside the configured cross-validation range.
  if fold < 0 or fold >= num_folds:
    raise ValueError('Invalid fold: %i' % fold)
  basename_pattern = 'experiment_%sfold_%i_test.pbtxt'
  return basename_pattern % (template, fold)
80
81
def get_example_sstable_path(fold, template=''):
  """Returns the basename of the SSTable fold.

  Args:
    fold: The integer fold of interest.
    template: An optional string to change the name of the SSTable used.

  Returns:
    The basename of the SSTable.

  Raises:
    ValueError: The requested fold is invalid.
  """
  fold_is_valid = 0 <= fold < num_folds
  if not fold_is_valid:
    raise ValueError('Invalid fold: %i' % fold)
  return 'examples_%sfold_%i.sstable' % (template, fold)
98
99
def get_hdf5_path(fold, template=''):
  """Returns the basename of the HDF5 representation of the fold data.

  Args:
    fold: The integer fold of interest.
    template: An optional string to change the name of the HDF5 file used.

  Returns:
    The basename of the HDF5 file.

  Raises:
    ValueError: The requested fold is invalid.
  """
  # Reject folds outside [0, num_folds) before building the name.
  if not (0 <= fold and fold < num_folds):
    raise ValueError('Invalid fold: %i' % fold)
  return 'table_%sfold_%i.h5' % (template, fold)
116
117
# Precomputed per-fold artifact basenames, indexed by fold number.
wetlab_experiment_train_pbtxt_path = [
    get_wetlab_experiment_train_pbtxt_path(fold) for fold in range(num_folds)
]

wetlab_experiment_val_pbtxt_path = [
    get_wetlab_experiment_val_pbtxt_path(fold) for fold in range(num_folds)
]

example_sstable_paths = [
    get_example_sstable_path(fold) for fold in range(num_folds)
]

hdf5_paths = [get_hdf5_path(fold) for fold in range(num_folds)]
129
# Directories where commonly-used input data reside.
# NOTE(review): the 'xxx' values are anonymized placeholders. _BASEDIR is
# concatenated with no '/' separator — confirm the real base dir ends in '/'.
_BASEDIR = 'xxx'
INPUT_DATA_DIRS = {
    'xxx':
        _BASEDIR + 'xxx/paired/low_quality/folds',
    'aptitude':
        _BASEDIR + 'aptitude/r=3/fastq/processed4/folds',
}
138
# Target counts used to compute affinity in the fully observed model for
# each dataset (see predict_affinity in FullyObserved output_layers).
# These maps aren't used in training but are used in inference when an affinity
# is desired.
#
# For each dataset, we define a dictionary where each key is a selection
# affinity molecule (e.g. the protein used in selection) and each value is a
# tuple of predicted target output names to use when calculating affinity.
#
# NOTE(review): the first dataset entry below repeats the anonymized key 'xxx'
# three times; Python keeps only the last duplicate, so two entries are
# silently dropped. Presumably the original keys were distinct — restore them.
DEFAULT_AFFINITY_TARGET_MAPS = {
    'xxx': {
        'xxx': ['round5_murine'],
        'xxx': ['round5'],
        'xxx': ['round5_igg']
    },
    'aptitude': {
        'target': [
            'round2_high_no_serum_positive', 'round2_medium_no_serum_positive'
        ],
        'serum': [
            'round2_high_with_serum_positive',
            'round2_medium_with_serum_positive'
        ]
    },
    'aptitude_binned': {
        'target': [
            'low_3bins',
            'med_3bins',
            'high_3bins',
        ],
    },
    'aptitude_super_binned': {
        'target': ['super_bin',],
    },
    'aptitudecluster': {
        'target': [
            'round2_high_no_serum_positive', 'round2_medium_no_serum_positive'
        ],
        'serum': [
            'round2_high_with_serum_positive',
            'round2_medium_with_serum_positive'
        ]
    },
    'for_testing': {
        'proteinA': ['round2_A_count'],
        'proteinB': ['round1_B_count', 'round2_B_count']
    },
}
186