google-research

scam_featurize.cc
118 строк · 3.4 Кб
Перенос по словам
1
// Copyright 2024 The Google Research Authors.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//     http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14

15
// Reads DNA sequences from the keys of an sstable and writes another sstable
16
// with DNA sequence keys and values given by N-gram GenericFeatureVectors.
17
//
18
// Since the DNA alphabet has only four characters, and typical N-gram sizes are
19
// 4-6, we use a simple mathematical formula to map N-grams to sparse dimensions
20
// instead of hashing. We simply encode each character in the N-gram as two
21
// bits:
22
//
23
// A = 00
24
// C = 01
25
// G = 10
26
// T = 11
27
//
28
// Then, for example, the four-gram TCAG would be encoded as:
29
//
30
// (Leading zeroes) 10 00 01 11, i.e. the first character is in the lowest bits.
31
//
32
// At two bits per character in a 32-bit dimension, this allows up to 16-grams
// to be used, which should be more than sufficient.
33
#include "xxx/preprocess/scam_featurize.h"
34

35
#include <vector>
36

37
#include "absl/strings/string_view.h"
38
#include "xxx/scam/data_format/features.proto.h"
39
#include "xxx/scam/utils/gfv_normalization.h"
40

41
namespace research_biology {
42
namespace aptamers {
43

44
namespace {
45

46
// Converts every window of `ngram_size` consecutive characters of `dna` into
// a sparse dimension index, in order of window start position.
//
// Each character contributes two bits (A = 00, C = 01, G = 10, T = 11, upper
// or lower case). The character at offset j inside the window occupies bits
// [2j, 2j + 1], so the first character of the N-gram is the least significant.
//
// Returns an empty vector when `dna` is shorter than `ngram_size`. Dies with
// LOG(FATAL) on any character other than A/C/G/T, or when `ngram_size` needs
// more than 32 bits (i.e. is larger than 16).
std::vector<uint32> SequenceToNgrams(absl::string_view dna, size_t ngram_size) {
  // Two bits per character have to fit into a 32-bit dimension; a larger
  // N-gram would shift past the width of uint32, which is undefined behavior.
  constexpr size_t kMaxNgramSize = 16;
  if (ngram_size > kMaxNgramSize) {
    LOG(FATAL) << "N-gram size too large for 32-bit dimensions: " << ngram_size;
  }

  std::vector<uint32> result;
  if (ngram_size > dna.size()) {
    return result;  // Not even one full window.
  }
  result.reserve(dna.size() - ngram_size + 1);

  for (size_t i = 0; i + ngram_size <= dna.size(); ++i) {
    uint32 dim = 0;
    for (size_t j = 0; j < ngram_size; ++j) {
      const char c = dna[i + j];
      uint32 code = 0;
      switch (c) {
        case 'A':
        case 'a':
          code = 0;
          break;
        case 'C':
        case 'c':
          code = 1;
          break;
        case 'G':
        case 'g':
          code = 2;
          break;
        case 'T':
        case 't':
          code = 3;
          break;
        default:
          LOG(FATAL) << "Non-DNA Character:  " << c;
      }
      // Shift in unsigned 32-bit arithmetic: shifting an int literal would
      // overflow a signed int once j reaches 15.
      dim |= code << (2 * j);
    }

    result.push_back(dim);
  }

  return result;
}
78

79
// Builds a sparse N-gram count feature vector (N = 6) for `dna`.
//
// The dimensions produced by SequenceToNgrams are sorted, and each distinct
// dimension is stored once as a feature index whose float value is the number
// of times that dimension occurs. The feature dimensionality is set to 4^N
// and the feature type to FLOAT; the vector is then passed through
// research_scam::NormalizeUnitL2 and that result is returned.
//
// CHECK-fails when `dna` yields no N-grams at all.
research_scam::GenericFeatureVector DnaToGfv(absl::string_view dna) {
  constexpr size_t kNgramSize = 6;
  // 4^N possible N-grams. Computed as an exact integer shift rather than with
  // floating-point std::pow followed by an implicit double-to-integer
  // conversion.
  constexpr size_t kFeatureDim = size_t{1} << (2 * kNgramSize);

  std::vector<uint32> dims = SequenceToNgrams(dna, kNgramSize);
  // Fail before doing any further work on an unusable sequence.
  CHECK(!dims.empty()) << "No valid N-grams for sequence: " << dna;
  // Sorting makes equal dimensions adjacent so they can be counted in one pass.
  std::sort(dims.begin(), dims.end());

  research_scam::GenericFeatureVector gfv;
  gfv.set_feature_dim(kFeatureDim);
  gfv.set_feature_type(research_scam::GenericFeatureVector::FLOAT);

  // Emit one (index, count) pair per run of equal dimensions.
  for (size_t run_begin = 0; run_begin < dims.size();) {
    size_t run_end = run_begin + 1;
    while (run_end < dims.size() && dims[run_end] == dims[run_begin]) {
      ++run_end;
    }
    gfv.add_feature_index(dims[run_begin]);
    gfv.add_feature_value_float(static_cast<float>(run_end - run_begin));
    run_begin = run_end;
  }

  return research_scam::NormalizeUnitL2(gfv);
}
100

101
// Flume map function that turns a DNA sequence string into a
// FeatureVectorEntry.
class FeaturizeFn : public flume::MapFn<string, FeatureVectorEntry> {
 public:
  // Returns the entry made by flume::make_kv from `dna` (as the first
  // argument) and DnaToGfv(dna) (as the second).
  FeatureVectorEntry Map(const string& dna) override {
    FeatureVectorEntry entry = flume::make_kv(dna, DnaToGfv(dna));
    return entry;
  }

 private:
  REGISTER_AS_STATELESS_FN(FeaturizeFn);
};
110

111
}  // namespace
112

113
// Applies FeaturizeFn to every element of `in` via a ParDo stage named
// "featurize_dna" and returns the resulting table.
FeatureVectorTable FeaturizeSequences(const SequenceCollection& in) {
  // NOTE(review): the raw pointer is handed straight to ParDo, which
  // presumably takes ownership of it -- confirm against the Flume API.
  FeaturizeFn* featurize_fn = new FeaturizeFn();
  return in.ParDo("featurize_dna", featurize_fn);
}
116

117
}  // namespace aptamers
118
}  // namespace research_biology
119
google-research

Использование cookies