google-research
118 строк · 3.4 Кб
1// Copyright 2024 The Google Research Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
// Reads DNA sequences from the keys of an sstable and writes another sstable
// with DNA sequence keys and values given by N-gram GenericFeatureVectors.
17//
18// Since the DNA alphabet has only four characters, and typical N-gram sizes are
19// 4-6, we use a simple mathematical formula to map N-grams to sparse dimensions
20// instead of hashing. We simply encode each character in the N-gram as two
21// bits:
22//
23// A = 00
24// C = 01
25// G = 10
26// T = 11
27//
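// The first character of an N-gram occupies the lowest-order two bits, the
// second character the next two bits, and so on. Then, for example, the
// four-gram TCAG would be encoded as:
//
// (Leading zeroes) 10 00 01 11
//
// Because each dimension is stored in a uint32, this allows up to 16-grams to
// be used, which should be more than sufficient.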
33#include "xxx/preprocess/scam_featurize.h"
34
35#include <vector>
36
37#include "absl/strings/string_view.h"
38#include "xxx/scam/data_format/features.proto.h"
39#include "xxx/scam/utils/gfv_normalization.h"
40
41namespace research_biology {
42namespace aptamers {
43
44namespace {
45
46std::vector<uint32> SequenceToNgrams(absl::string_view dna, size_t ngram_size) {
47std::vector<uint32> result;
48for (size_t i = 0; i + ngram_size <= dna.ssize(); ++i) {
49uint32 dim = 0;
50for (size_t j = 0; j < ngram_size; ++j) {
51const char c = dna[i + j];
52switch (c) {
53case 'A':
54case 'a':
55break; // 0
56case 'C':
57case 'c':
58dim |= (1 << (2 * j));
59break;
60case 'G':
61case 'g':
62dim |= (2 << (2 * j));
63break;
64case 'T':
65case 't':
66dim |= (3 << (2 * j));
67break;
68default:
69LOG(FATAL) << "Non-DNA Character: " << c;
70}
71}
72
73result.push_back(dim);
74}
75
76return result;
77}
78
79research_scam::GenericFeatureVector DnaToGfv(absl::string_view dna) {
80const size_t ngram_size = 6;
81research_scam::GenericFeatureVector gfv;
82std::vector<uint32> dims = SequenceToNgrams(dna, ngram_size);
83std::sort(dims.begin(), dims.end());
84gfv.set_feature_dim(std::pow(4, ngram_size));
85CHECK(!dims.empty()) << "No valid N-grams for sequence: " << dna;
86gfv.add_feature_index(dims[0]);
87gfv.add_feature_value_float(1);
88for (size_t i = 1; i < dims.size(); ++i) {
89if (dims[i] == dims[i - 1]) {
90++(*gfv.mutable_feature_value_float()->rbegin());
91} else {
92gfv.add_feature_index(dims[i]);
93gfv.add_feature_value_float(1);
94}
95}
96
97gfv.set_feature_type(research_scam::GenericFeatureVector::FLOAT);
98return research_scam::NormalizeUnitL2(gfv);
99}
100
101class FeaturizeFn : public flume::MapFn<string, FeatureVectorEntry> {
102public:
103FeatureVectorEntry Map(const string& dna) override {
104return flume::make_kv(dna, DnaToGfv(dna));
105}
106
107private:
108REGISTER_AS_STATELESS_FN(FeaturizeFn);
109};
110
111} // namespace
112
113FeatureVectorTable FeaturizeSequences(const SequenceCollection& in) {
114return in.ParDo("featurize_dna", new FeaturizeFn());
115}
116
117} // namespace aptamers
118} // namespace research_biology
119