google-research

Форк
0
/
language_utils.py 
195 строк · 5.1 Кб
1
# coding=utf-8
2
# Copyright 2024 The Google Research Authors.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15

16
"""Utilities for language."""
17
# pylint: disable=not-an-iterable
18

19
from __future__ import absolute_import
20
from __future__ import division
21
from __future__ import print_function
22

23
import random
24
import re
25

26
import numpy as np
27

28

29
# Each table maps a phrase to the candidate phrases that may replace it.
# An empty-string candidate means the phrase may simply be removed.

# Spatial relations.
_RELATION_SYNONYMS = {
    'on the left side': ['left', 'on the left'],
    'on the right side': ['right', 'on the right'],
    'in front of': ['front of'],
}

# Material words.
_MATERIAL_SYNONYMS = {
    'matte': ['rubber', ''],
    'rubber': ['matte', ''],
    'shiny': ['metallic', ''],
    'metallic': ['shiny', ''],
}

# Object nouns, singular forms first and plural forms after.
_OBJECT_SYNONYMS = {
    'object': ['sphere', 'object', 'thing'],
    'sphere': ['object', 'ball', 'thing'],
    'ball': ['sphere', 'object', 'thing'],
    'objects': ['spheres', 'objects', 'things'],
    'spheres': ['objects', 'balls', 'things'],
    'balls': ['spheres', 'objects', 'things'],
}

_ADJECTIVE_SYNONYMS = {
    'any': [''],
}

_MISC_SYNONYMS = {
    'are': ['is'],
}

# Default collection of tables sampled from when paraphrasing.
_CLEVR_SYNONYM_TABLES = [
    _RELATION_SYNONYMS,
    _MATERIAL_SYNONYMS,
    _OBJECT_SYNONYMS,
    _ADJECTIVE_SYNONYMS,
    _MISC_SYNONYMS,
]
55

56
# One single-entry table per color; the only replacement candidate is the
# empty string, so substituting from these tables removes the color word.
_COLORS = [
    {color: ['']} for color in ('red', 'blue', 'cyan', 'purple', 'green')
]
73

74
# For each color, the colors that may be substituted for it.
_OTHER_COLORS = dict(
    red=['blue', 'cyan', 'purple', 'green'],
    blue=['red', 'cyan', 'purple', 'green'],
    cyan=['blue', 'red', 'purple', 'green'],
    purple=['blue', 'cyan', 'red', 'green'],
    green=['blue', 'cyan', 'purple', 'red'],
)
81

82
# For each direction word, the direction words that may be substituted for it.
_OTHER_DIRECTIONS = dict(
    left=['right'],
    right=['left'],
    front=['behind'],
    behind=['front'],
)
88

89

90
def get_vocab_path(cfg):
  """Return the vocabulary path stored on the config.

  Args:
    cfg: configuration object exposing a `vocab_path` attribute

  Returns:
    the value of `cfg.vocab_path`
  """
  return cfg.vocab_path
96

97

98
def instruction_type(instruction):
  """Classify an instruction by its length.

  Args:
    instruction: the instruction; anything supporting `len`

  Returns:
    'unary' when the length is below 40, otherwise 'regular'
  """
  return 'regular' if len(instruction) >= 40 else 'unary'
103

104

105
def pad_to_max_length(data, max_l=None, eos_token=0):
  """Right-pad every sequence in a list to a common length.

  Args:
    data: list of sequences to pad
    max_l: target length; when falsy (None or 0) the length of the longest
      sequence in `data` is used
    eos_token: value appended to sequences shorter than the target length

  Returns:
    numpy array built from the padded sequences; sequences already longer
    than the target length are copied as they are (neither padded nor
    truncated)
  """
  if not max_l:
    lengths = [len(seq) for seq in data]
    max_l = max(lengths) if lengths else -1
  # A non-positive multiplier yields an empty list, so sequences at or above
  # the target length come through unchanged.
  padded = [list(seq) + [eos_token] * (max_l - len(seq)) for seq in data]
  return np.array(padded)
120

121

122
def pad_sequence(data, max_l=None, eos_token=0):
  """Pad a sequence to max_l with eos_token.

  Args:
    data: sequence to pad
    max_l: target length; when None the sequence is returned without padding
      (previously the default raised a TypeError on the length comparison)
    eos_token: value appended until the sequence reaches `max_l`

  Returns:
    numpy array holding `data` followed by the padding values

  Raises:
    ValueError: if `data` is longer than `max_l`
  """
  if max_l is None:
    # No target length requested: the sequence's own length is the target.
    return np.array(data)
  n_missing = max_l - len(data)
  if n_missing < 0:
    raise ValueError('data longer than max_l')
  if n_missing == 0:
    return np.array(data)
  return np.array(list(data) + [eos_token] * n_missing)
132

133

134
def paraphrase_sentence(text, synonym_tables=None, delete_color=False, k=2):
  """Paraphrase a sentence by swapping phrases for random synonyms.

  Phrases are matched as literal substrings of `text`; every occurrence of a
  matched phrase is replaced by one randomly chosen candidate.

  Args:
    text: text to be paraphrased
    synonym_tables: list of dicts, each mapping a phrase to a list of
      replacement candidates; defaults to the CLEVR synonym tables
    delete_color: if True, with probability 0.5 the synonym tables are
      skipped and the first matching color (in random order) is replaced
      using the color tables instead
    k: number of synonym tables to sample; capped at the number of tables
      available

  Returns:
    paraphrased text
  """
  if not synonym_tables:
    synonym_tables = _CLEVR_SYNONYM_TABLES
  # Cap k so that a short custom table list does not make random.sample raise.
  # The sample is drawn before the coin toss below so the sequence of random
  # draws is the same as it has always been.
  tables = random.sample(synonym_tables, min(k, len(synonym_tables)))
  if delete_color and random.uniform(0, 1) < 0.5:
    # Visit the color tables in random order and stop after the first table
    # that produced a substitution.
    for table in random.sample(_COLORS, len(_COLORS)):
      matched = False
      for phrase in table:
        if phrase in text:
          # str.replace keeps the substitution literal, consistent with the
          # literal `in` test; a regex would misread metacharacters.
          text = text.replace(phrase, random.choice(table[phrase]))
          matched = True
      if matched:
        break
  else:
    for table in tables:
      for phrase in table:
        if phrase in text:
          text = text.replace(phrase, random.choice(table[phrase]))
  return text
165

166

167
def negate_unary_sentence(text):
  """Negate an instruction involving a single object.

  The sentence is split on single spaces. The last word that is a known
  color and the last word that is a known direction are the mutation
  candidates. A single random toss then selects what to change: the color
  (toss below 0.33), the direction (toss in [0.33, 0.66)), or both
  (otherwise, and also when the tossed candidate is missing but both
  candidates exist).

  Args:
    text: the instruction to negate

  Returns:
    the mutated sentence, or None when the selected mutation cannot be
    applied
  """
  words = text.split(' ')
  candidates = {}
  for i, w in enumerate(words):
    if w in _OTHER_COLORS:
      candidates['color'] = (i, w)
    elif w in _OTHER_DIRECTIONS:
      candidates['direction'] = (i, w)
  has_color = 'color' in candidates
  has_direction = 'direction' in candidates

  # One toss drives the whole decision. The middle branch used to draw a
  # second random number, which broke the intended three-way split.
  toss = random.random()
  if toss < 0.33 and has_color:
    targets = ('color',)
  elif 0.33 <= toss < 0.66 and has_direction:
    targets = ('direction',)
  elif has_color and has_direction:
    targets = ('color', 'direction')
  else:
    return None

  if 'color' in targets:
    i, color = candidates['color']
    words[i] = random.choice(_OTHER_COLORS[color])
  if 'direction' in targets:
    i, direction = candidates['direction']
    words[i] = random.choice(_OTHER_DIRECTIONS[direction])
  return ' '.join(words)
196

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.