google-research

soft_attribute.py
208 строк · 6.8 Кб
Перенос по словам
1
# coding=utf-8
2
# Copyright 2024 The Google Research Authors.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15

16
"""Library to extract pairwise preferences from soft attribute data collection.
17

18
This library takes as input three-way bucketed data about soft attributes (with
19
items classified as about the same, more or less relative to an anchor item, see
20
https://github.com/google-research-datasets/soft-attributes). It processes this
21
data into more useful forms, using the
22
SoftAttributeJudgment class as the basic representation.
23
"""
24

25
import collections
26
import csv
27
import re
28
from typing import FrozenSet, Mapping, MutableMapping, Set, Text
29

30
import attr
31
from tensorflow.io import gfile
32

33

34
# NOTE(review): eq is left at the attrs default, so instances compare and hash
# by field values; identical ratings therefore collapse to a single entry when
# stored in a set. Pass eq=False to attr.s if every rating must stay distinct.
@attr.s(frozen=True, auto_attribs=True)
class SoftAttributeJudgment:
  """One rater's three-way bucketing of items for a single soft attribute.

  Mirrors the data collection described at
  https://github.com/google-research-datasets/soft-attributes.

  Each judgment consists of:
    - An attribute name
    - A rater id
    - A reference item name
    - Item names placed into each of the three buckets (less/same/more).
  """

  # Name of the soft attribute being judged.
  attribute: Text
  # Identifier of the rater who made this judgment.
  rater_id: int
  # Name of the item the bucketed items were compared against.
  reference_item: Text
  # Item names placed in the "less" bucket.
  less_items: FrozenSet[Text]
  # Item names placed in the "same" bucket.
  same_items: FrozenSet[Text]
  # Item names placed in the "more" bucket.
  more_items: FrozenSet[Text]
52

53

54
@attr.s(frozen=True, auto_attribs=True)
class PairwisePreference:
  """A pairwise preference between two items for one attribute and rater.

  How to read an instance depends on preference_strength:
    0: "Item <smaller_item> is ABOUT THE SAME <attribute> as <larger_item>."
    1: "Item <smaller_item> is LESS <attribute> than <larger_item>."
    2: "Item <smaller_item> is MUCH LESS <attribute> than <larger_item>."
  """

  # Name of the soft attribute the preference is about.
  attribute: Text
  # Identifier of the rater whose judgment produced this preference.
  rater_id: int
  # Item on the "less" side of the comparison.
  smaller_item: Text
  # Item on the "more" side of the comparison.
  larger_item: Text
  # 0, 1 or 2; see the class docstring for the interpretation.
  preference_strength: int
71

72

73
def load_judgments(
    filename: Text,
) -> Mapping[Text, FrozenSet["SoftAttributeJudgment"]]:
  """Loads ratings from filename, returns a dictionary of judgments.

  The file is read as CSV with a header row; the columns soft_attribute,
  rater_id, reference_title, less_than, about_as and more_than are used.

  Args:
    filename: The name of CSV file with user ratings. It is opened via gfile.

  Returns:
    A dictionary from attribute name to all judgments for that attribute.

  Raises:
    KeyError: If one of the required columns is missing from the file.
    ValueError: If a rater_id value cannot be parsed as an integer.
  """
  # Compiled before the helper below is defined, so the helper never depends on
  # a name that only gets bound later in the function.
  quoted_item_regex = re.compile('"([^"]+)"')

  def items_str_to_sets(s: Text) -> FrozenSet[Text]:
    """Parses the text of one bucket cell into a frozenset of item names."""
    # This is an awkward format sometimes in the data file:
    #  ["item,with,commas","item2","item3"]
    if s.startswith("[") and s.endswith("]"):
      return frozenset(quoted_item_regex.findall(s))
    # Plain comma-separated list. filter(None, ...) drops the empty strings
    # produced by an empty cell or by stray commas.
    return frozenset(filter(None, s.split(sep=",")))

  judgments: MutableMapping[Text, Set[SoftAttributeJudgment]] = (
      collections.defaultdict(set)
  )
  with gfile.GFile(filename, "r") as csv_file:
    reader = csv.DictReader(csv_file, delimiter=",", quotechar='"')
    for row in reader:
      attribute = row["soft_attribute"]
      judgment = SoftAttributeJudgment(
          attribute=attribute,
          rater_id=int(row["rater_id"]),
          reference_item=row["reference_title"],
          less_items=items_str_to_sets(row["less_than"]),
          same_items=items_str_to_sets(row["about_as"]),
          more_items=items_str_to_sets(row["more_than"]),
      )
      judgments[attribute].add(judgment)

  # Freeze the per-attribute sets. The loop variable is not called `attr` so
  # that it does not shadow the imported attr module.
  return {
      attribute_name: frozenset(attribute_judgments)
      for attribute_name, attribute_judgments in judgments.items()
  }
114

115

116
def convert_to_pairwise_preferences(
    judgment: "SoftAttributeJudgment",
) -> Set["PairwisePreference"]:
  """Convert a single rater's judgment to a set of pairwise preferences.

  Preferences are generated between the reference item and every bucketed
  item, between items of different buckets (less vs. same, same vs. more, and
  less vs. more with strength 2), and between every two items inside the
  "same" bucket (strength 0). No preferences are generated between two items
  that are both in the "less" bucket or both in the "more" bucket.

  Args:
    judgment: A single rater's judgment.

  Returns:
    A set of pairwise preferences between individual items.

  Raises:
    ValueError: If an item would be compared with itself, e.g. the reference
      item is listed in a bucket or one item appears in two buckets.
  """
  reference = judgment.reference_item

  def make_pref(
      smaller: Text, larger: Text, strength: int
  ) -> "PairwisePreference":
    """Builds one preference, rejecting self-comparisons."""
    if smaller == larger:
      raise ValueError("An item cannot have a preference relative to itself.")
    return PairwisePreference(
        attribute=judgment.attribute,
        rater_id=judgment.rater_id,
        smaller_item=smaller,
        larger_item=larger,
        preference_strength=strength,
    )

  def make_pref_set(
      smaller_set: FrozenSet[Text],
      larger_set: FrozenSet[Text],
      strength: int,
  ) -> Set["PairwisePreference"]:
    """Returns prefs for all items in smaller_set vs all items in larger_set."""
    return {
        make_pref(smaller, larger, strength)
        for smaller in smaller_set
        for larger in larger_set
    }

  reference_only = frozenset({reference})
  preferences: Set[PairwisePreference] = set()

  # Strict preferences: adjacent buckets differ by strength 1, the two extreme
  # buckets (less vs. more) by strength 2.
  preferences.update(make_pref_set(judgment.less_items, reference_only, 1))
  preferences.update(make_pref_set(reference_only, judgment.more_items, 1))
  preferences.update(make_pref_set(judgment.less_items, judgment.same_items, 1))
  preferences.update(make_pref_set(judgment.same_items, judgment.more_items, 1))
  preferences.update(make_pref_set(judgment.less_items, judgment.more_items, 2))

  # Equality, only within the "same items", using lexical order for items to
  # ensure we don't duplicate preferences. Also, we assume that all items in the
  # same-as set are transitively same. This might not be the case if the
  # reference item is in the middle -- and items at the extreme of "about the
  # same" are not about the same as each other.
  same_sorted = sorted(judgment.same_items)
  for index, item in enumerate(same_sorted):
    if item < reference:
      preferences.add(make_pref(item, reference, 0))
    else:
      preferences.add(make_pref(reference, item, 0))

    # Items after `item` in sorted order are strictly larger, so every
    # unordered pair is emitted exactly once with the smaller name first.
    for other in same_sorted[index + 1:]:
      preferences.add(make_pref(item, other, 0))

  return preferences
185

186

187
# Patterns for titles that end their name part with ", <Article>". The \b after
# the article makes sure a whole word is matched, so "Run, Angel, Run" is not
# mistaken for a title with a trailing "A"/"An".
_TRAILING_THE_REGEX = re.compile(r"^([A-Za-z0-9:\.,& ']+), (The)\b(.*)")
_TRAILING_ARTICLE_REGEX = re.compile(
    r"^([A-Za-z0-9:\.,& ']+), (The|An|A)\b(.*)"
)


def make_title_presentable(title: Text, convert_a: bool = True) -> Text:
  """Makes titles more presentable, cleaning "X, The (year)" and "X, A (year)".

  This converts "X, The" to "The X" and (optionally) "X, A"/"X, An" to "A X"/"An
  X". It is necessary when data files come from different sources, with
  different formats. Anything that follows the article (such as " (year)") is
  kept after the reordered name. Titles that do not match the pattern are
  returned unchanged.

  Args:
    title: Title to convert.
    convert_a: If set, also convert "X, A"/"X, An" to "A X"/"An X".

  Returns:
    Presentable format of title.
  """
  regex = _TRAILING_ARTICLE_REGEX if convert_a else _TRAILING_THE_REGEX
  match = regex.search(title)
  if match is None:
    return title
  name, article, remainder = match.groups()
  return f"{article} {name}{remainder}"
209
google-research

Использование cookies