google-research
208 строк · 6.8 Кб
1# coding=utf-8
2# Copyright 2024 The Google Research Authors.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
"""Library to extract pairwise preferences from soft attribute data collection.

This library takes as input three-way bucketed data about soft attributes (with
items classified as about the same, more or less relative to an anchor item, see
https://github.com/google-research-datasets/soft-attributes). It processes this
data into more useful forms, using the SoftAttributeJudgment class as the basic
representation.
"""
24
25import collections26import csv27import re28from typing import FrozenSet, Mapping, MutableMapping, Set, Text29
30import attr31from tensorflow.io import gfile32
33
# NOTE(review): this comment used to read "eq=False since equality by value is
# meaningless (each entry is one rating)", but the decorator below does not pass
# eq=False. With attrs' documented defaults (eq=True) a frozen class presumably
# compares and hashes by value, so two identical ratings would be equal --
# confirm which behavior is intended before relying on either.
@attr.s(auto_attribs=True, frozen=True)
class SoftAttributeJudgment:
    """Class to represent one judgment made on a soft attribute.

    See https://github.com/google-research-datasets/soft-attributes.

    Each judgment consists of:
    - An attribute name
    - A rater id
    - A reference item name
    - Item names placed into each of the three buckets (less/same/more).
    """

    # Name of the soft attribute being judged.
    attribute: Text
    # Identifier of the rater who produced this judgment.
    rater_id: int
    # Name of the item the bucketed items are placed relative to.
    reference_item: Text
    # Item names placed in the "less" bucket.
    less_items: FrozenSet[Text]
    # Item names placed in the "same" bucket.
    same_items: FrozenSet[Text]
    # Item names placed in the "more" bucket.
    more_items: FrozenSet[Text]
53
@attr.s(auto_attribs=True, frozen=True)
class PairwisePreference:
    """Class representing a pairwise preference between two items.

    If preference_strength is 0, it should be interpreted as:
    "Item <smaller_item> is ABOUT THE SAME <attribute> as <larger_item>."
    If preference_strength is 1, it should be interpreted as:
    "Item <smaller_item> is LESS <attribute> than <larger_item>."
    If preference_strength is 2, it should be interpreted as:
    "Item <smaller_item> is MUCH LESS <attribute> than <larger_item>."
    """

    # Name of the soft attribute the preference is about.
    attribute: Text
    # Identifier of the rater whose judgment produced this preference.
    rater_id: int
    # Item that has less of the attribute (for strength 0: one of the two items
    # judged about the same).
    smaller_item: Text
    # Item that has more of the attribute (for strength 0: the other item).
    larger_item: Text
    # 0 (about the same), 1 (less) or 2 (much less); see the class docstring.
    preference_strength: int
72
# Matches each double-quoted entry of a bracketed list such as
# ["item,with,commas","item2","item3"]; the capture group excludes the quotes.
_QUOTED_ITEM_REGEX = re.compile('"([^"]+)"')


def _items_str_to_set(items_str):
    """Parses one bucket cell of the ratings CSV into a set of item names."""
    # csv.DictReader yields None for cells missing from a short row; treat
    # that (and the empty string) as an empty bucket instead of crashing.
    if not items_str:
        return frozenset()
    # This is an awkward format sometimes in the data file:
    # ["item,with,commas","item2","item3"]
    if items_str.startswith("[") and items_str.endswith("]"):
        return frozenset(_QUOTED_ITEM_REGEX.findall(items_str))
    # Plain comma-separated list; drop empty entries (e.g. trailing commas).
    return frozenset(filter(None, items_str.split(",")))


def load_judgments(
    filename: Text,
) -> Mapping[Text, FrozenSet["SoftAttributeJudgment"]]:
    """Loads ratings from filename, returns a dictionary of judgments.

    Args:
        filename: The name of CSV file with user ratings. Its header must
            provide the columns soft_attribute, rater_id, reference_title,
            less_than, about_as and more_than.

    Returns:
        A dictionary from attribute name to all judgments for that attribute.

    Raises:
        KeyError: If one of the expected columns is absent from the file.
        ValueError: If a rater_id cell is not an integer literal.
    """
    judgments: MutableMapping[Text, Set[SoftAttributeJudgment]] = (
        collections.defaultdict(set)
    )
    with gfile.GFile(filename, "r") as csv_file:
        reader = csv.DictReader(csv_file, delimiter=",", quotechar='"')
        for row in reader:
            attribute = row["soft_attribute"]
            judgments[attribute].add(
                SoftAttributeJudgment(
                    attribute=attribute,
                    rater_id=int(row["rater_id"]),
                    reference_item=row["reference_title"],
                    less_items=_items_str_to_set(row["less_than"]),
                    same_items=_items_str_to_set(row["about_as"]),
                    more_items=_items_str_to_set(row["more_than"]),
                )
            )
    # Freeze the per-attribute sets. The loop variable is deliberately not
    # called `attr`, which would shadow the imported attr module.
    return {
        attribute: frozenset(attribute_judgments)
        for attribute, attribute_judgments in judgments.items()
    }
115
def convert_to_pairwise_preferences(
    judgment,
):
    """Expands one rater's bucketed judgment into item-vs-item preferences.

    Preferences are generated between the reference item and each of the three
    buckets, and between items that sit in different buckets. Items in the
    "same" bucket are additionally paired with each other.

    Args:
        judgment: A single rater's judgment.

    Returns:
        A set of pairwise preferences between individual items.

    Raises:
        ValueError: If a generated pair would compare an item with itself.
    """
    reference = judgment.reference_item
    reference_only = frozenset({reference})

    def build(smaller, larger, strength):
        """Creates one preference carrying this judgment's attribute/rater."""
        if smaller == larger:
            raise ValueError(
                "An item cannot have a preference relative to itself."
            )
        return PairwisePreference(
            attribute=judgment.attribute,
            rater_id=judgment.rater_id,
            smaller_item=smaller,
            larger_item=larger,
            preference_strength=strength,
        )

    # (smaller bucket, larger bucket, strength): adjacent buckets differ by 1,
    # the two extreme buckets by 2.
    bucket_pairs = (
        (judgment.less_items, reference_only, 1),
        (reference_only, judgment.more_items, 1),
        (judgment.less_items, judgment.same_items, 1),
        (judgment.same_items, judgment.more_items, 1),
        (judgment.less_items, judgment.more_items, 2),
    )

    result = set()
    for smaller_items, larger_items, strength in bucket_pairs:
        for smaller in smaller_items:
            for larger in larger_items:
                result.add(build(smaller, larger, strength))

    # Equality, only within the "same items", using lexical order for items so
    # that each unordered pair is emitted once. All items in the same-as set
    # are assumed to be transitively the same, which might not hold if the
    # reference item is in the middle and items at the extremes of "about the
    # same" are not about the same as each other.
    for item in judgment.same_items:
        if item < reference:
            first, second = item, reference
        else:
            first, second = reference, item
        result.add(build(first, second, 0))

        for other in judgment.same_items:
            if item < other:
                result.add(build(item, other, 0))

    return result
186
# Both patterns capture (title body), (trailing article), (rest of the string).
# The (?=\s|$) lookahead requires the article to be a complete word, so that
# e.g. "Love, Actually" is not mistaken for "Love" followed by the article "A".
_TRAILING_THE_REGEX = re.compile(r"^([A-Za-z0-9:\.,& ']+), (The)(?=\s|$)(.*)")
_TRAILING_ARTICLE_REGEX = re.compile(
    r"^([A-Za-z0-9:\.,& ']+), (The|An|A)(?=\s|$)(.*)"
)


def make_title_presentable(title: Text, convert_a: bool = True) -> Text:
    """Makes titles more presentable, cleaning "X, The (year)" and "X, A (year)".

    This converts "X, The" to "The X" and (optionally) "X, A"/"X, An" to "A X"/"An
    X". It is necessary when data files come from different sources, with
    different formats.

    Only titles whose text before the article consists of ASCII letters, digits,
    spaces and the characters : . , & ' are rewritten; anything else is returned
    unchanged. When several candidate articles are present, the last one is
    moved to the front.

    Args:
        title: Title to convert.
        convert_a: If set, also convert "X, A"/"X, An" to "A X"/"An X".

    Returns:
        Presentable format of title.
    """
    regex = _TRAILING_ARTICLE_REGEX if convert_a else _TRAILING_THE_REGEX
    match = regex.search(title)
    if match is None:
        return title
    body, article, remainder = match.groups()
    return f"{article} {body}{remainder}"