# coding=utf-8
# Copyright 2024 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Functions for calculating human-simulated scores."""

import numpy as np
import pandas as pd

import tqdm

from psyborgs import survey_bench_lib


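# Fields that jointly identify a simulated participant (SPID): each unique
# prompt-continuation specification is treated as one simulated respondent
# (see `score_session` below).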
SPID = (
    'item_preamble_id',
    'item_postamble_id',
    'response_scale_id',
    'response_choice_postamble_id',
    'model_id',
)


def logsumexp(x):
  c = x.max()
  return c + np.log(np.sum(np.exp(x - c)))


def normalize_logprobs(x):
  diff = x - logsumexp(x)
  probs = np.exp(diff)
  return probs / np.sum(probs)
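

# Quick numerical sanity checks (illustrative, not part of the original
# module). The max-shift in `logsumexp` is the standard log-sum-exp
# stability trick: for x = [-1000., -1000.], a naive
# np.log(np.sum(np.exp(x))) underflows to -inf, while logsumexp(x) returns
# -1000 + log(2). `normalize_logprobs` then recovers probabilities from
# unnormalized log scores:
#
#   logsumexp(np.array([-1000.0, -1000.0]))      # ~ -999.3069
#   normalize_logprobs(np.log([0.2, 0.3, 0.5]))  # ~ array([0.2, 0.3, 0.5])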


# for generative mode


def normalize_response_logprobs(df):
  """Converts generated responses and log-prob scores to probabilities."""
  grouped = df.groupby(
      list(SPID) + ['item_id', 'model_output'], group_keys=False
  )

  # calculate the log probability sums for each unique combination of
  # item_id, model_output, and SPID
  logprobs_sum = grouped['model_output_score'].sum()

  # normalize the log probabilities for each unique combination of item_id
  # and SPID
  normalized_probs = (
      logprobs_sum.groupby(list(SPID) + ['item_id'], group_keys=False)
      .apply(normalize_logprobs)
      .values
  )

  # create a new dataframe with the normalized probabilities
  raw_response_scores_normalized_df = pd.DataFrame({
      'item_preamble_id': logprobs_sum.index.get_level_values(
          'item_preamble_id'
      ),
      'item_id': logprobs_sum.index.get_level_values('item_id'),
      'item_postamble_id': logprobs_sum.index.get_level_values(
          'item_postamble_id'
      ),
      'response_scale_id': logprobs_sum.index.get_level_values(
          'response_scale_id'
      ),
      'response_choice_postamble_id': logprobs_sum.index.get_level_values(
          'response_choice_postamble_id'
      ),
      'model_id': logprobs_sum.index.get_level_values('model_id'),
      # convert these `response_value`s to ints
      'response_value': logprobs_sum.index.get_level_values(
          'model_output'
      ).astype(int),
      'score': normalized_probs,
  })

  # return the resulting dataframe
  return raw_response_scores_normalized_df
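

# Illustrative sketch (hypothetical data; the column names are the ones this
# module expects). Two generated outputs for item 'aa1' under a single SPID,
# with log-prob scores log(0.2) and log(0.8), normalize to scores of ~0.2
# and ~0.8 for response values 4 and 5:
#
#   toy_df = pd.DataFrame({
#       'item_preamble_id': ['p1', 'p1'],
#       'item_postamble_id': ['q1', 'q1'],
#       'response_scale_id': ['likert5', 'likert5'],
#       'response_choice_postamble_id': ['c1', 'c1'],
#       'model_id': ['m1', 'm1'],
#       'item_id': ['aa1', 'aa1'],
#       'model_output': ['4', '5'],
#       'model_output_score': [np.log(0.2), np.log(0.8)],
#   })
#   normalize_response_logprobs(toy_df)[['response_value', 'score']]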


# reshape response choice probability scores
def reshape_response_choice_probability_scores(
    raw_response_scores_df,
):
  """Reshapes raw data into columns of LLM scores for every response choice.

  Args:
    raw_response_scores_df: A DataFrame containing raw response scores in
      long format, with one row per unique prompt-continuation pair
      (containing a single response choice) and its LLM score. Columns
      should include `item_preamble_id`, `item_id`, `item_postamble_id`,
      `response_scale_id`, `response_choice_postamble_id`, `model_id`, and
      `response_value`.

  Returns:
    A DataFrame containing raw response scores in wide format. Each row
    contains IDs representing a unique prompt-continuation specification and
    the LLM float scores for each response choice considered in the
    specification. Score columns are labeled in the format `item_id` + `_` +
    `response_choice_value`; the column for item `brsf1`'s response choice
    value of 1 would therefore be `brsf1_1`.
  """
  # create pivot table of response choice probabilities nested under item IDs
  df_raw_wide = raw_response_scores_df.pivot_table(
      index=list(SPID), columns=['item_id', 'response_value'], values=['score']
  )

  # collapse pivot table into flat column names representing item IDs paired
  # with response scale values
  df_raw_wide.columns = [
      f'{item_id}_{response_value}'
      for _, item_id, response_value in df_raw_wide.columns
  ]

  # reset index
  df_raw_wide = df_raw_wide.reset_index()

  return df_raw_wide
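

# Illustrative sketch (hypothetical `raw_df` of long-format scores): rows
# for item 'brsf1' with `response_value`s 1 through 5 collapse into one wide
# row per SPID with score columns 'brsf1_1' through 'brsf1_5':
#
#   wide_df = reshape_response_choice_probability_scores(raw_df)
#   # list(wide_df.columns) -> [*SPID, 'brsf1_1', 'brsf1_2', ..., 'brsf1_5']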


# determine human-simulated response values
def calculate_human_simulated_responses(
    raw_response_scores_df,
):
  """Selects the most likely response choices to simulate human responses.

  This function simulates human responses to individual survey measure items
  by 'selecting' the response choice with the highest LLM probability score.
  The response value (an integer within the range of a given response scale)
  for the selected response choice is used to calculate human-simulated
  scale scores.

  For instance, the LLM scores for item 'aa1' using a 5-point Likert scale
  might be .20, .40, .60, .80, and 1.00 for the response choices 'strongly
  disagree', 'disagree', 'neither agree nor disagree', 'agree', and
  'strongly agree', respectively. To simulate a human response to this item,
  we select 'strongly agree', the response choice with the highest LLM
  score. The corresponding integer value for 'strongly agree' on the 5-point
  response scale is 5. Therefore, the simulated human response to item 'aa1'
  would be 5.

  Args:
    raw_response_scores_df: A DataFrame containing raw response scores in
      long format, with one row per unique prompt-continuation pair
      (containing a single response choice) and its LLM score. Columns
      should include `item_preamble_id`, `item_id`, `item_postamble_id`,
      `response_scale_id`, `response_choice_postamble_id`, `model_id`, and
      `response_value`.

  Returns:
    A DataFrame containing prompt-continuation specification data and
    columns of human-simulated integer response values labeled by `item_id`.
  """
  # register `pandas.progress_apply` with tqdm
  tqdm.tqdm.pandas()

  print('Determining the most likely response choice per item.')
  print('This could take a while! ...')
  # retrieve rows with the most likely response choice
  df_item_responses = raw_response_scores_df.loc[
      raw_response_scores_df.groupby(list(SPID) + ['item_id'])['score'].idxmax()
  ].reset_index(drop=True)

  # reshape to wide
  df_simulated_item_responses_wide = (
      df_item_responses.pivot(
          index=list(SPID), columns=['item_id'], values='response_value'
      )
      .reset_index()
      .rename_axis(index=None, columns=None)
  )

  return df_simulated_item_responses_wide
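

# Illustrative sketch, reusing the docstring's example (hypothetical
# `raw_df` as above): if item 'aa1' scores highest for 'strongly agree'
# (response value 5) under a given SPID, the returned frame's 'aa1' column
# holds 5 in that SPID's row:
#
#   simulated_df = calculate_human_simulated_responses(raw_df)
#   # simulated_df: one row per SPID, one integer column per item_id.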


# combine raw LLM scores and human-simulated choices into one DataFrame
def get_raw_and_simulated_responses(
    raw_response_scores_df, generative_mode=False
):
  """Returns combined DataFrame of raw LLM scores and simulated responses."""
  # if response data was created in generative mode, normalize model_output
  # log probabilities to probabilities.
  if generative_mode:
    raw_response_scores_df = normalize_response_logprobs(
        raw_response_scores_df
    )

  # reshape raw LLM response choice scores
  print('Reshaping raw LLM response choice scores...')
  df_raw_reshaped = reshape_response_choice_probability_scores(
      raw_response_scores_df
  )

  # calculate and reshape human-simulated item responses
  print('Calculating and reshaping human-simulated item responses...')
  df_simulated_item_responses = calculate_human_simulated_responses(
      raw_response_scores_df
  )

  # combine the above into one DataFrame
  print(
      'Combining LLM scores and human-simulated responses into one'
      ' DataFrame...'
  )

  return df_simulated_item_responses.merge(df_raw_reshaped, how='inner')
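

# Typical use (hypothetical `raw_df`): leave the default
# `generative_mode=False` for scoring-mode data; pass True for
# generative-mode data so model_output log-probs are normalized first:
#
#   combined_df = get_raw_and_simulated_responses(raw_df, generative_mode=True)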


# calculate session scale scores
def score_session(
    admin_session,
    raw_response_scores_df,
    verbose=False,
):
  """Calculates human-simulated scores from AdministrationSession results.

  This function treats each unique prompt-continuation specification as a
  simulated participant, indexed by a simulated participant ID (SPID).
  Iterating through each multi-item scale in an AdministrationSession, it
  calculates a summary score for each SPID by taking the average of all item
  response values (accounting for reverse-keyed items).

  Args:
    admin_session: An AdministrationSession.
    raw_response_scores_df: A DataFrame containing raw response scores in
      long format, with one row per unique prompt-continuation pair
      (containing a single response choice) and its LLM score. Columns
      should include `item_preamble_id`, `item_id`, `item_postamble_id`,
      `response_scale_id`, `response_choice_postamble_id`, `model_id`, and
      `response_value`.
    verbose: A boolean. If True, prints simulated scores for debugging.

  Returns:
    A DataFrame of raw LLM scores, human-simulated response values, and
    human-simulated scale scores. Scale scores are labeled by `scale_id`.
  """
  measures = admin_session.measures
  scored_session_df = get_raw_and_simulated_responses(raw_response_scores_df)

  # for each scale, score simulated participants (SPIDs)
  for measure in measures.values():
    for scale_id, scale in measure.scales.items():
      # get scale scoring info
      item_ids = scale.item_ids
      reverse_keyed_item_ids = scale.reverse_keyed_item_ids
      response_scale_ids = scale.response_scale_ids
      scale_length = len(item_ids)

      # for each response scale type, score columnwise
      for response_scale_id in response_scale_ids:
        scale_point_range = len(
            admin_session.response_scales[response_scale_id].response_choices
        )

        # only work on rows that use the current response_scale
        df_response_scale_id_col = scored_session_df['response_scale_id']

        item_values = []

        for item_id in item_ids:
          original_values = scored_session_df[
              df_response_scale_id_col == response_scale_id
          ][item_id]

          # reverse key the item value column if its item_id is in
          # reverse_keyed_item_ids; otherwise, keep the values the same
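          # (e.g., on a 5-point scale, a raw value of 2 becomes 5 - 2 + 1 = 4)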
          if item_id in reverse_keyed_item_ids:
            processed_values = scale_point_range - original_values + 1
          else:
            processed_values = original_values

          item_values.append(processed_values)

        simulated_scale_scores = sum(item_values) / scale_length

        if verbose:
          print(
              'Simulated "'
              + response_scale_id
              + '" scale scores for '
              + scale_id
              + ':\n'
              + str(simulated_scale_scores)
              + '\n'
          )

        scored_session_df.loc[
            df_response_scale_id_col == response_scale_id, scale_id
        ] = simulated_scale_scores

  return scored_session_df
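
# Hypothetical end-to-end usage (a sketch only; constructing the
# AdministrationSession depends on survey_bench_lib and is not shown here):
#
#   admin_session = ...  # an AdministrationSession built via survey_bench_lib
#   raw_df = pd.read_csv('raw_response_scores.csv')  # long-format raw scores
#   scored_df = score_session(admin_session, raw_df, verbose=True)
#   # scored_df: raw scores, simulated item responses, and one scale-score
#   # column per scale_id.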