# coding=utf-8
# Copyright 2024 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Functions for calculating human-simulated scores."""

import numpy as np
import pandas as pd

import tqdm

from psyborgs import survey_bench_lib


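# Fields that jointly identify a simulated participant (SPID): each unique
# prompt-continuation specification is treated as one simulated respondent
# (see `score_session` below).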
SPID = (
    'item_preamble_id',
    'item_postamble_id',
    'response_scale_id',
    'response_choice_postamble_id',
    'model_id',
)


def logsumexp(x):
  c = x.max()
  return c + np.log(np.sum(np.exp(x - c)))


def normalize_logprobs(x):
  diff = x - logsumexp(x)
  probs = np.exp(diff)
  return probs / np.sum(probs)
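

# Quick numerical sanity checks (illustrative, not part of the original
# module). The max-shift in `logsumexp` is the standard log-sum-exp
# stability trick: for x = [-1000., -1000.], a naive
# np.log(np.sum(np.exp(x))) underflows to -inf, while logsumexp(x) returns
# -1000 + log(2). `normalize_logprobs` then recovers probabilities from
# unnormalized log scores:
#
#   logsumexp(np.array([-1000.0, -1000.0]))      # ~ -999.3069
#   normalize_logprobs(np.log([0.2, 0.3, 0.5]))  # ~ array([0.2, 0.3, 0.5])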


# for generative mode


def normalize_response_logprobs(df):
  """Converts generated responses and log-prob scores to probabilities."""
  grouped = df.groupby(
      list(SPID) + ['item_id', 'model_output'], group_keys=False
  )

  # calculate the log probability sums for each unique combination of
  # item_id, model_output, and SPID
  logprobs_sum = grouped['model_output_score'].sum()

  # normalize the log probabilities for each unique combination of item_id
  # and SPID
  normalized_probs = (
      logprobs_sum.groupby(list(SPID) + ['item_id'], group_keys=False)
      .apply(normalize_logprobs)
      .values
  )

  # create a new dataframe with the normalized probabilities
  raw_response_scores_normalized_df = pd.DataFrame({
      'item_preamble_id': logprobs_sum.index.get_level_values(
          'item_preamble_id'
      ),
      'item_id': logprobs_sum.index.get_level_values('item_id'),
      'item_postamble_id': logprobs_sum.index.get_level_values(
          'item_postamble_id'
      ),
      'response_scale_id': logprobs_sum.index.get_level_values(
          'response_scale_id'
      ),
      'response_choice_postamble_id': logprobs_sum.index.get_level_values(
          'response_choice_postamble_id'
      ),
      'model_id': logprobs_sum.index.get_level_values('model_id'),
      # convert these `response_value`s to ints
      'response_value': logprobs_sum.index.get_level_values(
          'model_output'
      ).astype(int),
      'score': normalized_probs,
  })

  # return the resulting dataframe
  return raw_response_scores_normalized_df
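

# Illustrative sketch (hypothetical data; the column names are the ones this
# module expects). Two generated outputs for item 'aa1' under a single SPID,
# with log-prob scores log(0.2) and log(0.8), normalize to scores of ~0.2
# and ~0.8 for response values 4 and 5:
#
#   toy_df = pd.DataFrame({
#       'item_preamble_id': ['p1', 'p1'],
#       'item_postamble_id': ['q1', 'q1'],
#       'response_scale_id': ['likert5', 'likert5'],
#       'response_choice_postamble_id': ['c1', 'c1'],
#       'model_id': ['m1', 'm1'],
#       'item_id': ['aa1', 'aa1'],
#       'model_output': ['4', '5'],
#       'model_output_score': [np.log(0.2), np.log(0.8)],
#   })
#   normalize_response_logprobs(toy_df)[['response_value', 'score']]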


# reshape response choice probability scores
def reshape_response_choice_probability_scores(
    raw_response_scores_df,
):
  """Reshapes raw data into columns of LLM scores for every response choice.

  Args:
    raw_response_scores_df: A DataFrame containing raw response scores in
      long format, with one row per unique prompt-continuation pair
      (containing a single response choice) and its LLM score. Columns
      should include `item_preamble_id`, `item_id`, `item_postamble_id`,
      `response_scale_id`, `response_choice_postamble_id`, `model_id`, and
      `response_value`.

  Returns:
    A DataFrame containing raw response scores in wide format. Each row
    contains IDs representing a unique prompt-continuation specification and
    the LLM float scores for each response choice considered in the
    specification. Score columns are labeled in the format `item_id` + `_` +
    `response_choice_value`; the column for item `brsf1`'s response choice
    value of 1 would therefore be `brsf1_1`.
  """
  # create pivot table of response choice probabilities nested under item IDs
  df_raw_wide = raw_response_scores_df.pivot_table(
      index=list(SPID), columns=['item_id', 'response_value'], values=['score']
  )

  # collapse pivot table into flat column names representing item IDs paired
  # with response scale values
  df_raw_wide.columns = [
      f'{item_id}_{response_value}'
      for _, item_id, response_value in df_raw_wide.columns
  ]

  # reset index
  df_raw_wide = df_raw_wide.reset_index()

  return df_raw_wide
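

# Illustrative sketch (hypothetical `raw_df` of long-format scores): rows
# for item 'brsf1' with `response_value`s 1 through 5 collapse into one wide
# row per SPID with score columns 'brsf1_1' through 'brsf1_5':
#
#   wide_df = reshape_response_choice_probability_scores(raw_df)
#   # list(wide_df.columns) -> [*SPID, 'brsf1_1', 'brsf1_2', ..., 'brsf1_5']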


# determine human-simulated response values
def calculate_human_simulated_responses(
    raw_response_scores_df,
):
  """Selects the most likely response choices to simulate human responses.

  This function simulates human responses to individual survey measure items
  by 'selecting' the response choice with the highest LLM probability score.
  The response value (an integer within the range of a given response scale)
  for the selected response choice is used to calculate human-simulated
  scale scores.

  For instance, the LLM scores for item 'aa1' using a 5-point Likert scale
  might be .20, .40, .60, .80, and 1.00 for the response choices 'strongly
  disagree', 'disagree', 'neither agree nor disagree', 'agree', and
  'strongly agree', respectively. To simulate a human response to this item,
  we select 'strongly agree', the response choice with the highest LLM
  score. The corresponding integer value for 'strongly agree' on the 5-point
  response scale is 5. Therefore, the simulated human response to item 'aa1'
  would be 5.

  Args:
    raw_response_scores_df: A DataFrame containing raw response scores in
      long format, with one row per unique prompt-continuation pair
      (containing a single response choice) and its LLM score. Columns
      should include `item_preamble_id`, `item_id`, `item_postamble_id`,
      `response_scale_id`, `response_choice_postamble_id`, `model_id`, and
      `response_value`.

  Returns:
    A DataFrame containing prompt-continuation specification data and
    columns of human-simulated integer response values labeled by `item_id`.
  """
  # register `pandas.progress_apply` with tqdm
  tqdm.tqdm.pandas()

  print('Determining the most likely response choice per item.')
  print('This could take a while! ...')
  # retrieve rows with the most likely response choice
  df_item_responses = raw_response_scores_df.loc[
      raw_response_scores_df.groupby(list(SPID) + ['item_id'])['score'].idxmax()
  ].reset_index(drop=True)

  # reshape to wide
  df_simulated_item_responses_wide = (
      df_item_responses.pivot(
          index=list(SPID), columns=['item_id'], values='response_value'
      )
      .reset_index()
      .rename_axis(index=None, columns=None)
  )

  return df_simulated_item_responses_wide
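

# Illustrative sketch, reusing the docstring's example (hypothetical
# `raw_df` as above): if item 'aa1' scores highest for 'strongly agree'
# (response value 5) under a given SPID, the returned frame's 'aa1' column
# holds 5 in that SPID's row:
#
#   simulated_df = calculate_human_simulated_responses(raw_df)
#   # simulated_df: one row per SPID, one integer column per item_id.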


# combine raw LLM scores and human-simulated choices into one DataFrame
def get_raw_and_simulated_responses(
    raw_response_scores_df, generative_mode=False
):
  """Returns combined DataFrame of raw LLM scores and simulated responses."""
  # if response data was created in generative mode, normalize model_output
  # log probabilities to probabilities.
  if generative_mode:
    raw_response_scores_df = normalize_response_logprobs(
        raw_response_scores_df
    )

  # reshape raw LLM response choice scores
  print('Reshaping raw LLM response choice scores...')
  df_raw_reshaped = reshape_response_choice_probability_scores(
      raw_response_scores_df
  )

  # calculate and reshape human-simulated item responses
  print('Calculating and reshaping human-simulated item responses...')
  df_simulated_item_responses = calculate_human_simulated_responses(
      raw_response_scores_df
  )

  # combine the above into one DataFrame
  print(
      'Combining LLM scores and human-simulated responses into one'
      ' DataFrame...'
  )

  return df_simulated_item_responses.merge(df_raw_reshaped, how='inner')
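

# Typical use (hypothetical `raw_df`): leave the default
# `generative_mode=False` for scoring-mode data; pass True for
# generative-mode data so model_output log-probs are normalized first:
#
#   combined_df = get_raw_and_simulated_responses(raw_df, generative_mode=True)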


# calculate session scale scores
def score_session(
    admin_session,
    raw_response_scores_df,
    verbose=False,
):
  """Calculates human-simulated scores from AdministrationSession results.

  This function treats each unique prompt-continuation specification as a
  simulated participant, indexed by a simulated participant ID (SPID).
  Iterating through each multi-item scale in an AdministrationSession, it
  calculates a summary score for each SPID by taking the average of all item
  response values (accounting for reverse-keyed items).

  Args:
    admin_session: An AdministrationSession.
    raw_response_scores_df: A DataFrame containing raw response scores in
      long format, with one row per unique prompt-continuation pair
      (containing a single response choice) and its LLM score. Columns
      should include `item_preamble_id`, `item_id`, `item_postamble_id`,
      `response_scale_id`, `response_choice_postamble_id`, `model_id`, and
      `response_value`.
    verbose: A boolean. If True, prints simulated scores for debugging.

  Returns:
    A DataFrame of raw LLM scores, human-simulated response values, and
    human-simulated scale scores. Scale scores are labeled by `scale_id`.
  """
  measures = admin_session.measures
  scored_session_df = get_raw_and_simulated_responses(raw_response_scores_df)

  # for each scale, score simulated participants (SPIDs)
  for measure in measures.values():
    for scale_id, scale in measure.scales.items():
      # get scale scoring info
      item_ids = scale.item_ids
      reverse_keyed_item_ids = scale.reverse_keyed_item_ids
      response_scale_ids = scale.response_scale_ids
      scale_length = len(item_ids)

      # for each response scale type, score columnwise
      for response_scale_id in response_scale_ids:
        scale_point_range = len(
            admin_session.response_scales[response_scale_id].response_choices
        )

        # only work on rows that use the current response_scale
        df_response_scale_id_col = scored_session_df['response_scale_id']

        item_values = []

        for item_id in item_ids:
          original_values = scored_session_df[
              df_response_scale_id_col == response_scale_id
          ][item_id]

          # reverse key the item value column if its item_id is in
          # reverse_keyed_item_ids; otherwise, keep the values the same
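          # (e.g., on a 5-point scale, a raw value of 2 becomes 5 - 2 + 1 = 4)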
          if item_id in reverse_keyed_item_ids:
            processed_values = scale_point_range - original_values + 1
          else:
            processed_values = original_values

          item_values.append(processed_values)

        simulated_scale_scores = sum(item_values) / scale_length

        if verbose:
          print(
              'Simulated "'
              + response_scale_id
              + '" scale scores for '
              + scale_id
              + ':\n'
              + str(simulated_scale_scores)
              + '\n'
          )

        scored_session_df.loc[
            df_response_scale_id_col == response_scale_id, scale_id
        ] = simulated_scale_scores

  return scored_session_df
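
# Hypothetical end-to-end usage (a sketch only; constructing the
# AdministrationSession depends on survey_bench_lib and is not shown here):
#
#   admin_session = ...  # an AdministrationSession built via survey_bench_lib
#   raw_df = pd.read_csv('raw_response_scores.csv')  # long-format raw scores
#   scored_df = score_session(admin_session, raw_df, verbose=True)
#   # scored_df: raw scores, simulated item responses, and one scale-score
#   # column per scale_id.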