# colossalai
# 69 lines · 2.1 KB
from typing import Optional, Union

import torch
import torch.nn.functional as F
5
6
def _compute_approx_kl(
    log_probs: torch.Tensor, log_probs_base: torch.Tensor, action_mask: Optional[torch.Tensor] = None
) -> torch.Tensor:
    """
    Compute the approximate KL divergence between two distributions.
    Schulman blog: http://joschu.net/blog/kl-approx.html

    The per-element estimate is ``(exp(d) - 1) - d`` with
    ``d = log_probs_base - log_probs``; it is then averaged over ``dim=1``.

    Args:
        log_probs: Log probabilities of the new distribution.
        log_probs_base: Log probabilities of the base distribution.
        action_mask: Mask for actions. When given, the average over ``dim=1``
            is a masked mean (see ``masked_mean``); otherwise a plain mean.

    Returns:
        The estimate reduced over ``dim=1``.
        NOTE(review): inputs are presumably (batch, num_actions) - confirm with callers.
    """
    log_ratio = log_probs_base - log_probs
    approx_kl = (log_ratio.exp() - 1) - log_ratio
    if action_mask is not None:
        # Only positions selected by the mask contribute to the average.
        approx_kl = masked_mean(approx_kl, action_mask, dim=1)
    else:
        approx_kl = approx_kl.mean(dim=1)
    return approx_kl
27
28
def compute_reward(
    r: Union[torch.Tensor, float],
    kl_coef: float,
    log_probs: torch.Tensor,
    log_probs_base: torch.Tensor,
    action_mask: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, float]:
    """Subtract a KL penalty from a reward.

    Args:
        r: Raw reward, either a tensor or a plain float.
        kl_coef: Weight of the KL penalty. When it is ``<= 0`` the penalty is
            skipped and ``r`` is returned unchanged.
        log_probs: Log probabilities of the new distribution.
        log_probs_base: Log probabilities of the base distribution.
        action_mask: Optional mask forwarded to the KL estimate.

    Returns:
        ``r`` itself when ``kl_coef <= 0``; otherwise ``r - kl_coef * kl``,
        where ``kl`` comes from ``_compute_approx_kl``.
    """
    if kl_coef <= 0.0:
        # No penalty requested: skip the KL computation entirely.
        return r
    kl = _compute_approx_kl(log_probs, log_probs_base, action_mask=action_mask)
    return r - kl_coef * kl
41
42
def _log_probs_from_logits(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """Pick the log-probability of each label out of a logits tensor.

    Args:
        logits: Unnormalized scores; ``log_softmax`` is applied over the last dim.
        labels: Integer indices into the last dim of ``logits``; must match
            ``logits`` in every dim except the last.

    Returns:
        A tensor shaped like ``labels`` holding ``log_softmax(logits)[..., label]``.
    """
    log_probs = F.log_softmax(logits, dim=-1)
    # gather needs an index of the same rank as the input, hence the temporary trailing dim.
    log_probs_labels = log_probs.gather(dim=-1, index=labels.unsqueeze(-1))
    return log_probs_labels.squeeze(-1)
47
48
def calc_action_log_probs(logits: torch.Tensor, sequences: torch.LongTensor, num_actions: int) -> torch.Tensor:
    """Calculate action log probs.

    Position ``t`` of ``logits`` is scored against token ``t + 1`` of
    ``sequences``, and the last ``num_actions`` positions are returned.

    Args:
        logits (torch.Tensor): Output tensor of Actor.forward.logits.
        sequences (torch.LongTensor): Input sequences.
        num_actions (int): Number of actions.

    Returns:
        torch.Tensor: Action log probs, i.e. the trailing ``num_actions``
        columns of the per-token log probs (all of them if fewer exist).

    Raises:
        ValueError: If ``num_actions`` is negative.
    """
    if num_actions < 0:
        raise ValueError(f"num_actions must be non-negative, got {num_actions}")
    # Drop the last logit and the first token so predictions line up with their targets.
    log_probs = _log_probs_from_logits(logits[:, :-1, :], sequences[:, 1:])
    # An explicit start index is used because `-num_actions:` would select
    # every column when num_actions == 0 (-0 == 0).
    start = max(log_probs.size(1) - num_actions, 0)
    return log_probs[:, start:]
62
63
def masked_mean(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1, eps: float = 1e-8) -> torch.Tensor:
    """Average ``tensor`` along ``dim``, weighting each element by ``mask``.

    Args:
        tensor: Values to average.
        mask: Weights multiplied element-wise into ``tensor``.
            NOTE(review): presumably a 0/1 mask - confirm with callers.
        dim: Dimension to reduce over.
        eps: Small constant added to the denominator so that an all-zero mask
            yields 0 instead of a division by zero.

    Returns:
        ``sum(tensor * mask) / (sum(mask) + eps)`` computed along ``dim``.
    """
    masked_sum = (tensor * mask).sum(dim=dim)
    mask_sum = mask.sum(dim=dim)
    return masked_sum / (mask_sum + eps)
70