import pytest
import torch

from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice

from .utils import DummyLoRAManager

TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4]
QKV_TENSOR_SIZES = [
    (8192, 1024, 1024),
    (8192 // 8, 1024 // 8, 1024 // 8),
    (4096, 4096, 4096),
    (4096 // 2, 4096 // 2, 4096 // 2),
]
BATCH_SIZES = [8, 32, 256]
RANKS = [8]
DTYPES = [torch.float16]
TOLERANCES = {
    torch.float16: (5e-3, 5e-3),
    torch.bfloat16: (3e-2, 2e-2),
}
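
# The tests below exercise the LoRA application helpers imported from
# vllm.lora.layers. In each test the LoRA A/B matrices are stacked as
# (max_loras=8, 1, r, in_features) and (max_loras=8, 1, out_features, r)
# tensors with the scaling factor pre-folded into B, and a per-token index
# tensor selects which stacked LoRA to apply; an index of -1 is expected to
# leave the corresponding output rows untouched (all zeros here).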


@pytest.mark.parametrize("m", TENSOR_SIZES)
@pytest.mark.parametrize("n", TENSOR_SIZES)
@pytest.mark.parametrize("k", BATCH_SIZES)
@pytest.mark.parametrize("rank", RANKS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_apply_lora(m, n, k, rank, dtype) -> None:
    manager = DummyLoRAManager()

    module_name = "module"
    weight = torch.rand([m, n], device="cuda", dtype=dtype)

    manager.init_random_lora(module_name, weight, rank=rank)
    lora = manager.get_module_lora(module_name)

    input = torch.rand(k, n, device="cuda", dtype=dtype)
    expected = input @ lora.lora_a @ lora.lora_b * lora.scaling

    # Stack the same LoRA into every slot, with scaling folded into B.
    lora_a_stack = torch.zeros(8,
                               1,
                               lora.lora_a.shape[1],
                               lora.lora_a.shape[0],
                               device="cuda",
                               dtype=dtype)
    lora_b_stack = torch.zeros(8,
                               1,
                               lora.lora_b.shape[1],
                               lora.lora_b.shape[0],
                               device="cuda",
                               dtype=dtype)
    for i in range(lora_a_stack.shape[0]):
        lora_a_stack[i][0] = lora.lora_a.T
        lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T

    output = torch.zeros(k, m, device="cuda", dtype=dtype)
    _apply_lora(
        input, lora_a_stack, lora_b_stack,
        torch.randint(0, lora_a_stack.shape[0], (len(input), ), device="cuda"),
        output)

    rtol, atol = TOLERANCES[dtype]
    assert torch.allclose(expected, output, rtol=rtol, atol=atol)

    # An index of -1 disables LoRA for a token, so the output must stay zero.
    output[:] = 0
    _apply_lora(input, lora_a_stack, lora_b_stack,
                torch.full((len(input), ), -1, device="cuda"), output)
    assert torch.allclose(torch.zeros_like(output), output)

    manager.reset_lora()
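
# The packed variants below cover layers whose output is a concatenation of
# several projections (two equal slices here, q/k/v further down): each slice
# gets its own LoRA stack, and _apply_lora_packed_nslice also receives the
# output width of every slice so each delta lands in the right block of
# columns.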


@pytest.mark.parametrize("m", TENSOR_SIZES)
@pytest.mark.parametrize("n", TENSOR_SIZES)
@pytest.mark.parametrize("k", BATCH_SIZES)
@pytest.mark.parametrize("rank", RANKS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None:
    if m % 2 != 0:
        pytest.skip("m must be divisible by 2")
    if m // 2 not in TENSOR_SIZES:
        pytest.skip("m//2 must be in TENSOR_SIZES")

    manager = DummyLoRAManager()

    module_name = "module"
    weight = torch.rand([m // 2, n], device="cuda", dtype=dtype)

    manager.init_random_lora(module_name + "1", weight, rank=rank)
    lora_1 = manager.get_module_lora(module_name + "1")
    manager.init_random_lora(module_name + "2", weight, rank=rank)
    lora_2 = manager.get_module_lora(module_name + "2")

    input = torch.rand(k, n, device="cuda", dtype=dtype)
    expected = torch.cat([
        input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling,
        input @ lora_2.lora_a @ lora_2.lora_b * lora_2.scaling
    ],
                         dim=1)

    # One LoRA stack per output slice, scaling folded into B.
    lora_a_stacks = [
        torch.zeros(8,
                    1,
                    lora_1.lora_a.shape[1],
                    lora_1.lora_a.shape[0],
                    device="cuda",
                    dtype=dtype) for i in range(2)
    ]
    lora_b_stacks = [
        torch.zeros(8,
                    1,
                    lora_1.lora_b.shape[1],
                    lora_1.lora_b.shape[0],
                    device="cuda",
                    dtype=dtype) for i in range(2)
    ]
    for i in range(lora_a_stacks[0].shape[0]):
        lora_a_stacks[0][i][0] = lora_1.lora_a.T
        lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T
        lora_a_stacks[1][i][0] = lora_2.lora_a.T
        lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T

    output = torch.zeros(k, m, device="cuda", dtype=dtype)
    _apply_lora_packed_nslice(
        input, lora_a_stacks, lora_b_stacks,
        torch.randint(0,
                      lora_a_stacks[0].shape[0], (len(input), ),
                      device="cuda"), output, (m // 2, m // 2))

    rtol, atol = TOLERANCES[dtype]
    assert torch.allclose(expected, output, rtol=rtol, atol=atol)

    output[:] = 0
    _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks,
                              torch.full((len(input), ), -1, device="cuda"),
                              output, (m // 2, m // 2))
    assert torch.allclose(torch.zeros_like(output), output)

    manager.reset_lora()
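
# The 3-slice case mirrors a merged QKV projection; QKV_TENSOR_SIZES holds
# (q_size, kv_size, kv_size) tuples so the key/value slices can be narrower
# than the query slice, and the per-slice widths passed to the kernel are
# (qkv[0], qkv[1], qkv[2]).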


@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES)
@pytest.mark.parametrize("n", TENSOR_SIZES)
@pytest.mark.parametrize("k", BATCH_SIZES)
@pytest.mark.parametrize("rank", RANKS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None:
    manager = DummyLoRAManager()

    module_name = "module"
    weight_q = torch.empty(qkv[0], n, device="cuda", dtype=dtype)
    weight_kv = torch.empty(qkv[1], n, device="cuda", dtype=dtype)

    manager.init_random_lora(module_name + "q", weight_q, rank=rank)
    lora_q = manager.get_module_lora(module_name + "q")
    manager.init_random_lora(module_name + "k", weight_kv, rank=rank)
    lora_k = manager.get_module_lora(module_name + "k")
    manager.init_random_lora(module_name + "v", weight_kv, rank=rank)
    lora_v = manager.get_module_lora(module_name + "v")

    input = torch.rand(k, n, device="cuda", dtype=dtype)
    expected = torch.cat([
        input @ lora_q.lora_a @ lora_q.lora_b * lora_q.scaling,
        input @ lora_k.lora_a @ lora_k.lora_b * lora_k.scaling,
        input @ lora_v.lora_a @ lora_v.lora_b * lora_v.scaling
    ],
                         dim=1)

    # One stack for the q slice plus two (k and v) built from the kv shapes.
    lora_a_stacks = [
        torch.zeros(8,
                    1,
                    lora_q.lora_a.shape[1],
                    lora_q.lora_a.shape[0],
                    device="cuda",
                    dtype=dtype)
    ] + [
        torch.zeros(8,
                    1,
                    lora_k.lora_a.shape[1],
                    lora_k.lora_a.shape[0],
                    device="cuda",
                    dtype=dtype) for i in range(2)
    ]
    lora_b_stacks = [
        torch.zeros(8,
                    1,
                    lora_q.lora_b.shape[1],
                    lora_q.lora_b.shape[0],
                    device="cuda",
                    dtype=dtype)
    ] + [
        torch.zeros(8,
                    1,
                    lora_k.lora_b.shape[1],
                    lora_k.lora_b.shape[0],
                    device="cuda",
                    dtype=dtype) for i in range(2)
    ]
    for i in range(lora_a_stacks[0].shape[0]):
        lora_a_stacks[0][i][0] = lora_q.lora_a.T
        lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T
        lora_a_stacks[1][i][0] = lora_k.lora_a.T
        lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T
        lora_a_stacks[2][i][0] = lora_v.lora_a.T
        lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T

    output = torch.zeros(k, sum(qkv), device="cuda", dtype=dtype)
    _apply_lora_packed_nslice(
        input, lora_a_stacks, lora_b_stacks,
        torch.randint(0,
                      lora_a_stacks[0].shape[0], (len(input), ),
                      device="cuda"), output, (qkv[0], qkv[1], qkv[2]))

    rtol, atol = TOLERANCES[dtype]
    assert torch.allclose(expected, output, rtol=rtol, atol=atol)

    output[:] = 0
    _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks,
                              torch.full((len(input), ), -1, device="cuda"),
                              output, (qkv[0], qkv[1], qkv[2]))
    assert torch.allclose(torch.zeros_like(output), output)

    manager.reset_lora()
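

# For intuition only: a minimal dense PyTorch sketch of the computation the
# stacked-LoRA tests above expect, written against the stack layout used in
# this file (scaling already folded into the B stack). `_reference_lora` is a
# hypothetical helper for illustration, not part of vllm.lora.layers.
def _reference_lora(input, lora_a_stack, lora_b_stack, indices):
    # lora_a_stack: (max_loras, 1, r, in_features)
    # lora_b_stack: (max_loras, 1, out_features, r)
    output = torch.zeros(input.shape[0],
                         lora_b_stack.shape[2],
                         device=input.device,
                         dtype=input.dtype)
    for row, idx in enumerate(indices.tolist()):
        if idx < 0:
            continue  # -1 means "no LoRA" for this token; the row stays zero
        a = lora_a_stack[idx, 0]  # (r, in_features)
        b = lora_b_stack[idx, 0]  # (out_features, r)
        output[row] = input[row] @ a.T @ b.T
    return output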