ncnn

Форк
0
/
allocator.cpp 
2179 строк · 73.3 Кб
1
// Tencent is pleased to support the open source community by making ncnn available.
2
//
3
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
4
//
5
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6
// in compliance with the License. You may obtain a copy of the License at
7
//
8
// https://opensource.org/licenses/BSD-3-Clause
9
//
10
// Unless required by applicable law or agreed to in writing, software distributed
11
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
// specific language governing permissions and limitations under the License.
14

15
#include "allocator.h"
16

17
#include "gpu.h"
18
#include "pipeline.h"
19

20
#if __ANDROID_API__ >= 26
21
#include <android/hardware_buffer.h>
22
#endif // __ANDROID_API__ >= 26
23

24
namespace ncnn {
25

26
Allocator::~Allocator()
27
{
28
}
29

30
// Private state of PoolAllocator, hidden behind a d-pointer so the public
// header stays ABI-stable.
class PoolAllocatorPrivate
{
public:
    Mutex budgets_lock; // guards budgets
    Mutex payouts_lock; // guards payouts
    unsigned int size_compare_ratio; // 0~256
    // maximum number of cached free chunks before eviction kicks in
    size_t size_drop_threshold;
    // free chunks available for reuse: (chunk size, pointer)
    std::list<std::pair<size_t, void*> > budgets;
    // chunks currently handed out to callers: (chunk size, pointer)
    std::list<std::pair<size_t, void*> > payouts;
};
40

41
// Construct an empty pool. Defaults: ratio 0 means any cached chunk that is
// at least the requested size may be reused; up to 10 free chunks are kept
// before eviction starts.
PoolAllocator::PoolAllocator()
    : Allocator(), d(new PoolAllocatorPrivate)
{
    d->size_compare_ratio = 0;
    d->size_drop_threshold = 10;
}
47

48
// Destroy the pool. All cached free chunks are released; if any chunk is
// still checked out (payouts non-empty) the caller leaked it — that is
// reported as a fatal error because the bookkeeping is about to vanish.
PoolAllocator::~PoolAllocator()
{
    clear();

    if (!d->payouts.empty())
    {
        NCNN_LOGE("FATAL ERROR! pool allocator destroyed too early");
#if NCNN_STDIO
        // enumerate the leaked pointers to help track down the offender
        std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
        for (; it != d->payouts.end(); ++it)
        {
            void* ptr = it->second;
            NCNN_LOGE("%p still in use", ptr);
        }
#endif
    }

    delete d;
}
67

68
// Copying is intentionally disabled; this copy constructor leaves the
// d-pointer null and must never actually be invoked.
PoolAllocator::PoolAllocator(const PoolAllocator&)
    : d(0)
{
}
72

73
// Assignment is intentionally disabled; this no-op exists only to prevent
// accidental copies.
PoolAllocator& PoolAllocator::operator=(const PoolAllocator&)
{
    return *this;
}
77

78
void PoolAllocator::clear()
79
{
80
    d->budgets_lock.lock();
81

82
    std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin();
83
    for (; it != d->budgets.end(); ++it)
84
    {
85
        void* ptr = it->second;
86
        ncnn::fastFree(ptr);
87
    }
88
    d->budgets.clear();
89

90
    d->budgets_lock.unlock();
91
}
92

93
void PoolAllocator::set_size_compare_ratio(float scr)
94
{
95
    if (scr < 0.f || scr > 1.f)
96
    {
97
        NCNN_LOGE("invalid size compare ratio %f", scr);
98
        return;
99
    }
100

101
    d->size_compare_ratio = (unsigned int)(scr * 256);
102
}
103

104
// Set the maximum number of free chunks kept in the pool before fastMalloc
// starts evicting cached chunks back to the OS.
void PoolAllocator::set_size_drop_threshold(size_t threshold)
{
    d->size_drop_threshold = threshold;
}
108

109
// Hand out a chunk of at least `size` bytes, preferring a cached free chunk.
// A cached chunk of size bs is reused when bs >= size and
// (bs * size_compare_ratio) >> 8 <= size, i.e. it is big enough but not
// wastefully larger. If the pool already holds size_drop_threshold or more
// chunks and none fits, one cached chunk is evicted to bound pool growth.
// Falls back to a fresh ncnn::fastMalloc when nothing is reused.
void* PoolAllocator::fastMalloc(size_t size)
{
    d->budgets_lock.lock();

    // find free budget
    std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin(), it_max = d->budgets.begin(), it_min = d->budgets.begin();
    for (; it != d->budgets.end(); ++it)
    {
        size_t bs = it->first;

        // size_compare_ratio ~ 100%
        if (bs >= size && ((bs * d->size_compare_ratio) >> 8) <= size)
        {
            void* ptr = it->second;

            d->budgets.erase(it);

            // budgets_lock is dropped before payouts_lock is taken; the two
            // locks are never held at the same time
            d->budgets_lock.unlock();

            d->payouts_lock.lock();

            d->payouts.push_back(std::make_pair(bs, ptr));

            d->payouts_lock.unlock();

            return ptr;
        }

        // track the smallest and largest cached chunks for possible eviction
        if (bs < it_min->first)
        {
            it_min = it;
        }
        if (bs > it_max->first)
        {
            it_max = it;
        }
    }

    if (d->budgets.size() >= d->size_drop_threshold)
    {
        // All chunks in pool are not chosen. Then try to drop some outdated
        // chunks and return them to OS.
        if (it_max->first < size)
        {
            // Current query is asking for a chunk larger than any cached chunks.
            // Then remove the smallest one.
            ncnn::fastFree(it_min->second);
            d->budgets.erase(it_min);
        }
        else if (it_min->first > size)
        {
            // Current query is asking for a chunk smaller than any cached chunks.
            // Then remove the largest one.
            ncnn::fastFree(it_max->second);
            d->budgets.erase(it_max);
        }
    }

    d->budgets_lock.unlock();

    // new
    void* ptr = ncnn::fastMalloc(size);

    d->payouts_lock.lock();

    d->payouts.push_back(std::make_pair(size, ptr));

    d->payouts_lock.unlock();

    return ptr;
}
180

181
// Return a chunk previously handed out by fastMalloc to the pool.
// The entry is moved from payouts back to budgets (keeping its recorded
// size) so it can be reused instead of being released. A pointer that was
// never handed out indicates a caller bug: it is logged and freed directly.
void PoolAllocator::fastFree(void* ptr)
{
    d->payouts_lock.lock();

    // return to budgets
    std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
    for (; it != d->payouts.end(); ++it)
    {
        if (it->second == ptr)
        {
            size_t size = it->first;

            d->payouts.erase(it);

            // release payouts_lock before acquiring budgets_lock; the two
            // locks are never held simultaneously
            d->payouts_lock.unlock();

            d->budgets_lock.lock();

            d->budgets.push_back(std::make_pair(size, ptr));

            d->budgets_lock.unlock();

            return;
        }
    }

    d->payouts_lock.unlock();

    NCNN_LOGE("FATAL ERROR! pool allocator get wild %p", ptr);
    ncnn::fastFree(ptr);
}
212

213
// Private state of UnlockedPoolAllocator. Same layout as PoolAllocatorPrivate
// but with no mutexes: the unlocked variant is not thread-safe by design.
class UnlockedPoolAllocatorPrivate
{
public:
    unsigned int size_compare_ratio; // 0~256
    // maximum number of cached free chunks before eviction kicks in
    size_t size_drop_threshold;
    // free chunks available for reuse: (chunk size, pointer)
    std::list<std::pair<size_t, void*> > budgets;
    // chunks currently handed out to callers: (chunk size, pointer)
    std::list<std::pair<size_t, void*> > payouts;
};
221

222
// Construct an empty single-threaded pool with the same defaults as
// PoolAllocator: ratio 0 (any large-enough chunk reusable), keep up to 10
// free chunks before evicting.
UnlockedPoolAllocator::UnlockedPoolAllocator()
    : Allocator(), d(new UnlockedPoolAllocatorPrivate)
{
    d->size_compare_ratio = 0;
    d->size_drop_threshold = 10;
}
228

229
// Destroy the pool. Cached free chunks are released; any entry left in
// payouts is a chunk the caller never returned and is reported as a leak.
UnlockedPoolAllocator::~UnlockedPoolAllocator()
{
    clear();

    if (!d->payouts.empty())
    {
        NCNN_LOGE("FATAL ERROR! unlocked pool allocator destroyed too early");
#if NCNN_STDIO
        // enumerate the leaked pointers to help track down the offender
        std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
        for (; it != d->payouts.end(); ++it)
        {
            void* ptr = it->second;
            NCNN_LOGE("%p still in use", ptr);
        }
#endif
    }

    delete d;
}
248

249
// Copying is intentionally disabled; this copy constructor leaves the
// d-pointer null and must never actually be invoked.
UnlockedPoolAllocator::UnlockedPoolAllocator(const UnlockedPoolAllocator&)
    : d(0)
{
}
253

254
// Assignment is intentionally disabled; this no-op exists only to prevent
// accidental copies.
UnlockedPoolAllocator& UnlockedPoolAllocator::operator=(const UnlockedPoolAllocator&)
{
    return *this;
}
258

259
void UnlockedPoolAllocator::clear()
260
{
261
    std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin();
262
    for (; it != d->budgets.end(); ++it)
263
    {
264
        void* ptr = it->second;
265
        ncnn::fastFree(ptr);
266
    }
267
    d->budgets.clear();
268
}
269

270
void UnlockedPoolAllocator::set_size_compare_ratio(float scr)
271
{
272
    if (scr < 0.f || scr > 1.f)
273
    {
274
        NCNN_LOGE("invalid size compare ratio %f", scr);
275
        return;
276
    }
277

278
    d->size_compare_ratio = (unsigned int)(scr * 256);
279
}
280

281
// Set the maximum number of free chunks kept in the pool before fastMalloc
// starts evicting cached chunks back to the OS.
void UnlockedPoolAllocator::set_size_drop_threshold(size_t threshold)
{
    d->size_drop_threshold = threshold;
}
285

286
// Lock-free twin of PoolAllocator::fastMalloc — NOT thread-safe.
// Reuses a cached chunk of size bs when bs >= size and
// (bs * size_compare_ratio) >> 8 <= size; evicts one cached chunk when the
// pool is full (>= size_drop_threshold) and nothing fits; otherwise
// allocates fresh memory via ncnn::fastMalloc.
void* UnlockedPoolAllocator::fastMalloc(size_t size)
{
    // find free budget
    std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin(), it_max = d->budgets.begin(), it_min = d->budgets.begin();
    for (; it != d->budgets.end(); ++it)
    {
        size_t bs = it->first;

        // size_compare_ratio ~ 100%
        if (bs >= size && ((bs * d->size_compare_ratio) >> 8) <= size)
        {
            void* ptr = it->second;

            d->budgets.erase(it);

            d->payouts.push_back(std::make_pair(bs, ptr));

            return ptr;
        }

        // track the smallest and largest cached chunks for possible eviction
        if (bs > it_max->first)
        {
            it_max = it;
        }
        if (bs < it_min->first)
        {
            it_min = it;
        }
    }

    if (d->budgets.size() >= d->size_drop_threshold)
    {
        // pool is full and nothing fits: evict one chunk to bound growth
        if (it_max->first < size)
        {
            // request larger than any cached chunk -> drop the smallest
            ncnn::fastFree(it_min->second);
            d->budgets.erase(it_min);
        }
        else if (it_min->first > size)
        {
            // request smaller than any cached chunk -> drop the largest
            ncnn::fastFree(it_max->second);
            d->budgets.erase(it_max);
        }
    }

    // new
    void* ptr = ncnn::fastMalloc(size);

    d->payouts.push_back(std::make_pair(size, ptr));

    return ptr;
}
337

338
// Return a chunk previously handed out by fastMalloc to the pool — NOT
// thread-safe. The entry moves from payouts back to budgets keeping its
// recorded size. A pointer that was never handed out is a caller bug: it is
// logged and freed directly.
void UnlockedPoolAllocator::fastFree(void* ptr)
{
    // return to budgets
    std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
    for (; it != d->payouts.end(); ++it)
    {
        if (it->second == ptr)
        {
            size_t size = it->first;

            d->payouts.erase(it);

            d->budgets.push_back(std::make_pair(size, ptr));

            return;
        }
    }

    NCNN_LOGE("FATAL ERROR! unlocked pool allocator get wild %p", ptr);
    ncnn::fastFree(ptr);
}
359

360
#if NCNN_VULKAN
361
// Base Vulkan allocator bound to one device. Memory type indices start
// unresolved ((uint32_t)-1) and are discovered lazily on first allocation;
// mappable/coherent describe the memory type eventually selected.
VkAllocator::VkAllocator(const VulkanDevice* _vkdev)
    : vkdev(_vkdev)
{
    buffer_memory_type_index = (uint32_t)-1;
    image_memory_type_index = (uint32_t)-1;
    reserved_type_index = (uint32_t)-1;
    mappable = false;
    coherent = false;
}
370

371
// NOTE: virtual dispatch is disabled inside a destructor, so this calls the
// base-class clear() (a no-op); derived allocators must invoke their own
// clear() from their destructors, as VkBlobAllocator does.
VkAllocator::~VkAllocator()
{
    clear();
}
375

376
// Base implementation holds no resources; derived classes override this to
// release their pooled buffers and memory.
void VkAllocator::clear()
{
}
379

380
// Smallest multiple of `multiple` that is >= n.
static inline size_t round_up(size_t n, size_t multiple)
{
    const size_t remainder = n % multiple;
    return remainder == 0 ? n : n + (multiple - remainder);
}
384

385
// Largest multiple of `multiple` that is <= n.
static inline size_t round_down(size_t n, size_t multiple)
{
    return n - (n % multiple);
}
389

390
// Flush a host-written mapped range so the device observes the writes.
// A no-op on host-coherent memory. Per the Vulkan spec, offset and size of a
// VkMappedMemoryRange must be multiples of nonCoherentAtomSize, so the range
// is widened to those boundaries.
// Returns 0 on success, -1 on failure.
int VkAllocator::flush(VkBufferMemory* ptr)
{
    if (coherent)
        return 0;

    VkMappedMemoryRange mappedMemoryRange;
    mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
    mappedMemoryRange.pNext = 0;
    mappedMemoryRange.memory = ptr->memory;
    // widen [offset, offset + capacity) to atom-size boundaries
    mappedMemoryRange.offset = round_down(ptr->offset, vkdev->info.non_coherent_atom_size());
    mappedMemoryRange.size = round_up(ptr->offset + ptr->capacity, vkdev->info.non_coherent_atom_size()) - mappedMemoryRange.offset;

    VkResult ret = vkFlushMappedMemoryRanges(vkdev->vkdevice(), 1, &mappedMemoryRange);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkFlushMappedMemoryRanges failed %d", ret);
        return -1;
    }

    return 0;
}
411

412
// Invalidate a mapped range so the host observes device writes.
// A no-op on host-coherent memory. As with flush(), the range is widened to
// nonCoherentAtomSize boundaries as the Vulkan spec requires.
// Returns 0 on success, -1 on failure.
int VkAllocator::invalidate(VkBufferMemory* ptr)
{
    if (coherent)
        return 0;

    VkMappedMemoryRange mappedMemoryRange;
    mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
    mappedMemoryRange.pNext = 0;
    mappedMemoryRange.memory = ptr->memory;
    // widen [offset, offset + capacity) to atom-size boundaries
    mappedMemoryRange.offset = round_down(ptr->offset, vkdev->info.non_coherent_atom_size());
    mappedMemoryRange.size = round_up(ptr->offset + ptr->capacity, vkdev->info.non_coherent_atom_size()) - mappedMemoryRange.offset;

    VkResult ret = vkInvalidateMappedMemoryRanges(vkdev->vkdevice(), 1, &mappedMemoryRange);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkInvalidateMappedMemoryRanges failed %d", ret);
        return -1;
    }

    return 0;
}
433

434
// Create an exclusive-sharing-mode VkBuffer of `size` bytes with the given
// usage flags. Memory is bound separately by the caller.
// Returns 0 (VK_NULL_HANDLE) on failure.
VkBuffer VkAllocator::create_buffer(size_t size, VkBufferUsageFlags usage)
{
    VkBufferCreateInfo bufferCreateInfo;
    bufferCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
    bufferCreateInfo.pNext = 0;
    bufferCreateInfo.flags = 0;
    bufferCreateInfo.size = size;
    bufferCreateInfo.usage = usage;
    bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
    // exclusive mode ignores the queue family list
    bufferCreateInfo.queueFamilyIndexCount = 0;
    bufferCreateInfo.pQueueFamilyIndices = 0;

    VkBuffer buffer = 0;
    VkResult ret = vkCreateBuffer(vkdev->vkdevice(), &bufferCreateInfo, 0, &buffer);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateBuffer failed %d", ret);
        return 0;
    }

    return buffer;
}
456

457
// Allocate `size` bytes of raw device memory from the given memory type.
// Returns 0 (VK_NULL_HANDLE) on failure.
VkDeviceMemory VkAllocator::allocate_memory(size_t size, uint32_t memory_type_index)
{
    VkMemoryAllocateInfo memoryAllocateInfo;
    memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    memoryAllocateInfo.pNext = 0;
    memoryAllocateInfo.allocationSize = size;
    memoryAllocateInfo.memoryTypeIndex = memory_type_index;

    VkDeviceMemory memory = 0;
    VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkAllocateMemory failed %d", ret);
        return 0;
    }

    return memory;
}
475

476
// Allocate device memory dedicated to a single image or buffer via the
// VK_KHR_dedicated_allocation pNext chain. Exactly one of image/buffer
// should be a valid handle (the other VK_NULL_HANDLE), matching the
// resource the memory will be bound to.
// Returns 0 (VK_NULL_HANDLE) on failure.
VkDeviceMemory VkAllocator::allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer)
{
    VkMemoryAllocateInfo memoryAllocateInfo;
    memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    memoryAllocateInfo.pNext = 0;
    memoryAllocateInfo.allocationSize = size;
    memoryAllocateInfo.memoryTypeIndex = memory_type_index;

    VkMemoryDedicatedAllocateInfoKHR memoryDedicatedAllocateInfo;
    memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR;
    memoryDedicatedAllocateInfo.pNext = 0;
    memoryDedicatedAllocateInfo.image = image;
    memoryDedicatedAllocateInfo.buffer = buffer;
    // chain the dedicated-allocation info into the allocate info
    memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo;

    VkDeviceMemory memory = 0;
    VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkAllocateMemory failed %d", ret);
        return 0;
    }

    return memory;
}
501

502
// Create a 3D image of the given extent/format with 1 mip level and 1 array
// layer, exclusive sharing, undefined initial layout.
// Returns 0 (VK_NULL_HANDLE) on failure (the failure log includes all
// creation parameters for diagnosis).
VkImage VkAllocator::create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage)
{
    VkImageCreateInfo imageCreateInfo;
    // was terminated with a stray comma operator; use a proper statement
    imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
    imageCreateInfo.pNext = 0;
    imageCreateInfo.flags = 0;
    imageCreateInfo.imageType = VK_IMAGE_TYPE_3D;
    imageCreateInfo.format = format;
    imageCreateInfo.extent.width = width;
    imageCreateInfo.extent.height = height;
    imageCreateInfo.extent.depth = depth;
    imageCreateInfo.mipLevels = 1;
    imageCreateInfo.arrayLayers = 1;
    imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT;
    imageCreateInfo.tiling = tiling;
    imageCreateInfo.usage = usage;
    imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
    // exclusive mode ignores the queue family list
    imageCreateInfo.queueFamilyIndexCount = 0;
    imageCreateInfo.pQueueFamilyIndices = 0;
    imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;

    VkImage image;
    VkResult ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateImage failed %d %d %d %d %d %d %d", ret, width, height, depth, format, tiling, usage);
        return 0;
    }

    return image;
}
533

534
// Create a 3D image view over the whole image (single mip level and layer,
// identity swizzle, color aspect).
// Returns 0 (VK_NULL_HANDLE) on failure.
VkImageView VkAllocator::create_imageview(VkImage image, VkFormat format)
{
    VkImageViewCreateInfo imageViewCreateInfo;
    imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
    imageViewCreateInfo.pNext = 0;
    imageViewCreateInfo.flags = 0;
    imageViewCreateInfo.image = image;
    imageViewCreateInfo.viewType = VK_IMAGE_VIEW_TYPE_3D;
    imageViewCreateInfo.format = format;
    imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    imageViewCreateInfo.subresourceRange.baseMipLevel = 0;
    imageViewCreateInfo.subresourceRange.levelCount = 1;
    imageViewCreateInfo.subresourceRange.baseArrayLayer = 0;
    imageViewCreateInfo.subresourceRange.layerCount = 1;

    VkImageView imageview;
    VkResult ret = vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateImageView failed %d", ret);
        return 0;
    }

    return imageview;
}
563

564
// Least common multiple of a and b, found by stepping through multiples of
// the larger operand until one is divisible by the smaller.
static inline size_t least_common_multiple(size_t a, size_t b)
{
    if (a == b)
        return a;

    const size_t lo = a < b ? a : b;
    const size_t hi = a < b ? b : a;

    size_t lcm = hi;
    while (lcm % lo != 0)
    {
        lcm += hi;
    }

    return lcm;
}
580

581
// Private state of VkBlobAllocator. Buffers and image memory are carved
// from large blocks; each block has a free-list of (offset, size) ranges.
class VkBlobAllocatorPrivate
{
public:
    size_t block_size; // granularity at which new backing blocks are created
    size_t buffer_offset_alignment; // alignment for sub-buffer offsets
    size_t bind_memory_offset_alignment; // alignment for image bind offsets
    // per-block free ranges for buffers: list of (offset, size)
    std::vector<std::list<std::pair<size_t, size_t> > > buffer_budgets;
    std::vector<VkBufferMemory*> buffer_blocks; // backing buffer blocks
    // per-block free ranges for image memory: list of (offset, size)
    std::vector<std::list<std::pair<size_t, size_t> > > image_memory_budgets;
    std::vector<VkDeviceMemory> image_memory_blocks; // backing image memory
};
592

593
// Construct a blob allocator that sub-allocates from blocks of roughly
// preferred_block_size bytes (rounded up to the sub-buffer alignment).
VkBlobAllocator::VkBlobAllocator(const VulkanDevice* _vkdev, size_t preferred_block_size)
    : VkAllocator(_vkdev), d(new VkBlobAllocatorPrivate)
{
    d->buffer_offset_alignment = vkdev->info.buffer_offset_alignment();
    d->bind_memory_offset_alignment = vkdev->info.buffer_image_granularity();

    if (vkdev->info.type() == 1)
    {
        // on integrated gpu, there may be device local only memory too, eg. AMD APU
        // assuming larger alignment always keeps us safe :)

        // least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size
        d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.memory_map_alignment());
        d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.non_coherent_atom_size());
    }

    d->block_size = alignSize(preferred_block_size, d->buffer_offset_alignment);
}
611

612
// Release all backing blocks via our own clear() (the base destructor cannot
// dispatch to it virtually), then drop the private state.
VkBlobAllocator::~VkBlobAllocator()
{
    clear();

    delete d;
}
618

619
// Copying is intentionally disabled; this copy constructor leaves both the
// device and d-pointer null and must never actually be invoked.
VkBlobAllocator::VkBlobAllocator(const VkBlobAllocator&)
    : VkAllocator(0), d(0)
{
}
623

624
// Assignment is intentionally disabled; this no-op exists only to prevent
// accidental copies.
VkBlobAllocator& VkBlobAllocator::operator=(const VkBlobAllocator&)
{
    return *this;
}
628

629
// Release every backing buffer block (unmapping first when host-visible) and
// every image memory block, together with their free-range bookkeeping.
// Sub-allocations handed out by fastMalloc become dangling, so callers must
// have returned them all before this runs.
void VkBlobAllocator::clear()
{
    //     NCNN_LOGE("VkBlobAllocator %lu", buffer_blocks.size());

    for (size_t i = 0; i < d->buffer_blocks.size(); i++)
    {
        VkBufferMemory* ptr = d->buffer_blocks[i];

        //         std::list< std::pair<size_t, size_t> >::iterator it = buffer_budgets[i].begin();
        //         while (it != buffer_budgets[i].end())
        //         {
        //             NCNN_LOGE("VkBlobAllocator budget %p %lu %lu", ptr->buffer, it->first, it->second);
        //             it++;
        //         }

        // mapped blocks must be unmapped before the memory is freed
        if (mappable)
            vkUnmapMemory(vkdev->vkdevice(), ptr->memory);

        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
        vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);

        delete ptr;
    }
    d->buffer_blocks.clear();

    d->buffer_budgets.clear();

    for (size_t i = 0; i < d->image_memory_blocks.size(); i++)
    {
        VkDeviceMemory memory = d->image_memory_blocks[i];

        //         std::list< std::pair<size_t, size_t> >::iterator it = d->image_memory_budgets[i].begin();
        //         while (it != d->image_memory_budgets[i].end())
        //         {
        //             NCNN_LOGE("VkBlobAllocator budget %p %lu %lu", memory, it->first, it->second);
        //             it++;
        //         }

        vkFreeMemory(vkdev->vkdevice(), memory, 0);
    }
    d->image_memory_blocks.clear();

    d->image_memory_budgets.clear();
}
673

674
// Sub-allocate `size` bytes (rounded up to buffer_offset_alignment) from the
// pooled buffer blocks using a first-fit search over each block's free list.
// When no existing free range fits, a new block of at least block_size bytes
// is created; the device memory type is resolved lazily on that first block
// (preferring unified memory on integrated GPUs, device-local on discrete).
// Returns a heap-allocated VkBufferMemory view that fastFree must reclaim.
VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size)
{
    size_t aligned_size = alignSize(size, d->buffer_offset_alignment);

    const int buffer_block_count = d->buffer_blocks.size();

    // find first spare space in buffer_blocks
    for (int i = 0; i < buffer_block_count; i++)
    {
        std::list<std::pair<size_t, size_t> >::iterator it = d->buffer_budgets[i].begin();
        while (it != d->buffer_budgets[i].end())
        {
            size_t budget_size = it->second;
            if (budget_size < aligned_size)
            {
                it++;
                continue;
            }

            // return sub buffer
            VkBufferMemory* ptr = new VkBufferMemory;

            ptr->buffer = d->buffer_blocks[i]->buffer;
            ptr->offset = it->first;
            ptr->memory = d->buffer_blocks[i]->memory;
            ptr->capacity = aligned_size;
            ptr->mapped_ptr = d->buffer_blocks[i]->mapped_ptr;
            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            // adjust buffer_budgets: consume the range entirely or shrink it
            if (budget_size == aligned_size)
            {
                d->buffer_budgets[i].erase(it);
            }
            else
            {
                it->first += aligned_size;
                it->second -= aligned_size;
            }

            //             NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);

            return ptr;
        }
    }

    // oversized requests get a block of their own
    size_t new_block_size = std::max(d->block_size, aligned_size);

    // create new block
    VkBufferMemory* block = new VkBufferMemory;

    block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    block->offset = 0;

    // TODO respect VK_KHR_dedicated_allocation ?

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements);

    // setup memory type and alignment (resolved once, on the first block)
    if (buffer_memory_type_index == (uint32_t)-1)
    {
        if (vkdev->info.type() == 1)
        {
            // integrated gpu, prefer unified memory
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

            // on amd integrated gpu, there is a faster and larger device-only heap
            uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
            const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
            uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
            uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
            if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
            {
                buffer_memory_type_index = device_local_memory_type_index;
            }
        }
        else
        {
            // discrete gpu, device local
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
        }

        mappable = vkdev->is_mappable(buffer_memory_type_index);
        coherent = vkdev->is_coherent(buffer_memory_type_index);
    }

    block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);

    // ignore memoryRequirements.alignment as we always bind at zero offset
    vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);

    block->mapped_ptr = 0;
    if (mappable)
    {
        vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);
    }

    d->buffer_blocks.push_back(block);

    // return sub buffer carved from the head of the new block
    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = block->buffer;
    ptr->offset = 0;
    ptr->memory = block->memory;
    ptr->capacity = aligned_size;
    ptr->mapped_ptr = block->mapped_ptr;
    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    // adjust buffer_budgets: the remainder of the new block is free
    std::list<std::pair<size_t, size_t> > budget;
    if (new_block_size > aligned_size)
    {
        budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size));
    }
    d->buffer_budgets.push_back(budget);

    //     NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);

    return ptr;
}
798

799
// Return a sub-buffer to its owning block's free list.
// The freed range [offset, offset + capacity) is coalesced with an adjacent
// free range on the left and/or right when one exists, to limit
// fragmentation. A pointer whose buffer/memory matches no known block is a
// caller bug: it is logged and only the bookkeeping struct is deleted.
// NOTE(review): the log message says "unlocked VkBlobAllocator" although the
// class is just VkBlobAllocator — likely copy-paste wording; confirm upstream.
void VkBlobAllocator::fastFree(VkBufferMemory* ptr)
{
    //     NCNN_LOGE("VkBlobAllocator F %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);

    const int buffer_block_count = d->buffer_blocks.size();

    // locate the block this sub-buffer was carved from
    int block_index = -1;
    for (int i = 0; i < buffer_block_count; i++)
    {
        if (d->buffer_blocks[i]->buffer == ptr->buffer && d->buffer_blocks[i]->memory == ptr->memory)
        {
            block_index = i;
            break;
        }
    }

    if (block_index == -1)
    {
        NCNN_LOGE("FATAL ERROR! unlocked VkBlobAllocator get wild %p", ptr->buffer);

        delete ptr;

        return;
    }

    // merge: find free ranges directly adjacent to the freed span
    std::list<std::pair<size_t, size_t> >::iterator it_merge_left = d->buffer_budgets[block_index].end();
    std::list<std::pair<size_t, size_t> >::iterator it_merge_right = d->buffer_budgets[block_index].end();
    std::list<std::pair<size_t, size_t> >::iterator it = d->buffer_budgets[block_index].begin();
    for (; it != d->buffer_budgets[block_index].end(); it++)
    {
        if (it->first + it->second == ptr->offset)
        {
            it_merge_left = it;
        }
        else if (ptr->offset + ptr->capacity == it->first)
        {
            it_merge_right = it;
        }
    }

    if (it_merge_left != d->buffer_budgets[block_index].end() && it_merge_right != d->buffer_budgets[block_index].end())
    {
        // freed span bridges two free ranges -> fuse all three into the left
        it_merge_left->second = it_merge_right->first + it_merge_right->second - it_merge_left->first;
        d->buffer_budgets[block_index].erase(it_merge_right);
    }
    else if (it_merge_left != d->buffer_budgets[block_index].end())
    {
        // extend the left neighbor to cover the freed span
        it_merge_left->second = ptr->offset + ptr->capacity - it_merge_left->first;
    }
    else if (it_merge_right != d->buffer_budgets[block_index].end())
    {
        // grow the right neighbor backwards over the freed span
        it_merge_right->second = it_merge_right->first + it_merge_right->second - ptr->offset;
        it_merge_right->first = ptr->offset;
    }
    else
    {
        if (ptr->offset == 0)
        {
            // chain leading block
            d->buffer_budgets[block_index].push_front(std::make_pair(ptr->offset, ptr->capacity));
        }
        else
        {
            d->buffer_budgets[block_index].push_back(std::make_pair(ptr->offset, ptr->capacity));
        }
    }

    delete ptr;
}
869

870
VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int elempack)
871
{
872
    if (elempack != 1 && elempack != 4 && elempack != 8)
873
    {
874
        NCNN_LOGE("elempack must be 1 4 8");
875
        return 0;
876
    }
877

878
    // resolve format
879
    VkFormat format = VK_FORMAT_UNDEFINED;
880

881
    if (elemsize / elempack == 4)
882
    {
883
        // fp32
884
        if (elempack == 1) format = VK_FORMAT_R32_SFLOAT;
885
        if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT;
886
        if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT;
887
    }
888
    if (elemsize / elempack == 2)
889
    {
890
        // fp16
891
        if (elempack == 1) format = VK_FORMAT_R16_SFLOAT;
892
        if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT;
893
        if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT;
894
    }
895

896
    // resolve image width height depth
897
    int width = w;
898
    int height = h;
899
    int depth = c;
900

901
    // large elempack spills on image w
902
    if (elempack == 8) width *= 2;
903

904
    if (width > (int)vkdev->info.max_image_dimension_3d() || height > (int)vkdev->info.max_image_dimension_3d() || depth > (int)vkdev->info.max_image_dimension_3d())
905
    {
906
        NCNN_LOGE("image dimension too large %d %d %d > %d", width, height, depth, (int)vkdev->info.max_image_dimension_3d());
907
        return 0;
908
    }
909

910
    VkImageMemory* ptr = new VkImageMemory;
911

912
    ptr->image = create_image(width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);
913

914
    ptr->width = width;
915
    ptr->height = height;
916
    ptr->depth = depth;
917
    ptr->format = format;
918

919
    // TODO respect VK_KHR_dedicated_allocation ?
920
    VkMemoryRequirements memoryRequirements;
921
    vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements);
922

923
    const size_t size = memoryRequirements.size;
924
    const size_t alignment = std::max((size_t)memoryRequirements.alignment, d->bind_memory_offset_alignment);
925

926
    size_t aligned_size = alignSize(size, alignment);
927

928
    const int image_memory_block_count = d->image_memory_blocks.size();
929

930
    // find first spare space in image_memory_blocks
931
    for (int i = 0; i < image_memory_block_count; i++)
932
    {
933
#if __APPLE__
934
        // HACK moltenvk v1.2.3 is unhappy for image binding with offset  :(
935
        break;
936
#endif
937

938
        std::list<std::pair<size_t, size_t> >::iterator it = d->image_memory_budgets[i].begin();
939
        while (it != d->image_memory_budgets[i].end())
940
        {
941
            // we cannot use it->first directly for base offset alignment
942
            size_t bind_base_offset = it->first;
943
            size_t bind_offset = alignSize(bind_base_offset, alignment);
944
            size_t budget_size = it->second;
945
            if (budget_size < aligned_size + (bind_offset - bind_base_offset))
946
            {
947
                it++;
948
                continue;
949
            }
950

951
            // bind at memory offset
952
            ptr->memory = d->image_memory_blocks[i];
953
            ptr->bind_offset = bind_offset;
954
            ptr->bind_capacity = aligned_size;
955

956
            vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);
957

958
            // do not allow host access to optimal tiling image
959
            ptr->mapped_ptr = 0;
960

961
            ptr->imageview = create_imageview(ptr->image, format);
962

963
            ptr->access_flags = 0;
964
            ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
965
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
966
            ptr->command_refcount = 0;
967

968
            if (bind_base_offset != bind_offset)
969
            {
970
                // NOTE there is small offset inside bind_base_offset and bind_offset
971
                // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory
972
                // so that memory management could be easier
973
                aligned_size += (bind_offset - bind_base_offset);
974

975
                ptr->bind_offset = bind_base_offset;
976
                ptr->bind_capacity = aligned_size;
977
            }
978

979
            // adjust image_memory_budgets
980
            if (budget_size == aligned_size)
981
            {
982
                d->image_memory_budgets[i].erase(it);
983
            }
984
            else
985
            {
986
                it->first += aligned_size;
987
                it->second -= aligned_size;
988
            }
989

990
            //             NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);
991

992
            return ptr;
993
        }
994
    }
995

996
    // setup memory type and alignment
997
    if (image_memory_type_index == (uint32_t)-1)
998
    {
999
        if (vkdev->info.type() == 1)
1000
        {
1001
            // integrated gpu, prefer unified memory
1002
            image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
1003

1004
            // on amd integrated gpu, there is a faster and larger device-only heap
1005
            uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
1006
            const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
1007
            uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
1008
            uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
1009
            if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
1010
            {
1011
                image_memory_type_index = device_local_memory_type_index;
1012
            }
1013
        }
1014
        else
1015
        {
1016
            // discrete gpu, device local
1017
            image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
1018
        }
1019

1020
        mappable = vkdev->is_mappable(image_memory_type_index);
1021
        coherent = vkdev->is_coherent(image_memory_type_index);
1022
    }
1023

1024
    // create new block
1025
    size_t new_block_size = std::max(d->block_size, aligned_size);
1026

1027
#if __APPLE__
1028
    // HACK moltenvk v1.2.3 is unhappy for image binding with offset
1029
    // always ignore block size for smaller memory footprint :(
1030
    new_block_size = aligned_size;
1031
#endif
1032

1033
    // bind at memory offset
1034
    ptr->memory = allocate_memory(new_block_size, image_memory_type_index);
1035
    ptr->bind_offset = 0;
1036
    ptr->bind_capacity = aligned_size;
1037

1038
    // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
1039
    vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);
1040

1041
    // do not allow host access to optimal tiling image
1042
    ptr->mapped_ptr = 0;
1043

1044
    ptr->imageview = create_imageview(ptr->image, format);
1045

1046
    ptr->access_flags = 0;
1047
    ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
1048
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
1049
    ptr->command_refcount = 0;
1050

1051
    // adjust image_memory_budgets
1052
    d->image_memory_blocks.push_back(ptr->memory);
1053

1054
    std::list<std::pair<size_t, size_t> > budget;
1055
    if (new_block_size > aligned_size)
1056
    {
1057
        budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size));
1058
    }
1059
    d->image_memory_budgets.push_back(budget);
1060

1061
    //     NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);
1062

1063
    return ptr;
1064
}
1065

1066
// Return an image allocation to the pool.
// The freed byte range [bind_offset, bind_offset + bind_capacity) is re-inserted
// into the free-span list (image_memory_budgets) of the owning memory block,
// coalescing with an adjacent free span on either side when possible.
void VkBlobAllocator::fastFree(VkImageMemory* ptr)
{
    //     NCNN_LOGE("VkBlobAllocator F %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);

    const int image_memory_block_count = d->image_memory_blocks.size();

    // locate the memory block this image was bound into
    int block_index = -1;
    for (int i = 0; i < image_memory_block_count; i++)
    {
        if (d->image_memory_blocks[i] == ptr->memory)
        {
            block_index = i;
            break;
        }
    }

    if (block_index == -1)
    {
        // the memory does not belong to this allocator (or the pool was already cleared)
        NCNN_LOGE("FATAL ERROR! unlocked VkBlobAllocator get wild %p", ptr->memory);

        // still destroy the vulkan objects unless a command buffer holds a reference
        if (!ptr->command_refcount)
        {
            vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
            vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);

            delete ptr;
        }

        return;
    }

    // merge with neighboring free spans
    // each budget entry is (offset, size); find the span that ends exactly at our
    // start (it_merge_left) and the span that begins exactly at our end (it_merge_right)
    std::list<std::pair<size_t, size_t> >::iterator it_merge_left = d->image_memory_budgets[block_index].end();
    std::list<std::pair<size_t, size_t> >::iterator it_merge_right = d->image_memory_budgets[block_index].end();
    std::list<std::pair<size_t, size_t> >::iterator it = d->image_memory_budgets[block_index].begin();
    for (; it != d->image_memory_budgets[block_index].end(); it++)
    {
        if (it->first + it->second == ptr->bind_offset)
        {
            it_merge_left = it;
        }
        else if (ptr->bind_offset + ptr->bind_capacity == it->first)
        {
            it_merge_right = it;
        }
    }

    if (it_merge_left != d->image_memory_budgets[block_index].end() && it_merge_right != d->image_memory_budgets[block_index].end())
    {
        // freed range bridges two free spans -> fuse all three into the left span
        it_merge_left->second = it_merge_right->first + it_merge_right->second - it_merge_left->first;
        d->image_memory_budgets[block_index].erase(it_merge_right);
    }
    else if (it_merge_left != d->image_memory_budgets[block_index].end())
    {
        // extend the left span forward to cover the freed range
        it_merge_left->second = ptr->bind_offset + ptr->bind_capacity - it_merge_left->first;
    }
    else if (it_merge_right != d->image_memory_budgets[block_index].end())
    {
        // grow the right span backwards to cover the freed range
        it_merge_right->second = it_merge_right->first + it_merge_right->second - ptr->bind_offset;
        it_merge_right->first = ptr->bind_offset;
    }
    else
    {
        // no adjacent span - insert a standalone one
        if (ptr->bind_offset == 0)
        {
            // chain leading block
            d->image_memory_budgets[block_index].push_front(std::make_pair(ptr->bind_offset, ptr->bind_capacity));
        }
        else
        {
            d->image_memory_budgets[block_index].push_back(std::make_pair(ptr->bind_offset, ptr->bind_capacity));
        }
    }

    // destroy the per-allocation vulkan objects; the backing VkDeviceMemory stays pooled
    // NOTE(review): when command_refcount is non-zero a command buffer presumably still
    // references the image and destruction is deferred elsewhere - confirm with callers
    if (!ptr->command_refcount)
    {
        vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
        vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);

        delete ptr;
    }
}

// Private state for VkWeightAllocator (pimpl).
class VkWeightAllocatorPrivate
{
public:
    // target size of each newly created buffer / image memory block
    size_t block_size;
    // alignment applied to sub-buffer offsets inside a block
    size_t buffer_offset_alignment;
    // alignment for binding images into shared VkDeviceMemory (buffer image granularity)
    size_t bind_memory_offset_alignment;
    // remaining free bytes at the tail of each buffer_blocks entry (parallel array)
    std::vector<size_t> buffer_block_free_spaces;
    // shared buffer blocks that weight sub-buffers are carved from
    std::vector<VkBufferMemory*> buffer_blocks;
    // buffers for which the driver required/preferred a dedicated allocation
    std::vector<VkBufferMemory*> dedicated_buffer_blocks;
    // remaining free bytes at the tail of each image_memory_blocks entry (parallel array)
    std::vector<size_t> image_memory_block_free_spaces;
    // shared device memory blocks that images are bound into at offsets
    std::vector<VkDeviceMemory> image_memory_blocks;
    // image memories for which the driver required/preferred a dedicated allocation
    std::vector<VkDeviceMemory> dedicated_image_memory_blocks;
};

// Construct a weight allocator for the given device.
// preferred_block_size is the granularity at which backing blocks are created;
// it is rounded up to the resolved buffer offset alignment.
VkWeightAllocator::VkWeightAllocator(const VulkanDevice* _vkdev, size_t preferred_block_size)
    : VkAllocator(_vkdev), d(new VkWeightAllocatorPrivate)
{
    d->buffer_offset_alignment = vkdev->info.buffer_offset_alignment();
    d->bind_memory_offset_alignment = vkdev->info.buffer_image_granularity();

    if (vkdev->info.type() == 1)
    {
        // on integrated gpu, there may be device local only memory too, eg. AMD APU
        // assuming larger alignment always keeps us safe :)

        // least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size
        d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.memory_map_alignment());
        d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.non_coherent_atom_size());
    }

    // every block starts offset-aligned so sub-allocation arithmetic stays simple
    d->block_size = alignSize(preferred_block_size, d->buffer_offset_alignment);
}

VkWeightAllocator::~VkWeightAllocator()
{
    // release all pooled and dedicated blocks before tearing down the private data
    clear();

    delete d;
}

// Copying an allocator is not supported; this stub exists only to disable it
// (presumably declared private in the header - pre-C++11 non-copyable idiom).
VkWeightAllocator::VkWeightAllocator(const VkWeightAllocator&)
    : VkAllocator(0), d(0)
{
}

// No-op stub: copy assignment is intentionally disabled, matching the copy constructor.
VkWeightAllocator& VkWeightAllocator::operator=(const VkWeightAllocator&)
{
    return *this;
}

// Destroy every buffer and image memory block owned by this allocator.
// All weight sub-buffers / images allocated from it become invalid afterwards.
void VkWeightAllocator::clear()
{
    //     NCNN_LOGE("VkWeightAllocator %lu %lu", d->buffer_blocks.size(), d->dedicated_buffer_blocks.size());

    d->buffer_block_free_spaces.clear();

    for (size_t i = 0; i < d->buffer_blocks.size(); i++)
    {
        VkBufferMemory* ptr = d->buffer_blocks[i];

        // pooled blocks are persistently mapped when the memory type is host-visible
        if (mappable)
            vkUnmapMemory(vkdev->vkdevice(), ptr->memory);

        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
        vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);

        delete ptr;
    }
    d->buffer_blocks.clear();

    for (size_t i = 0; i < d->dedicated_buffer_blocks.size(); i++)
    {
        VkBufferMemory* ptr = d->dedicated_buffer_blocks[i];

        if (mappable)
            vkUnmapMemory(vkdev->vkdevice(), ptr->memory);

        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
        vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);

        delete ptr;
    }
    d->dedicated_buffer_blocks.clear();

    d->image_memory_block_free_spaces.clear();

    // only the raw VkDeviceMemory is pooled here; image objects themselves are
    // destroyed in fastFree
    for (size_t i = 0; i < d->image_memory_blocks.size(); i++)
    {
        VkDeviceMemory memory = d->image_memory_blocks[i];

        vkFreeMemory(vkdev->vkdevice(), memory, 0);
    }
    d->image_memory_blocks.clear();

    for (size_t i = 0; i < d->dedicated_image_memory_blocks.size(); i++)
    {
        VkDeviceMemory memory = d->dedicated_image_memory_blocks[i];

        vkFreeMemory(vkdev->vkdevice(), memory, 0);
    }
    d->dedicated_image_memory_blocks.clear();
}

// Sub-allocate a weight buffer of the given byte size.
// Weights are written once and never freed individually, so allocation is a simple
// bump pointer into the first pooled block with enough tail space - there is no
// free list. Returns a new VkBufferMemory wrapper owned by the caller.
VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
{
    //     NCNN_LOGE("VkWeightAllocator fastMalloc %lu", size);

    size_t aligned_size = alignSize(size, d->buffer_offset_alignment);

    const int buffer_block_count = d->buffer_blocks.size();

    // find first spare space in buffer_blocks
    for (int i = 0; i < buffer_block_count; i++)
    {
        size_t free_size = d->buffer_block_free_spaces[i];
        if (free_size >= aligned_size)
        {
            // pooled blocks are exactly block_size bytes (oversized blocks are recorded
            // with zero free space below and never match here), so the consumed prefix
            // is block_size - free_size
            size_t block_offset = d->block_size - free_size;

            // return sub buffer
            VkBufferMemory* ptr = new VkBufferMemory;

            ptr->buffer = d->buffer_blocks[i]->buffer;
            ptr->offset = block_offset;
            ptr->memory = d->buffer_blocks[i]->memory;
            ptr->capacity = aligned_size;
            ptr->mapped_ptr = d->buffer_blocks[i]->mapped_ptr;
            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            d->buffer_block_free_spaces[i] -= aligned_size;

            return ptr;
        }
    }

    size_t new_block_size = std::max(d->block_size, aligned_size);

    // create new block
    VkBufferMemory* block = new VkBufferMemory;

    block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    block->offset = 0;

    if (vkdev->info.support_VK_KHR_get_memory_requirements2() && vkdev->info.support_VK_KHR_dedicated_allocation())
    {
        // ask the driver whether this buffer requires / prefers a dedicated allocation
        VkBufferMemoryRequirementsInfo2KHR bufferMemoryRequirementsInfo2;
        bufferMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR;
        bufferMemoryRequirementsInfo2.pNext = 0;
        bufferMemoryRequirementsInfo2.buffer = block->buffer;

        VkMemoryRequirements2KHR memoryRequirements2;
        memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR;
        memoryRequirements2.pNext = 0;

        // chain VkMemoryDedicatedRequirementsKHR so the query also fills the
        // dedicated-allocation flags
        VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements;
        memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR;
        memoryDedicatedRequirements.pNext = 0;
        memoryRequirements2.pNext = &memoryDedicatedRequirements;

        vkdev->vkGetBufferMemoryRequirements2KHR(vkdev->vkdevice(), &bufferMemoryRequirementsInfo2, &memoryRequirements2);

        bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation;

        if (dedicatedAllocation)
        {
            // setup memory type and alignment (resolved once, then cached)
            if (buffer_memory_type_index == (uint32_t)-1)
            {
                if (vkdev->info.type() == 1)
                {
                    // integrated gpu, prefer unified memory
                    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

                    // on amd integrated gpu, there is a faster and larger device-only heap
                    uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                    const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
                    uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
                    uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
                    if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
                    {
                        buffer_memory_type_index = device_local_memory_type_index;
                    }
                }
                else
                {
                    // discrete gpu, device local
                    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                }

                mappable = vkdev->is_mappable(buffer_memory_type_index);
                coherent = vkdev->is_coherent(buffer_memory_type_index);
            }

            block->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, buffer_memory_type_index, 0, block->buffer);

            // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
            vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);

            block->mapped_ptr = 0;
            if (mappable)
            {
                vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);
            }

            d->dedicated_buffer_blocks.push_back(block);

            // return sub buffer spanning the whole dedicated block
            VkBufferMemory* ptr = new VkBufferMemory;

            ptr->buffer = block->buffer;
            ptr->offset = 0;
            ptr->memory = block->memory;
            ptr->capacity = new_block_size;
            ptr->mapped_ptr = block->mapped_ptr;
            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            return ptr;
        }
    }

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements);

    // setup memory type and alignment (resolved once, then cached)
    if (buffer_memory_type_index == (uint32_t)-1)
    {
        if (vkdev->info.type() == 1)
        {
            // integrated gpu, prefer unified memory
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

            // on amd integrated gpu, there is a faster and larger device-only heap
            uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
            const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
            uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
            uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
            if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
            {
                buffer_memory_type_index = device_local_memory_type_index;
            }
        }
        else
        {
            // discrete gpu, device local
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
        }

        mappable = vkdev->is_mappable(buffer_memory_type_index);
        coherent = vkdev->is_coherent(buffer_memory_type_index);
    }

    block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);

    // ignore memoryRequirements.alignment as we always bind at zero offset
    vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);

    //     NCNN_LOGE("VkWeightAllocator M %p", block->buffer);

    block->mapped_ptr = 0;
    if (mappable)
    {
        vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);
    }

    d->buffer_blocks.push_back(block);

    // oversized blocks (new_block_size == aligned_size > block_size) record zero
    // free space so the reuse loop above never picks them
    d->buffer_block_free_spaces.push_back(new_block_size - aligned_size);

    // return sub buffer at offset 0 of the fresh block
    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = block->buffer;
    ptr->offset = 0;
    ptr->memory = block->memory;
    ptr->capacity = aligned_size;
    ptr->mapped_ptr = block->mapped_ptr;
    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    return ptr;
}

// Free a weight sub-buffer wrapper.
// Only the wrapper is destroyed; the underlying block memory stays pooled and is
// reclaimed in clear().
void VkWeightAllocator::fastFree(VkBufferMemory* ptr)
{
    //     NCNN_LOGE("VkWeightAllocator F %p", ptr->buffer);

    delete ptr;
}

// Allocate a 3D optimal-tiling image for weight storage.
// w/h/c are the logical mat dimensions, elemsize the per-element byte size and
// elempack the packing factor. Elempack > 4 is emulated by widening the image
// (4 texels per logical element per factor of 8/16/32/64).
// Returns 0 on unsupported elempack or when a dimension exceeds device limits.
VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int elempack)
{
    if (elempack != 1 && elempack != 4 && elempack != 8 && elempack != 16 && elempack != 32 && elempack != 64)
    {
        NCNN_LOGE("elempack must be 1 4 8 16 32 64");
        return 0;
    }

    // resolve format from the per-scalar byte size
    // NOTE(review): when elemsize / elempack is neither 4 nor 2, format stays
    // VK_FORMAT_UNDEFINED - presumably callers never pass such sizes; confirm
    VkFormat format = VK_FORMAT_UNDEFINED;

    if (elemsize / elempack == 4)
    {
        // fp32
        if (elempack == 1) format = VK_FORMAT_R32_SFLOAT;
        if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 16) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 32) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 64) format = VK_FORMAT_R32G32B32A32_SFLOAT;
    }
    if (elemsize / elempack == 2)
    {
        // fp16
        if (elempack == 1) format = VK_FORMAT_R16_SFLOAT;
        if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 16) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 32) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 64) format = VK_FORMAT_R16G16B16A16_SFLOAT;
    }

    // resolve image width height depth
    int width = w;
    int height = h;
    int depth = c;

    // large elempack spills on image w
    if (elempack == 8) width *= 2;
    if (elempack == 16) width *= 4;
    if (elempack == 32) width *= 8;
    if (elempack == 64) width *= 16;

    if (width > (int)vkdev->info.max_image_dimension_3d() || height > (int)vkdev->info.max_image_dimension_3d() || depth > (int)vkdev->info.max_image_dimension_3d())
    {
        NCNN_LOGE("image dimension too large %d %d %d > %d", width, height, depth, (int)vkdev->info.max_image_dimension_3d());
        return 0;
    }

    VkImageMemory* ptr = new VkImageMemory;

    // weights are sampled and uploaded only, never written by shaders
    ptr->image = create_image(width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);

    ptr->width = width;
    ptr->height = height;
    ptr->depth = depth;
    ptr->format = format;

    if (vkdev->info.support_VK_KHR_get_memory_requirements2() && vkdev->info.support_VK_KHR_dedicated_allocation())
    {
        // ask the driver whether this image requires / prefers a dedicated allocation
        VkImageMemoryRequirementsInfo2KHR imageMemoryRequirementsInfo2;
        imageMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR;
        imageMemoryRequirementsInfo2.pNext = 0;
        imageMemoryRequirementsInfo2.image = ptr->image;

        VkMemoryRequirements2KHR memoryRequirements2;
        memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR;
        memoryRequirements2.pNext = 0;

        // chain VkMemoryDedicatedRequirementsKHR so the query also fills the
        // dedicated-allocation flags
        VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements;
        memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR;
        memoryDedicatedRequirements.pNext = 0;
        memoryRequirements2.pNext = &memoryDedicatedRequirements;

        vkdev->vkGetImageMemoryRequirements2KHR(vkdev->vkdevice(), &imageMemoryRequirementsInfo2, &memoryRequirements2);

        bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation;

        if (dedicatedAllocation)
        {
            // setup memory type and alignment (resolved once, then cached)
            if (image_memory_type_index == (uint32_t)-1)
            {
                if (vkdev->info.type() == 1)
                {
                    // integrated gpu, prefer unified memory
                    image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

                    // on amd integrated gpu, there is a faster and larger device-only heap
                    uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                    const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
                    uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
                    uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
                    if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
                    {
                        image_memory_type_index = device_local_memory_type_index;
                    }
                }
                else
                {
                    // discrete gpu, device local
                    image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                }

                mappable = vkdev->is_mappable(image_memory_type_index);
                coherent = vkdev->is_coherent(image_memory_type_index);
            }

            // bind memory
            ptr->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, image_memory_type_index, ptr->image, 0);
            ptr->bind_offset = 0;
            ptr->bind_capacity = memoryRequirements2.memoryRequirements.size;

            // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
            vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

            // do not allow host access to optimal tiling image
            ptr->mapped_ptr = 0;

            ptr->imageview = create_imageview(ptr->image, format);

            ptr->access_flags = 0;
            ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
            ptr->command_refcount = 0;

            d->dedicated_image_memory_blocks.push_back(ptr->memory);

            return ptr;
        }
    }

    VkMemoryRequirements memoryRequirements;
    vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements);

    const size_t size = memoryRequirements.size;
    // image binds must also respect bufferImageGranularity between mixed resources
    const size_t alignment = std::max((size_t)memoryRequirements.alignment, d->bind_memory_offset_alignment);

    size_t aligned_size = alignSize(size, alignment);

    const int image_memory_block_count = d->image_memory_blocks.size();

    // find first spare space in image_memory_blocks
    for (int i = 0; i < image_memory_block_count; i++)
    {
        // we cannot use image_memory_block_free_spaces[i] directly for base offset alignment
        size_t bind_base_offset = d->block_size - d->image_memory_block_free_spaces[i];
        size_t bind_offset = alignSize(bind_base_offset, alignment);
        if (d->image_memory_block_free_spaces[i] >= aligned_size + (bind_offset - bind_base_offset))
        {
            // bind at memory offset
            ptr->memory = d->image_memory_blocks[i];
            ptr->bind_offset = bind_offset;
            ptr->bind_capacity = aligned_size;

            vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

            // do not allow host access to optimal tiling image
            ptr->mapped_ptr = 0;

            ptr->imageview = create_imageview(ptr->image, format);

            ptr->access_flags = 0;
            ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
            ptr->command_refcount = 0;

            if (bind_base_offset != bind_offset)
            {
                // NOTE there is small offset inside bind_base_offset and bind_offset
                // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory
                // so that memory management could be easier
                aligned_size += (bind_offset - bind_base_offset);

                ptr->bind_offset = bind_base_offset;
                ptr->bind_capacity = aligned_size;
            }

            d->image_memory_block_free_spaces[i] -= aligned_size;

            return ptr;
        }
    }

    // setup memory type and alignment (resolved once, then cached)
    if (image_memory_type_index == (uint32_t)-1)
    {
        if (vkdev->info.type() == 1)
        {
            // integrated gpu, prefer unified memory
            image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

            // on amd integrated gpu, there is a faster and larger device-only heap
            uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
            const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
            uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
            uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
            if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
            {
                image_memory_type_index = device_local_memory_type_index;
            }
        }
        else
        {
            // discrete gpu, device local
            image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
        }

        mappable = vkdev->is_mappable(image_memory_type_index);
        coherent = vkdev->is_coherent(image_memory_type_index);
    }

    // create new block
    size_t new_block_size = std::max(d->block_size, aligned_size);

    // bind at memory offset
    ptr->memory = allocate_memory(new_block_size, image_memory_type_index);
    ptr->bind_offset = 0;
    ptr->bind_capacity = aligned_size;

    // ignore memoryRequirements.alignment as we always bind at zero offset
    vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

    // do not allow host access to optimal tiling image
    ptr->mapped_ptr = 0;

    ptr->imageview = create_imageview(ptr->image, format);

    ptr->access_flags = 0;
    ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
    ptr->command_refcount = 0;

    // oversized blocks record zero free space so the reuse loop never picks them
    d->image_memory_blocks.push_back(ptr->memory);
    d->image_memory_block_free_spaces.push_back(new_block_size - aligned_size);

    return ptr;
}

void VkWeightAllocator::fastFree(VkImageMemory* ptr)
{
    //     NCNN_LOGE("VkWeightAllocator F %p", ptr->memory);

    // skip destruction while a recorded command buffer still references this
    // image; presumably it is released later when the refcount drops — the
    // release path is outside this function
    if (ptr->command_refcount)
        return;

    vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
    vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);

    delete ptr;
}
1691

1692
// Private state for VkStagingAllocator: a free-list of released staging
// buffers that fastMalloc() tries to recycle before allocating a new one.
class VkStagingAllocatorPrivate
{
public:
    // reuse threshold in fixed point with 8 fractional bits; a pooled buffer
    // is reused when requested size is within [capacity * ratio / 256, capacity]
    unsigned int size_compare_ratio; // 0~256
    // buffers returned by fastFree(), waiting to be reused
    std::list<VkBufferMemory*> buffer_budgets;
};
1698

1699
// Staging memory is host-visible and host-coherent, hence always mappable.
VkStagingAllocator::VkStagingAllocator(const VulkanDevice* _vkdev)
    : VkAllocator(_vkdev), d(new VkStagingAllocatorPrivate)
{
    // reuse a pooled buffer only when the request fills at least 75% of it
    d->size_compare_ratio = 192; // 0.75f * 256

    mappable = true;
    coherent = true;
}
1707

1708
VkStagingAllocator::~VkStagingAllocator()
{
    // destroy every pooled staging buffer before dropping the private state
    clear();

    delete d;
}
1714

1715
// Copy construction is disabled; this private stub leaves the copy unusable.
VkStagingAllocator::VkStagingAllocator(const VkStagingAllocator&)
    : VkAllocator(0), d(0)
{
}
1719

1720
// Assignment is disabled; the private stub is a deliberate no-op.
VkStagingAllocator& VkStagingAllocator::operator=(const VkStagingAllocator&)
{
    return *this;
}
1724

1725
void VkStagingAllocator::set_size_compare_ratio(float scr)
1726
{
1727
    if (scr < 0.f || scr > 1.f)
1728
    {
1729
        NCNN_LOGE("invalid size compare ratio %f", scr);
1730
        return;
1731
    }
1732

1733
    d->size_compare_ratio = (unsigned int)(scr * 256);
1734
}
1735

1736
void VkStagingAllocator::clear()
1737
{
1738
    //     NCNN_LOGE("VkStagingAllocator %lu", buffer_budgets.size());
1739

1740
    for (std::list<VkBufferMemory*>::iterator it = d->buffer_budgets.begin(); it != d->buffer_budgets.end(); it++)
1741
    {
1742
        VkBufferMemory* ptr = *it;
1743

1744
        //         NCNN_LOGE("VkStagingAllocator F %p", ptr->buffer);
1745

1746
        vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
1747
        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
1748
        vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
1749

1750
        delete ptr;
1751
    }
1752
    d->buffer_budgets.clear();
1753
}
1754

1755
// Return a host-mapped staging buffer of at least `size` bytes, recycling a
// pooled buffer when one fits closely enough, otherwise creating a new one.
VkBufferMemory* VkStagingAllocator::fastMalloc(size_t size)
{
    // scan the pool: accept a buffer when size <= capacity and
    // capacity * size_compare_ratio / 256 <= size (not wastefully large)
    for (std::list<VkBufferMemory*>::iterator it = d->buffer_budgets.begin(); it != d->buffer_budgets.end(); ++it)
    {
        VkBufferMemory* bm = *it;

        const size_t capacity = bm->capacity;

        const bool big_enough = (capacity >= size);
        const bool close_enough = (((capacity * d->size_compare_ratio) >> 8) <= size);
        if (big_enough && close_enough)
        {
            d->buffer_budgets.erase(it);

            //             NCNN_LOGE("VkStagingAllocator M %p %lu reused %lu", bm->buffer, size, capacity);

            return bm;
        }
    }

    // nothing reusable, create a fresh buffer
    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    ptr->offset = 0;

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements);

    // lazily resolve a host-visible + coherent memory type,
    // preferring host-cached and avoiding device-local
    if (buffer_memory_type_index == (uint32_t)-1)
    {
        buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
    }

    ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);

    // ignore memoryRequirements.alignment as we always bind at zero offset
    vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0);

    ptr->capacity = size;

    // persistently mapped until the buffer is destroyed in clear()
    vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);

    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    //     NCNN_LOGE("VkStagingAllocator M %p %lu", ptr->buffer, size);

    return ptr;
}
1806

1807
void VkStagingAllocator::fastFree(VkBufferMemory* ptr)
{
    //     NCNN_LOGE("VkStagingAllocator F %p", ptr->buffer);

    // never destroyed here — keep the buffer pooled for later reuse;
    // actual destruction happens in clear()
    d->buffer_budgets.push_back(ptr);
}
1814

1815
// Allocate a "fake image" backed by plain host memory.
// staging image is mainly used for storing small piece of dynamic parameters
// we allocate host memory as a fake image, it's simple and good
// Returns 0 when the host allocation fails.
VkImageMemory* VkStagingAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int /* elempack */)
{
    // promote to size_t before multiplying: w * h * c * elemsize would
    // otherwise multiply w * h * c in int and can overflow for large extents
    const size_t size = (size_t)w * h * c * elemsize;

    void* mapped_ptr = malloc(size);
    if (!mapped_ptr)
    {
        NCNN_LOGE("fake image malloc %lu failed", (unsigned long)size);
        return 0;
    }

    VkImageMemory* ptr = new VkImageMemory;

    // no real VkImage / VkDeviceMemory behind this allocation
    ptr->image = 0;
    ptr->width = w;
    ptr->height = h;
    ptr->depth = c;
    ptr->format = VK_FORMAT_UNDEFINED;
    ptr->memory = 0;
    ptr->bind_offset = 0;
    ptr->bind_capacity = size;

    ptr->mapped_ptr = mapped_ptr;

    ptr->imageview = 0;

    ptr->access_flags = 0;
    ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    // host writes are the only producer for this memory
    ptr->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;
    ptr->command_refcount = 0;

    //     NCNN_LOGE("VkStagingAllocator M %p %d %d %d %d %d", ptr->image, dims, width, height, depth, format);

    return ptr;
}
1846

1847
void VkStagingAllocator::fastFree(VkImageMemory* ptr)
{
    //     NCNN_LOGE("VkStagingAllocator F %p", ptr->image);

    // fake images own only the host memory allocated in fastMalloc()
    free(ptr->mapped_ptr);

    delete ptr;
}
1855

1856
// Private state for VkWeightStagingAllocator.
// Currently empty — kept as a pimpl placeholder so state can be added later
// without changing the public class layout.
class VkWeightStagingAllocatorPrivate
{
public:
};
1860

1861
// Host-visible, host-coherent allocator used to upload weight data.
VkWeightStagingAllocator::VkWeightStagingAllocator(const VulkanDevice* _vkdev)
    : VkAllocator(_vkdev), d(new VkWeightStagingAllocatorPrivate)
{
    coherent = true;
    mappable = true;
}
1867

1868
VkWeightStagingAllocator::~VkWeightStagingAllocator()
{
    // no pooled resources to release, just drop the private state
    delete d;
}
1872

1873
// Copy construction is disabled; this private stub leaves the copy unusable.
VkWeightStagingAllocator::VkWeightStagingAllocator(const VkWeightStagingAllocator&)
    : VkAllocator(0), d(0)
{
}
1877

1878
// Assignment is disabled; the private stub is a deliberate no-op.
VkWeightStagingAllocator& VkWeightStagingAllocator::operator=(const VkWeightStagingAllocator&)
{
    return *this;
}
1882

1883
// Create a host-mapped transfer buffer for uploading weights.
// Unlike VkStagingAllocator there is no pooling — every call allocates fresh.
VkBufferMemory* VkWeightStagingAllocator::fastMalloc(size_t size)
{
    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    ptr->offset = 0;

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements);

    // lazily resolve a host-visible + coherent memory type,
    // preferring host-cached and avoiding device-local
    if (buffer_memory_type_index == (uint32_t)-1)
    {
        buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
    }

    ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);

    // ignore memoryRequirements.alignment as we always bind at zero offset
    vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0);

    ptr->capacity = size;

    // persistently mapped until fastFree()
    vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);

    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    //     NCNN_LOGE("VkWeightStagingAllocator M %p %lu", ptr->buffer, size);

    return ptr;
}
1915

1916
// Destroy the buffer immediately — weight staging buffers are never pooled.
void VkWeightStagingAllocator::fastFree(VkBufferMemory* ptr)
{
    //     NCNN_LOGE("VkWeightStagingAllocator F %p", ptr->buffer);

    // unmap first, then destroy the buffer and release its memory
    vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
    vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
    vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);

    delete ptr;
}
1926

1927
// Image staging is unsupported by this buffer-only allocator.
VkImageMemory* VkWeightStagingAllocator::fastMalloc(int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/)
{
    return 0;
}
1931

1932
// Image staging is unsupported by this allocator; nothing to release.
void VkWeightStagingAllocator::fastFree(VkImageMemory* /*ptr*/)
{
}
1935

1936
#if NCNN_PLATFORM_API
1937
#if __ANDROID_API__ >= 26
1938
// Wraps an existing AHardwareBuffer so it can be sampled as a Vulkan image.
// NOTE(review): init() can fail (returns -1) but the return value is ignored
// here; on failure samplerYcbcrConversion stays 0 — confirm callers tolerate
// a partially-initialized allocator.
VkAndroidHardwareBufferImageAllocator::VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb)
    : VkAllocator(_vkdev), hb(_hb)
{
    samplerYcbcrConversion = 0;

    init();
}
1945

1946
VkAndroidHardwareBufferImageAllocator::~VkAndroidHardwareBufferImageAllocator()
{
    // destroy the ycbcr conversion only if init() managed to create one
    if (!samplerYcbcrConversion)
        return;

    vkdev->vkDestroySamplerYcbcrConversionKHR(vkdev->vkdevice(), samplerYcbcrConversion, 0);
    samplerYcbcrConversion = 0;
}
1954

1955
// Private copy constructor: copying is intentionally disabled.
// NOTE(review): unlike the other allocators' stubs, this one does not zero
// its members (hb and samplerYcbcrConversion remain uninitialized) — harmless
// while the constructor is unreachable, but worth confirming.
VkAndroidHardwareBufferImageAllocator::VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&)
    : VkAllocator(0)
{
}
1959

1960
// Assignment is disabled; the private stub is a deliberate no-op.
VkAndroidHardwareBufferImageAllocator& VkAndroidHardwareBufferImageAllocator::operator=(const VkAndroidHardwareBufferImageAllocator&)
{
    return *this;
}
1964

1965
// Buffer allocations are unsupported by this image-only allocator.
VkBufferMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(size_t /*size*/)
{
    return 0;
}
1969

1970
// Buffer allocations are unsupported by this image-only allocator; no-op.
void VkAndroidHardwareBufferImageAllocator::fastFree(VkBufferMemory* /*ptr*/)
{
}
1973

1974
// Import the wrapped AHardwareBuffer as a dedicated-allocation VkImage and
// create an imageview bound to the sampler ycbcr conversion from init().
// The w/h/c/elemsize/elempack arguments are ignored: extent and format come
// from the hardware buffer itself. Returns 0 on any Vulkan failure.
VkImageMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/)
{
    VkResult ret;

    // external format resolved by init() from the hardware buffer
    VkExternalFormatANDROID externalFormat;
    externalFormat.sType = VK_STRUCTURE_TYPE_EXTERNAL_FORMAT_ANDROID;
    externalFormat.pNext = 0;
    externalFormat.externalFormat = bufferFormatProperties.externalFormat;

    VkExternalMemoryImageCreateInfo externalMemoryImageCreateInfo;
    externalMemoryImageCreateInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO;
    externalMemoryImageCreateInfo.pNext = &externalFormat;
    externalMemoryImageCreateInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID;

    // VK_FORMAT_UNDEFINED: the external format in the pNext chain applies
    VkImageCreateInfo imageCreateInfo;
    imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
    imageCreateInfo.pNext = &externalMemoryImageCreateInfo;
    imageCreateInfo.flags = 0;
    imageCreateInfo.imageType = VK_IMAGE_TYPE_2D;
    imageCreateInfo.format = VK_FORMAT_UNDEFINED;
    imageCreateInfo.extent.width = bufferDesc.width;
    imageCreateInfo.extent.height = bufferDesc.height;
    imageCreateInfo.extent.depth = 1;
    imageCreateInfo.mipLevels = 1;
    imageCreateInfo.arrayLayers = 1;
    imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT;
    imageCreateInfo.tiling = VK_IMAGE_TILING_OPTIMAL;
    imageCreateInfo.usage = VK_IMAGE_USAGE_SAMPLED_BIT;
    imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
    imageCreateInfo.queueFamilyIndexCount = 0;
    imageCreateInfo.pQueueFamilyIndices = 0;
    imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;

    VkImage image = 0;
    ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateImage failed %d", ret);
        return 0;
    }

    // setup memory type
    if (image_memory_type_index == (uint32_t)-1)
    {
        image_memory_type_index = vkdev->find_memory_index(bufferProperties.memoryTypeBits, 0, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
    }

    // import the AHardwareBuffer as a dedicated allocation for this image
    VkImportAndroidHardwareBufferInfoANDROID importAndroidHardwareBufferInfo;
    importAndroidHardwareBufferInfo.sType = VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID;
    importAndroidHardwareBufferInfo.pNext = 0;
    importAndroidHardwareBufferInfo.buffer = hb;

    VkMemoryDedicatedAllocateInfo memoryDedicatedAllocateInfo;
    memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO;
    memoryDedicatedAllocateInfo.pNext = &importAndroidHardwareBufferInfo;
    memoryDedicatedAllocateInfo.image = image;
    memoryDedicatedAllocateInfo.buffer = VK_NULL_HANDLE;

    VkMemoryAllocateInfo memoryAllocateInfo;
    memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo;
    memoryAllocateInfo.allocationSize = bufferProperties.allocationSize;
    memoryAllocateInfo.memoryTypeIndex = image_memory_type_index;

    VkDeviceMemory memory = 0;
    ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkAllocateMemory failed %d", ret);
        // fix: destroy the image created above instead of leaking it
        vkDestroyImage(vkdev->vkdevice(), image, 0);
        return 0;
    }

    VkBindImageMemoryInfo bindImageMemoryInfo;
    bindImageMemoryInfo.sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO;
    bindImageMemoryInfo.pNext = 0;
    bindImageMemoryInfo.image = image;
    bindImageMemoryInfo.memory = memory;
    bindImageMemoryInfo.memoryOffset = 0;
    ret = vkdev->vkBindImageMemory2KHR(vkdev->vkdevice(), 1, &bindImageMemoryInfo);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkBindImageMemory2KHR failed %d", ret);
        vkDestroyImage(vkdev->vkdevice(), image, 0);
        // fix: also release the imported memory instead of leaking it
        vkFreeMemory(vkdev->vkdevice(), memory, 0);
        return 0;
    }

    VkSamplerYcbcrConversionInfoKHR samplerYcbcrConversionInfo;
    samplerYcbcrConversionInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO_KHR;
    samplerYcbcrConversionInfo.pNext = &externalFormat;
    samplerYcbcrConversionInfo.conversion = samplerYcbcrConversion;

    VkImageViewCreateInfo imageViewCreateInfo;
    imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
    imageViewCreateInfo.pNext = &samplerYcbcrConversionInfo;
    imageViewCreateInfo.flags = 0;
    imageViewCreateInfo.image = image;
    imageViewCreateInfo.viewType = VK_IMAGE_VIEW_TYPE_2D;
    imageViewCreateInfo.format = VK_FORMAT_UNDEFINED;
    imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    imageViewCreateInfo.subresourceRange.baseMipLevel = 0;
    imageViewCreateInfo.subresourceRange.levelCount = 1;
    imageViewCreateInfo.subresourceRange.baseArrayLayer = 0;
    imageViewCreateInfo.subresourceRange.layerCount = 1;

    VkImageView imageview = 0;
    ret = vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateImageView failed %d", ret);
        vkDestroyImage(vkdev->vkdevice(), image, 0);
        vkFreeMemory(vkdev->vkdevice(), memory, 0);
        return 0;
    }

    VkImageMemory* ptr = new VkImageMemory;
    ptr->image = image;
    // fix: initialize all members — width/height/depth/format mirror the
    // VkImageCreateInfo above; the rest were previously left indeterminate
    ptr->width = bufferDesc.width;
    ptr->height = bufferDesc.height;
    ptr->depth = 1;
    ptr->format = VK_FORMAT_UNDEFINED;
    ptr->memory = memory;
    ptr->bind_offset = 0;
    ptr->bind_capacity = (size_t)bufferProperties.allocationSize;
    // optimal tiling external image is never host mapped
    ptr->mapped_ptr = 0;
    ptr->imageview = imageview;
    ptr->access_flags = 0;
    ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
    ptr->command_refcount = 0;

    return ptr;
}
2102

2103
void VkAndroidHardwareBufferImageAllocator::fastFree(VkImageMemory* ptr)
{
    // destroy the view, the image and the imported device memory;
    // the AHardwareBuffer handle itself is not released here
    vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
    vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);
    vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);

    delete ptr;
}
2111

2112
// Query the wrapped AHardwareBuffer's Vulkan properties and create the
// sampler ycbcr conversion needed to sample its external format.
// Returns 0 on success, -1 on any Vulkan failure.
int VkAndroidHardwareBufferImageAllocator::init()
{
    AHardwareBuffer_describe(hb, &bufferDesc);

    VkResult ret;

    // resolve externalFormat
    bufferFormatProperties.sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID;
    bufferFormatProperties.pNext = 0;

    // chain the format-properties struct so a single query fills both
    bufferProperties.sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_PROPERTIES_ANDROID;
    bufferProperties.pNext = &bufferFormatProperties;

    ret = vkdev->vkGetAndroidHardwareBufferPropertiesANDROID(vkdev->vkdevice(), hb, &bufferProperties);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkGetAndroidHardwareBufferPropertiesANDROID failed %d", ret);
        return -1;
    }

    // setup samplerYcbcrConversion
    VkExternalFormatANDROID externalFormat;
    externalFormat.sType = VK_STRUCTURE_TYPE_EXTERNAL_FORMAT_ANDROID;
    externalFormat.pNext = 0;
    externalFormat.externalFormat = bufferFormatProperties.externalFormat;

    // use the driver-suggested model/range/swizzle/chroma-siting for the
    // external format; VK_FORMAT_UNDEFINED defers to externalFormat in pNext
    VkSamplerYcbcrConversionCreateInfoKHR samplerYcbcrConversionCreateInfo;
    samplerYcbcrConversionCreateInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_CREATE_INFO_KHR;
    samplerYcbcrConversionCreateInfo.pNext = &externalFormat;
    samplerYcbcrConversionCreateInfo.format = VK_FORMAT_UNDEFINED;
    samplerYcbcrConversionCreateInfo.ycbcrModel = bufferFormatProperties.suggestedYcbcrModel;
    samplerYcbcrConversionCreateInfo.ycbcrRange = bufferFormatProperties.suggestedYcbcrRange;
    samplerYcbcrConversionCreateInfo.components = bufferFormatProperties.samplerYcbcrConversionComponents;
    samplerYcbcrConversionCreateInfo.xChromaOffset = bufferFormatProperties.suggestedXChromaOffset;
    samplerYcbcrConversionCreateInfo.yChromaOffset = bufferFormatProperties.suggestedYChromaOffset;
    samplerYcbcrConversionCreateInfo.chromaFilter = VK_FILTER_NEAREST;
    samplerYcbcrConversionCreateInfo.forceExplicitReconstruction = VK_FALSE;

    ret = vkdev->vkCreateSamplerYcbcrConversionKHR(vkdev->vkdevice(), &samplerYcbcrConversionCreateInfo, 0, &samplerYcbcrConversion);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateSamplerYcbcrConversionKHR failed %d", ret);
        return -1;
    }

    return 0;
}
2159

2160
int VkAndroidHardwareBufferImageAllocator::width() const
2161
{
2162
    return bufferDesc.width;
2163
}
2164

2165
int VkAndroidHardwareBufferImageAllocator::height() const
2166
{
2167
    return bufferDesc.height;
2168
}
2169

2170
uint64_t VkAndroidHardwareBufferImageAllocator::external_format() const
2171
{
2172
    return bufferFormatProperties.externalFormat;
2173
}
2174
#endif // __ANDROID_API__ >= 26
2175
#endif // NCNN_PLATFORM_API
2176

2177
#endif // NCNN_VULKAN
2178

2179
} // namespace ncnn
2180

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.