// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
20
#if __ANDROID_API__ >= 26
21
#include <android/hardware_buffer.h>
22
#endif // __ANDROID_API__ >= 26
26
Allocator::~Allocator()
30
class PoolAllocatorPrivate
35
unsigned int size_compare_ratio; // 0~256
36
size_t size_drop_threshold;
37
std::list<std::pair<size_t, void*> > budgets;
38
std::list<std::pair<size_t, void*> > payouts;
41
PoolAllocator::PoolAllocator()
42
: Allocator(), d(new PoolAllocatorPrivate)
44
d->size_compare_ratio = 0;
45
d->size_drop_threshold = 10;
48
PoolAllocator::~PoolAllocator()
52
if (!d->payouts.empty())
54
NCNN_LOGE("FATAL ERROR! pool allocator destroyed too early");
56
std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
57
for (; it != d->payouts.end(); ++it)
59
void* ptr = it->second;
60
NCNN_LOGE("%p still in use", ptr);
68
PoolAllocator::PoolAllocator(const PoolAllocator&)
73
PoolAllocator& PoolAllocator::operator=(const PoolAllocator&)
78
void PoolAllocator::clear()
80
d->budgets_lock.lock();
82
std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin();
83
for (; it != d->budgets.end(); ++it)
85
void* ptr = it->second;
90
d->budgets_lock.unlock();
93
void PoolAllocator::set_size_compare_ratio(float scr)
95
if (scr < 0.f || scr > 1.f)
97
NCNN_LOGE("invalid size compare ratio %f", scr);
101
d->size_compare_ratio = (unsigned int)(scr * 256);
104
// Set how many cached chunks the pool may accumulate before fastMalloc
// begins evicting entries that do not match the requested size.
void PoolAllocator::set_size_drop_threshold(size_t threshold)
{
    d->size_drop_threshold = threshold;
}
109
void* PoolAllocator::fastMalloc(size_t size)
111
d->budgets_lock.lock();
114
std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin(), it_max = d->budgets.begin(), it_min = d->budgets.begin();
115
for (; it != d->budgets.end(); ++it)
117
size_t bs = it->first;
119
// size_compare_ratio ~ 100%
120
if (bs >= size && ((bs * d->size_compare_ratio) >> 8) <= size)
122
void* ptr = it->second;
124
d->budgets.erase(it);
126
d->budgets_lock.unlock();
128
d->payouts_lock.lock();
130
d->payouts.push_back(std::make_pair(bs, ptr));
132
d->payouts_lock.unlock();
137
if (bs < it_min->first)
141
if (bs > it_max->first)
147
if (d->budgets.size() >= d->size_drop_threshold)
149
// All chunks in pool are not chosen. Then try to drop some outdated
150
// chunks and return them to OS.
151
if (it_max->first < size)
153
// Current query is asking for a chunk larger than any cached chunks.
154
// Then remove the smallest one.
155
ncnn::fastFree(it_min->second);
156
d->budgets.erase(it_min);
158
else if (it_min->first > size)
160
// Current query is asking for a chunk smaller than any cached chunks.
161
// Then remove the largest one.
162
ncnn::fastFree(it_max->second);
163
d->budgets.erase(it_max);
167
d->budgets_lock.unlock();
170
void* ptr = ncnn::fastMalloc(size);
172
d->payouts_lock.lock();
174
d->payouts.push_back(std::make_pair(size, ptr));
176
d->payouts_lock.unlock();
181
void PoolAllocator::fastFree(void* ptr)
183
d->payouts_lock.lock();
186
std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
187
for (; it != d->payouts.end(); ++it)
189
if (it->second == ptr)
191
size_t size = it->first;
193
d->payouts.erase(it);
195
d->payouts_lock.unlock();
197
d->budgets_lock.lock();
199
d->budgets.push_back(std::make_pair(size, ptr));
201
d->budgets_lock.unlock();
207
d->payouts_lock.unlock();
209
NCNN_LOGE("FATAL ERROR! pool allocator get wild %p", ptr);
213
class UnlockedPoolAllocatorPrivate
216
unsigned int size_compare_ratio; // 0~256
217
size_t size_drop_threshold;
218
std::list<std::pair<size_t, void*> > budgets;
219
std::list<std::pair<size_t, void*> > payouts;
222
UnlockedPoolAllocator::UnlockedPoolAllocator()
223
: Allocator(), d(new UnlockedPoolAllocatorPrivate)
225
d->size_compare_ratio = 0;
226
d->size_drop_threshold = 10;
229
UnlockedPoolAllocator::~UnlockedPoolAllocator()
233
if (!d->payouts.empty())
235
NCNN_LOGE("FATAL ERROR! unlocked pool allocator destroyed too early");
237
std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
238
for (; it != d->payouts.end(); ++it)
240
void* ptr = it->second;
241
NCNN_LOGE("%p still in use", ptr);
249
UnlockedPoolAllocator::UnlockedPoolAllocator(const UnlockedPoolAllocator&)
254
UnlockedPoolAllocator& UnlockedPoolAllocator::operator=(const UnlockedPoolAllocator&)
259
void UnlockedPoolAllocator::clear()
261
std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin();
262
for (; it != d->budgets.end(); ++it)
264
void* ptr = it->second;
270
void UnlockedPoolAllocator::set_size_compare_ratio(float scr)
272
if (scr < 0.f || scr > 1.f)
274
NCNN_LOGE("invalid size compare ratio %f", scr);
278
d->size_compare_ratio = (unsigned int)(scr * 256);
281
// Set how many cached chunks the pool may accumulate before fastMalloc
// begins evicting entries that do not match the requested size.
void UnlockedPoolAllocator::set_size_drop_threshold(size_t threshold)
{
    d->size_drop_threshold = threshold;
}
286
void* UnlockedPoolAllocator::fastMalloc(size_t size)
289
std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin(), it_max = d->budgets.begin(), it_min = d->budgets.begin();
290
for (; it != d->budgets.end(); ++it)
292
size_t bs = it->first;
294
// size_compare_ratio ~ 100%
295
if (bs >= size && ((bs * d->size_compare_ratio) >> 8) <= size)
297
void* ptr = it->second;
299
d->budgets.erase(it);
301
d->payouts.push_back(std::make_pair(bs, ptr));
306
if (bs > it_max->first)
310
if (bs < it_min->first)
316
if (d->budgets.size() >= d->size_drop_threshold)
318
if (it_max->first < size)
320
ncnn::fastFree(it_min->second);
321
d->budgets.erase(it_min);
323
else if (it_min->first > size)
325
ncnn::fastFree(it_max->second);
326
d->budgets.erase(it_max);
331
void* ptr = ncnn::fastMalloc(size);
333
d->payouts.push_back(std::make_pair(size, ptr));
338
void UnlockedPoolAllocator::fastFree(void* ptr)
341
std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
342
for (; it != d->payouts.end(); ++it)
344
if (it->second == ptr)
346
size_t size = it->first;
348
d->payouts.erase(it);
350
d->budgets.push_back(std::make_pair(size, ptr));
356
NCNN_LOGE("FATAL ERROR! unlocked pool allocator get wild %p", ptr);
361
VkAllocator::VkAllocator(const VulkanDevice* _vkdev)
364
buffer_memory_type_index = (uint32_t)-1;
365
image_memory_type_index = (uint32_t)-1;
366
reserved_type_index = (uint32_t)-1;
371
VkAllocator::~VkAllocator()
376
void VkAllocator::clear()
380
// Round n up to the nearest multiple of `multiple` (multiple must be non-zero;
// n is returned unchanged when it is already aligned).
static inline size_t round_up(size_t n, size_t multiple)
{
    const size_t padded = n + multiple - 1;
    return padded - padded % multiple;
}
385
// Round n down to the nearest multiple of `multiple` (multiple must be
// non-zero; n is returned unchanged when it is already aligned).
static inline size_t round_down(size_t n, size_t multiple)
{
    return n - n % multiple;
}
390
int VkAllocator::flush(VkBufferMemory* ptr)
395
VkMappedMemoryRange mappedMemoryRange;
396
mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
397
mappedMemoryRange.pNext = 0;
398
mappedMemoryRange.memory = ptr->memory;
399
mappedMemoryRange.offset = round_down(ptr->offset, vkdev->info.non_coherent_atom_size());
400
mappedMemoryRange.size = round_up(ptr->offset + ptr->capacity, vkdev->info.non_coherent_atom_size()) - mappedMemoryRange.offset;
402
VkResult ret = vkFlushMappedMemoryRanges(vkdev->vkdevice(), 1, &mappedMemoryRange);
403
if (ret != VK_SUCCESS)
405
NCNN_LOGE("vkFlushMappedMemoryRanges failed %d", ret);
412
int VkAllocator::invalidate(VkBufferMemory* ptr)
417
VkMappedMemoryRange mappedMemoryRange;
418
mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
419
mappedMemoryRange.pNext = 0;
420
mappedMemoryRange.memory = ptr->memory;
421
mappedMemoryRange.offset = round_down(ptr->offset, vkdev->info.non_coherent_atom_size());
422
mappedMemoryRange.size = round_up(ptr->offset + ptr->capacity, vkdev->info.non_coherent_atom_size()) - mappedMemoryRange.offset;
424
VkResult ret = vkInvalidateMappedMemoryRanges(vkdev->vkdevice(), 1, &mappedMemoryRange);
425
if (ret != VK_SUCCESS)
427
NCNN_LOGE("vkInvalidateMappedMemoryRanges failed %d", ret);
434
VkBuffer VkAllocator::create_buffer(size_t size, VkBufferUsageFlags usage)
436
VkBufferCreateInfo bufferCreateInfo;
437
bufferCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
438
bufferCreateInfo.pNext = 0;
439
bufferCreateInfo.flags = 0;
440
bufferCreateInfo.size = size;
441
bufferCreateInfo.usage = usage;
442
bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
443
bufferCreateInfo.queueFamilyIndexCount = 0;
444
bufferCreateInfo.pQueueFamilyIndices = 0;
447
VkResult ret = vkCreateBuffer(vkdev->vkdevice(), &bufferCreateInfo, 0, &buffer);
448
if (ret != VK_SUCCESS)
450
NCNN_LOGE("vkCreateBuffer failed %d", ret);
457
VkDeviceMemory VkAllocator::allocate_memory(size_t size, uint32_t memory_type_index)
459
VkMemoryAllocateInfo memoryAllocateInfo;
460
memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
461
memoryAllocateInfo.pNext = 0;
462
memoryAllocateInfo.allocationSize = size;
463
memoryAllocateInfo.memoryTypeIndex = memory_type_index;
465
VkDeviceMemory memory = 0;
466
VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
467
if (ret != VK_SUCCESS)
469
NCNN_LOGE("vkAllocateMemory failed %d", ret);
476
VkDeviceMemory VkAllocator::allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer)
478
VkMemoryAllocateInfo memoryAllocateInfo;
479
memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
480
memoryAllocateInfo.pNext = 0;
481
memoryAllocateInfo.allocationSize = size;
482
memoryAllocateInfo.memoryTypeIndex = memory_type_index;
484
VkMemoryDedicatedAllocateInfoKHR memoryDedicatedAllocateInfo;
485
memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR;
486
memoryDedicatedAllocateInfo.pNext = 0;
487
memoryDedicatedAllocateInfo.image = image;
488
memoryDedicatedAllocateInfo.buffer = buffer;
489
memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo;
491
VkDeviceMemory memory = 0;
492
VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
493
if (ret != VK_SUCCESS)
495
NCNN_LOGE("vkAllocateMemory failed %d", ret);
502
VkImage VkAllocator::create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage)
504
VkImageCreateInfo imageCreateInfo;
505
imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
506
imageCreateInfo.pNext = 0;
507
imageCreateInfo.flags = 0;
508
imageCreateInfo.imageType = VK_IMAGE_TYPE_3D;
509
imageCreateInfo.format = format;
510
imageCreateInfo.extent.width = width;
511
imageCreateInfo.extent.height = height;
512
imageCreateInfo.extent.depth = depth;
513
imageCreateInfo.mipLevels = 1;
514
imageCreateInfo.arrayLayers = 1;
515
imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT;
516
imageCreateInfo.tiling = tiling;
517
imageCreateInfo.usage = usage;
518
imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
519
imageCreateInfo.queueFamilyIndexCount = 0;
520
imageCreateInfo.pQueueFamilyIndices = 0;
521
imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
524
VkResult ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image);
525
if (ret != VK_SUCCESS)
527
NCNN_LOGE("vkCreateImage failed %d %d %d %d %d %d %d", ret, width, height, depth, format, tiling, usage);
534
VkImageView VkAllocator::create_imageview(VkImage image, VkFormat format)
536
VkImageViewCreateInfo imageViewCreateInfo;
537
imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
538
imageViewCreateInfo.pNext = 0;
539
imageViewCreateInfo.flags = 0;
540
imageViewCreateInfo.image = image;
541
imageViewCreateInfo.viewType = VK_IMAGE_VIEW_TYPE_3D;
542
imageViewCreateInfo.format = format;
543
imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
544
imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
545
imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
546
imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
547
imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
548
imageViewCreateInfo.subresourceRange.baseMipLevel = 0;
549
imageViewCreateInfo.subresourceRange.levelCount = 1;
550
imageViewCreateInfo.subresourceRange.baseArrayLayer = 0;
551
imageViewCreateInfo.subresourceRange.layerCount = 1;
553
VkImageView imageview;
554
VkResult ret = vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview);
555
if (ret != VK_SUCCESS)
557
NCNN_LOGE("vkCreateImageView failed %d", ret);
564
static inline size_t least_common_multiple(size_t a, size_t b)
570
return least_common_multiple(b, a);
581
class VkBlobAllocatorPrivate
585
size_t buffer_offset_alignment;
586
size_t bind_memory_offset_alignment;
587
std::vector<std::list<std::pair<size_t, size_t> > > buffer_budgets;
588
std::vector<VkBufferMemory*> buffer_blocks;
589
std::vector<std::list<std::pair<size_t, size_t> > > image_memory_budgets;
590
std::vector<VkDeviceMemory> image_memory_blocks;
593
VkBlobAllocator::VkBlobAllocator(const VulkanDevice* _vkdev, size_t preferred_block_size)
594
: VkAllocator(_vkdev), d(new VkBlobAllocatorPrivate)
596
d->buffer_offset_alignment = vkdev->info.buffer_offset_alignment();
597
d->bind_memory_offset_alignment = vkdev->info.buffer_image_granularity();
599
if (vkdev->info.type() == 1)
601
// on integrated gpu, there may be device local only memory too, eg. AMD APU
602
// assuming larger alignment always keeps us safe :)
604
// least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size
605
d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.memory_map_alignment());
606
d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.non_coherent_atom_size());
609
d->block_size = alignSize(preferred_block_size, d->buffer_offset_alignment);
612
VkBlobAllocator::~VkBlobAllocator()
619
VkBlobAllocator::VkBlobAllocator(const VkBlobAllocator&)
620
: VkAllocator(0), d(0)
624
VkBlobAllocator& VkBlobAllocator::operator=(const VkBlobAllocator&)
629
void VkBlobAllocator::clear()
631
// NCNN_LOGE("VkBlobAllocator %lu", buffer_blocks.size());
633
for (size_t i = 0; i < d->buffer_blocks.size(); i++)
635
VkBufferMemory* ptr = d->buffer_blocks[i];
637
// std::list< std::pair<size_t, size_t> >::iterator it = buffer_budgets[i].begin();
638
// while (it != buffer_budgets[i].end())
640
// NCNN_LOGE("VkBlobAllocator budget %p %lu %lu", ptr->buffer, it->first, it->second);
645
vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
647
vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
648
vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
652
d->buffer_blocks.clear();
654
d->buffer_budgets.clear();
656
for (size_t i = 0; i < d->image_memory_blocks.size(); i++)
658
VkDeviceMemory memory = d->image_memory_blocks[i];
660
// std::list< std::pair<size_t, size_t> >::iterator it = d->image_memory_budgets[i].begin();
661
// while (it != d->image_memory_budgets[i].end())
663
// NCNN_LOGE("VkBlobAllocator budget %p %lu %lu", memory, it->first, it->second);
667
vkFreeMemory(vkdev->vkdevice(), memory, 0);
669
d->image_memory_blocks.clear();
671
d->image_memory_budgets.clear();
674
VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size)
676
size_t aligned_size = alignSize(size, d->buffer_offset_alignment);
678
const int buffer_block_count = d->buffer_blocks.size();
680
// find first spare space in buffer_blocks
681
for (int i = 0; i < buffer_block_count; i++)
683
std::list<std::pair<size_t, size_t> >::iterator it = d->buffer_budgets[i].begin();
684
while (it != d->buffer_budgets[i].end())
686
size_t budget_size = it->second;
687
if (budget_size < aligned_size)
694
VkBufferMemory* ptr = new VkBufferMemory;
696
ptr->buffer = d->buffer_blocks[i]->buffer;
697
ptr->offset = it->first;
698
ptr->memory = d->buffer_blocks[i]->memory;
699
ptr->capacity = aligned_size;
700
ptr->mapped_ptr = d->buffer_blocks[i]->mapped_ptr;
701
ptr->access_flags = 0;
702
ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
704
// adjust buffer_budgets
705
if (budget_size == aligned_size)
707
d->buffer_budgets[i].erase(it);
711
it->first += aligned_size;
712
it->second -= aligned_size;
715
// NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);
721
size_t new_block_size = std::max(d->block_size, aligned_size);
724
VkBufferMemory* block = new VkBufferMemory;
726
block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
729
// TODO respect VK_KHR_dedicated_allocation ?
731
VkMemoryRequirements memoryRequirements;
732
vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements);
734
// setup memory type and alignment
735
if (buffer_memory_type_index == (uint32_t)-1)
737
if (vkdev->info.type() == 1)
739
// integrated gpu, prefer unified memory
740
buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
742
// on amd integrated gpu, there is a faster and larger device-only heap
743
uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
744
const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
745
uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
746
uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
747
if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
749
buffer_memory_type_index = device_local_memory_type_index;
754
// discrete gpu, device local
755
buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
758
mappable = vkdev->is_mappable(buffer_memory_type_index);
759
coherent = vkdev->is_coherent(buffer_memory_type_index);
762
block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);
764
// ignore memoryRequirements.alignment as we always bind at zero offset
765
vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);
767
block->mapped_ptr = 0;
770
vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);
773
d->buffer_blocks.push_back(block);
776
VkBufferMemory* ptr = new VkBufferMemory;
778
ptr->buffer = block->buffer;
780
ptr->memory = block->memory;
781
ptr->capacity = aligned_size;
782
ptr->mapped_ptr = block->mapped_ptr;
783
ptr->access_flags = 0;
784
ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
786
// adjust buffer_budgets
787
std::list<std::pair<size_t, size_t> > budget;
788
if (new_block_size > aligned_size)
790
budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size));
792
d->buffer_budgets.push_back(budget);
794
// NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);
799
void VkBlobAllocator::fastFree(VkBufferMemory* ptr)
801
// NCNN_LOGE("VkBlobAllocator F %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);
803
const int buffer_block_count = d->buffer_blocks.size();
805
int block_index = -1;
806
for (int i = 0; i < buffer_block_count; i++)
808
if (d->buffer_blocks[i]->buffer == ptr->buffer && d->buffer_blocks[i]->memory == ptr->memory)
815
if (block_index == -1)
817
NCNN_LOGE("FATAL ERROR! unlocked VkBlobAllocator get wild %p", ptr->buffer);
825
std::list<std::pair<size_t, size_t> >::iterator it_merge_left = d->buffer_budgets[block_index].end();
826
std::list<std::pair<size_t, size_t> >::iterator it_merge_right = d->buffer_budgets[block_index].end();
827
std::list<std::pair<size_t, size_t> >::iterator it = d->buffer_budgets[block_index].begin();
828
for (; it != d->buffer_budgets[block_index].end(); it++)
830
if (it->first + it->second == ptr->offset)
834
else if (ptr->offset + ptr->capacity == it->first)
840
if (it_merge_left != d->buffer_budgets[block_index].end() && it_merge_right != d->buffer_budgets[block_index].end())
842
it_merge_left->second = it_merge_right->first + it_merge_right->second - it_merge_left->first;
843
d->buffer_budgets[block_index].erase(it_merge_right);
845
else if (it_merge_left != d->buffer_budgets[block_index].end())
847
it_merge_left->second = ptr->offset + ptr->capacity - it_merge_left->first;
849
else if (it_merge_right != d->buffer_budgets[block_index].end())
851
it_merge_right->second = it_merge_right->first + it_merge_right->second - ptr->offset;
852
it_merge_right->first = ptr->offset;
856
if (ptr->offset == 0)
858
// chain leading block
859
d->buffer_budgets[block_index].push_front(std::make_pair(ptr->offset, ptr->capacity));
863
d->buffer_budgets[block_index].push_back(std::make_pair(ptr->offset, ptr->capacity));
870
VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int elempack)
872
if (elempack != 1 && elempack != 4 && elempack != 8)
874
NCNN_LOGE("elempack must be 1 4 8");
879
VkFormat format = VK_FORMAT_UNDEFINED;
881
if (elemsize / elempack == 4)
884
if (elempack == 1) format = VK_FORMAT_R32_SFLOAT;
885
if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT;
886
if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT;
888
if (elemsize / elempack == 2)
891
if (elempack == 1) format = VK_FORMAT_R16_SFLOAT;
892
if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT;
893
if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT;
896
// resolve image width height depth
901
// large elempack spills on image w
902
if (elempack == 8) width *= 2;
904
if (width > (int)vkdev->info.max_image_dimension_3d() || height > (int)vkdev->info.max_image_dimension_3d() || depth > (int)vkdev->info.max_image_dimension_3d())
906
NCNN_LOGE("image dimension too large %d %d %d > %d", width, height, depth, (int)vkdev->info.max_image_dimension_3d());
910
VkImageMemory* ptr = new VkImageMemory;
912
ptr->image = create_image(width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);
915
ptr->height = height;
917
ptr->format = format;
919
// TODO respect VK_KHR_dedicated_allocation ?
920
VkMemoryRequirements memoryRequirements;
921
vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements);
923
const size_t size = memoryRequirements.size;
924
const size_t alignment = std::max((size_t)memoryRequirements.alignment, d->bind_memory_offset_alignment);
926
size_t aligned_size = alignSize(size, alignment);
928
const int image_memory_block_count = d->image_memory_blocks.size();
930
// find first spare space in image_memory_blocks
931
for (int i = 0; i < image_memory_block_count; i++)
934
// HACK moltenvk v1.2.3 is unhappy for image binding with offset :(
938
std::list<std::pair<size_t, size_t> >::iterator it = d->image_memory_budgets[i].begin();
939
while (it != d->image_memory_budgets[i].end())
941
// we cannot use it->first directly for base offset alignment
942
size_t bind_base_offset = it->first;
943
size_t bind_offset = alignSize(bind_base_offset, alignment);
944
size_t budget_size = it->second;
945
if (budget_size < aligned_size + (bind_offset - bind_base_offset))
951
// bind at memory offset
952
ptr->memory = d->image_memory_blocks[i];
953
ptr->bind_offset = bind_offset;
954
ptr->bind_capacity = aligned_size;
956
vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);
958
// do not allow host access to optimal tiling image
961
ptr->imageview = create_imageview(ptr->image, format);
963
ptr->access_flags = 0;
964
ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
965
ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
966
ptr->command_refcount = 0;
968
if (bind_base_offset != bind_offset)
970
// NOTE there is small offset inside bind_base_offset and bind_offset
971
// adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory
972
// so that memory management could be easier
973
aligned_size += (bind_offset - bind_base_offset);
975
ptr->bind_offset = bind_base_offset;
976
ptr->bind_capacity = aligned_size;
979
// adjust image_memory_budgets
980
if (budget_size == aligned_size)
982
d->image_memory_budgets[i].erase(it);
986
it->first += aligned_size;
987
it->second -= aligned_size;
990
// NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);
996
// setup memory type and alignment
997
if (image_memory_type_index == (uint32_t)-1)
999
if (vkdev->info.type() == 1)
1001
// integrated gpu, prefer unified memory
1002
image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
1004
// on amd integrated gpu, there is a faster and larger device-only heap
1005
uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
1006
const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
1007
uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
1008
uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
1009
if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
1011
image_memory_type_index = device_local_memory_type_index;
1016
// discrete gpu, device local
1017
image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
1020
mappable = vkdev->is_mappable(image_memory_type_index);
1021
coherent = vkdev->is_coherent(image_memory_type_index);
1025
size_t new_block_size = std::max(d->block_size, aligned_size);
1028
// HACK moltenvk v1.2.3 is unhappy for image binding with offset
1029
// always ignore block size for smaller memory footprint :(
1030
new_block_size = aligned_size;
1033
// bind at memory offset
1034
ptr->memory = allocate_memory(new_block_size, image_memory_type_index);
1035
ptr->bind_offset = 0;
1036
ptr->bind_capacity = aligned_size;
1038
// ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
1039
vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);
1041
// do not allow host access to optimal tiling image
1042
ptr->mapped_ptr = 0;
1044
ptr->imageview = create_imageview(ptr->image, format);
1046
ptr->access_flags = 0;
1047
ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
1048
ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
1049
ptr->command_refcount = 0;
1051
// adjust image_memory_budgets
1052
d->image_memory_blocks.push_back(ptr->memory);
1054
std::list<std::pair<size_t, size_t> > budget;
1055
if (new_block_size > aligned_size)
1057
budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size));
1059
d->image_memory_budgets.push_back(budget);
1061
// NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);
1066
void VkBlobAllocator::fastFree(VkImageMemory* ptr)
1068
// NCNN_LOGE("VkBlobAllocator F %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);
1070
const int image_memory_block_count = d->image_memory_blocks.size();
1072
int block_index = -1;
1073
for (int i = 0; i < image_memory_block_count; i++)
1075
if (d->image_memory_blocks[i] == ptr->memory)
1082
if (block_index == -1)
1084
NCNN_LOGE("FATAL ERROR! unlocked VkBlobAllocator get wild %p", ptr->memory);
1086
if (!ptr->command_refcount)
1088
vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
1089
vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);
1098
std::list<std::pair<size_t, size_t> >::iterator it_merge_left = d->image_memory_budgets[block_index].end();
1099
std::list<std::pair<size_t, size_t> >::iterator it_merge_right = d->image_memory_budgets[block_index].end();
1100
std::list<std::pair<size_t, size_t> >::iterator it = d->image_memory_budgets[block_index].begin();
1101
for (; it != d->image_memory_budgets[block_index].end(); it++)
1103
if (it->first + it->second == ptr->bind_offset)
1107
else if (ptr->bind_offset + ptr->bind_capacity == it->first)
1109
it_merge_right = it;
1113
if (it_merge_left != d->image_memory_budgets[block_index].end() && it_merge_right != d->image_memory_budgets[block_index].end())
1115
it_merge_left->second = it_merge_right->first + it_merge_right->second - it_merge_left->first;
1116
d->image_memory_budgets[block_index].erase(it_merge_right);
1118
else if (it_merge_left != d->image_memory_budgets[block_index].end())
1120
it_merge_left->second = ptr->bind_offset + ptr->bind_capacity - it_merge_left->first;
1122
else if (it_merge_right != d->image_memory_budgets[block_index].end())
1124
it_merge_right->second = it_merge_right->first + it_merge_right->second - ptr->bind_offset;
1125
it_merge_right->first = ptr->bind_offset;
1129
if (ptr->bind_offset == 0)
1131
// chain leading block
1132
d->image_memory_budgets[block_index].push_front(std::make_pair(ptr->bind_offset, ptr->bind_capacity));
1136
d->image_memory_budgets[block_index].push_back(std::make_pair(ptr->bind_offset, ptr->bind_capacity));
1140
if (!ptr->command_refcount)
1142
vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
1143
vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);
1149
class VkWeightAllocatorPrivate
1153
size_t buffer_offset_alignment;
1154
size_t bind_memory_offset_alignment;
1155
std::vector<size_t> buffer_block_free_spaces;
1156
std::vector<VkBufferMemory*> buffer_blocks;
1157
std::vector<VkBufferMemory*> dedicated_buffer_blocks;
1158
std::vector<size_t> image_memory_block_free_spaces;
1159
std::vector<VkDeviceMemory> image_memory_blocks;
1160
std::vector<VkDeviceMemory> dedicated_image_memory_blocks;
1163
VkWeightAllocator::VkWeightAllocator(const VulkanDevice* _vkdev, size_t preferred_block_size)
1164
: VkAllocator(_vkdev), d(new VkWeightAllocatorPrivate)
1166
d->buffer_offset_alignment = vkdev->info.buffer_offset_alignment();
1167
d->bind_memory_offset_alignment = vkdev->info.buffer_image_granularity();
1169
if (vkdev->info.type() == 1)
1171
// on integrated gpu, there may be device local only memory too, eg. AMD APU
1172
// assuming larger alignment always keeps us safe :)
1174
// least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size
1175
d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.memory_map_alignment());
1176
d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.non_coherent_atom_size());
1179
d->block_size = alignSize(preferred_block_size, d->buffer_offset_alignment);
1182
VkWeightAllocator::~VkWeightAllocator()
1189
VkWeightAllocator::VkWeightAllocator(const VkWeightAllocator&)
1190
: VkAllocator(0), d(0)
1194
VkWeightAllocator& VkWeightAllocator::operator=(const VkWeightAllocator&)
1199
void VkWeightAllocator::clear()
1201
// NCNN_LOGE("VkWeightAllocator %lu %lu", d->buffer_blocks.size(), d->dedicated_buffer_blocks.size());
1203
d->buffer_block_free_spaces.clear();
1205
for (size_t i = 0; i < d->buffer_blocks.size(); i++)
1207
VkBufferMemory* ptr = d->buffer_blocks[i];
1210
vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
1212
vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
1213
vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
1217
d->buffer_blocks.clear();
1219
for (size_t i = 0; i < d->dedicated_buffer_blocks.size(); i++)
1221
VkBufferMemory* ptr = d->dedicated_buffer_blocks[i];
1224
vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
1226
vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
1227
vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
1231
d->dedicated_buffer_blocks.clear();
1233
d->image_memory_block_free_spaces.clear();
1235
for (size_t i = 0; i < d->image_memory_blocks.size(); i++)
1237
VkDeviceMemory memory = d->image_memory_blocks[i];
1239
vkFreeMemory(vkdev->vkdevice(), memory, 0);
1241
d->image_memory_blocks.clear();
1243
for (size_t i = 0; i < d->dedicated_image_memory_blocks.size(); i++)
1245
VkDeviceMemory memory = d->dedicated_image_memory_blocks[i];
1247
vkFreeMemory(vkdev->vkdevice(), memory, 0);
1249
d->dedicated_image_memory_blocks.clear();
1252
// Allocate a device-local sub-buffer for weight storage.
// First-fit over existing blocks; otherwise a new block of at least
// d->block_size is created (possibly as a dedicated allocation when the
// driver prefers one). Returned pointers are owned by the caller's fastFree,
// while the underlying blocks live until clear().
VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
{
    // NCNN_LOGE("VkWeightAllocator fastMalloc %lu", size);

    size_t aligned_size = alignSize(size, d->buffer_offset_alignment);

    const int buffer_block_count = d->buffer_blocks.size();

    // find first spare space in buffer_blocks
    for (int i = 0; i < buffer_block_count; i++)
    {
        size_t free_size = d->buffer_block_free_spaces[i];
        if (free_size >= aligned_size)
        {
            size_t block_offset = d->block_size - free_size;

            // return sub buffer
            VkBufferMemory* ptr = new VkBufferMemory;

            ptr->buffer = d->buffer_blocks[i]->buffer;
            ptr->offset = block_offset;
            ptr->memory = d->buffer_blocks[i]->memory;
            ptr->capacity = aligned_size;
            ptr->mapped_ptr = d->buffer_blocks[i]->mapped_ptr;

            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            d->buffer_block_free_spaces[i] -= aligned_size;

            return ptr;
        }
    }

    size_t new_block_size = std::max(d->block_size, aligned_size);

    // create new block
    VkBufferMemory* block = new VkBufferMemory;

    block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    block->offset = 0;

    if (vkdev->info.support_VK_KHR_get_memory_requirements2() && vkdev->info.support_VK_KHR_dedicated_allocation())
    {
        VkBufferMemoryRequirementsInfo2KHR bufferMemoryRequirementsInfo2;
        bufferMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR;
        bufferMemoryRequirementsInfo2.pNext = 0;
        bufferMemoryRequirementsInfo2.buffer = block->buffer;

        VkMemoryRequirements2KHR memoryRequirements2;
        memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR;
        memoryRequirements2.pNext = 0;

        VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements;
        memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR;
        memoryDedicatedRequirements.pNext = 0;
        memoryRequirements2.pNext = &memoryDedicatedRequirements;

        vkdev->vkGetBufferMemoryRequirements2KHR(vkdev->vkdevice(), &bufferMemoryRequirementsInfo2, &memoryRequirements2);

        bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation;

        if (dedicatedAllocation)
        {
            // setup memory type and alignment
            if (buffer_memory_type_index == (uint32_t)-1)
            {
                if (vkdev->info.type() == 1)
                {
                    // integrated gpu, prefer unified memory
                    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

                    // on amd integrated gpu, there is a faster and larger device-only heap
                    uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                    const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
                    uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
                    uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
                    if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
                    {
                        buffer_memory_type_index = device_local_memory_type_index;
                    }
                }
                else
                {
                    // discrete gpu, device local
                    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                }

                mappable = vkdev->is_mappable(buffer_memory_type_index);
                coherent = vkdev->is_coherent(buffer_memory_type_index);
            }

            block->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, buffer_memory_type_index, 0, block->buffer);

            // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
            vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);

            block->mapped_ptr = 0;
            if (mappable)
                vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);

            d->dedicated_buffer_blocks.push_back(block);

            // return sub buffer
            VkBufferMemory* ptr = new VkBufferMemory;

            ptr->buffer = block->buffer;
            ptr->offset = 0;
            ptr->memory = block->memory;
            ptr->capacity = new_block_size;
            ptr->mapped_ptr = block->mapped_ptr;

            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            return ptr;
        }
    }

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements);

    // setup memory type and alignment
    if (buffer_memory_type_index == (uint32_t)-1)
    {
        if (vkdev->info.type() == 1)
        {
            // integrated gpu, prefer unified memory
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

            // on amd integrated gpu, there is a faster and larger device-only heap
            uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
            const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
            uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
            uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
            if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
            {
                buffer_memory_type_index = device_local_memory_type_index;
            }
        }
        else
        {
            // discrete gpu, device local
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
        }

        mappable = vkdev->is_mappable(buffer_memory_type_index);
        coherent = vkdev->is_coherent(buffer_memory_type_index);
    }

    block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);

    // ignore memoryRequirements.alignment as we always bind at zero offset
    vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);

    // NCNN_LOGE("VkWeightAllocator M %p", block->buffer);

    block->mapped_ptr = 0;
    if (mappable)
        vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);

    d->buffer_blocks.push_back(block);

    d->buffer_block_free_spaces.push_back(new_block_size - aligned_size);

    // return sub buffer
    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = block->buffer;
    ptr->offset = 0;
    ptr->memory = block->memory;
    ptr->capacity = aligned_size;
    ptr->mapped_ptr = block->mapped_ptr;

    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    return ptr;
}
// Weight sub-buffers are never recycled individually; only the bookkeeping
// struct is freed here. The backing blocks are released in clear().
void VkWeightAllocator::fastFree(VkBufferMemory* ptr)
{
    // NCNN_LOGE("VkWeightAllocator F %p", ptr->buffer);

    delete ptr;
}
// Allocate an optimal-tiling 3D image for weight storage.
// elempack selects the texel format (fp32/fp16, packed up to 4 lanes; larger
// packs spill on image width). The image is bound into an existing memory
// block when a suitably aligned range fits, otherwise into a fresh block
// (or a dedicated allocation when the driver prefers one).
VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int elempack)
{
    if (elempack != 1 && elempack != 4 && elempack != 8 && elempack != 16 && elempack != 32 && elempack != 64)
    {
        NCNN_LOGE("elempack must be 1 4 8 16 32 64");
        return 0;
    }

    // resolve format
    VkFormat format = VK_FORMAT_UNDEFINED;

    if (elemsize / elempack == 4)
    {
        // fp32
        if (elempack == 1) format = VK_FORMAT_R32_SFLOAT;
        if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 16) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 32) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 64) format = VK_FORMAT_R32G32B32A32_SFLOAT;
    }
    if (elemsize / elempack == 2)
    {
        // fp16
        if (elempack == 1) format = VK_FORMAT_R16_SFLOAT;
        if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 16) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 32) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 64) format = VK_FORMAT_R16G16B16A16_SFLOAT;
    }

    // resolve image width height depth
    int width = w;
    int height = h;
    int depth = c;

    // large elempack spills on image w
    if (elempack == 8) width *= 2;
    if (elempack == 16) width *= 4;
    if (elempack == 32) width *= 8;
    if (elempack == 64) width *= 16;

    if (width > (int)vkdev->info.max_image_dimension_3d() || height > (int)vkdev->info.max_image_dimension_3d() || depth > (int)vkdev->info.max_image_dimension_3d())
    {
        NCNN_LOGE("image dimension too large %d %d %d > %d", width, height, depth, (int)vkdev->info.max_image_dimension_3d());
        return 0;
    }

    VkImageMemory* ptr = new VkImageMemory;

    ptr->image = create_image(width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);

    ptr->width = width;
    ptr->height = height;
    ptr->depth = depth;
    ptr->format = format;

    if (vkdev->info.support_VK_KHR_get_memory_requirements2() && vkdev->info.support_VK_KHR_dedicated_allocation())
    {
        VkImageMemoryRequirementsInfo2KHR imageMemoryRequirementsInfo2;
        imageMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR;
        imageMemoryRequirementsInfo2.pNext = 0;
        imageMemoryRequirementsInfo2.image = ptr->image;

        VkMemoryRequirements2KHR memoryRequirements2;
        memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR;
        memoryRequirements2.pNext = 0;

        VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements;
        memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR;
        memoryDedicatedRequirements.pNext = 0;
        memoryRequirements2.pNext = &memoryDedicatedRequirements;

        vkdev->vkGetImageMemoryRequirements2KHR(vkdev->vkdevice(), &imageMemoryRequirementsInfo2, &memoryRequirements2);

        bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation;

        if (dedicatedAllocation)
        {
            // setup memory type and alignment
            if (image_memory_type_index == (uint32_t)-1)
            {
                if (vkdev->info.type() == 1)
                {
                    // integrated gpu, prefer unified memory
                    image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

                    // on amd integrated gpu, there is a faster and larger device-only heap
                    uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                    const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
                    uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
                    uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
                    if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
                    {
                        image_memory_type_index = device_local_memory_type_index;
                    }
                }
                else
                {
                    // discrete gpu, device local
                    image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                }

                mappable = vkdev->is_mappable(image_memory_type_index);
                coherent = vkdev->is_coherent(image_memory_type_index);
            }

            ptr->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, image_memory_type_index, ptr->image, 0);
            ptr->bind_offset = 0;
            ptr->bind_capacity = memoryRequirements2.memoryRequirements.size;

            // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
            vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

            // do not allow host access to optimal tiling image
            ptr->mapped_ptr = 0;

            ptr->imageview = create_imageview(ptr->image, format);

            ptr->access_flags = 0;
            ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
            ptr->command_refcount = 0;

            d->dedicated_image_memory_blocks.push_back(ptr->memory);

            return ptr;
        }
    }

    VkMemoryRequirements memoryRequirements;
    vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements);

    const size_t size = memoryRequirements.size;
    const size_t alignment = std::max((size_t)memoryRequirements.alignment, d->bind_memory_offset_alignment);

    size_t aligned_size = alignSize(size, alignment);

    const int image_memory_block_count = d->image_memory_blocks.size();

    // find first spare space in buffer_blocks
    for (int i = 0; i < image_memory_block_count; i++)
    {
        // we cannot use image_memory_block_free_spaces[i] directly for base offset alignment
        size_t bind_base_offset = d->block_size - d->image_memory_block_free_spaces[i];
        size_t bind_offset = alignSize(bind_base_offset, alignment);
        if (d->image_memory_block_free_spaces[i] >= aligned_size + (bind_offset - bind_base_offset))
        {
            // bind at memory offset
            ptr->memory = d->image_memory_blocks[i];
            ptr->bind_offset = bind_offset;
            ptr->bind_capacity = aligned_size;

            vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

            // do not allow host access to optimal tiling image
            ptr->mapped_ptr = 0;

            ptr->imageview = create_imageview(ptr->image, format);

            ptr->access_flags = 0;
            ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
            ptr->command_refcount = 0;

            if (bind_base_offset != bind_offset)
            {
                // NOTE there is small offset inside bind_base_offset and bind_offset
                // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory
                // so that memory management could be easier
                aligned_size += (bind_offset - bind_base_offset);

                ptr->bind_offset = bind_base_offset;
                ptr->bind_capacity = aligned_size;
            }

            d->image_memory_block_free_spaces[i] -= aligned_size;

            return ptr;
        }
    }

    // setup memory type and alignment
    if (image_memory_type_index == (uint32_t)-1)
    {
        if (vkdev->info.type() == 1)
        {
            // integrated gpu, prefer unified memory
            image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

            // on amd integrated gpu, there is a faster and larger device-only heap
            uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
            const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
            uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
            uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
            if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
            {
                image_memory_type_index = device_local_memory_type_index;
            }
        }
        else
        {
            // discrete gpu, device local
            image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
        }

        mappable = vkdev->is_mappable(image_memory_type_index);
        coherent = vkdev->is_coherent(image_memory_type_index);
    }

    // create new block
    size_t new_block_size = std::max(d->block_size, aligned_size);

    // bind at memory offset
    ptr->memory = allocate_memory(new_block_size, image_memory_type_index);
    ptr->bind_offset = 0;
    ptr->bind_capacity = aligned_size;

    // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
    vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

    // do not allow host access to optimal tiling image
    ptr->mapped_ptr = 0;

    ptr->imageview = create_imageview(ptr->image, format);

    ptr->access_flags = 0;
    ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
    ptr->command_refcount = 0;

    d->image_memory_blocks.push_back(ptr->memory);
    d->image_memory_block_free_spaces.push_back(new_block_size - aligned_size);

    return ptr;
}
// Destroy the image view/image and the bookkeeping struct, unless a command
// buffer still references it (then the command processing path owns cleanup).
// The backing device memory is released in clear().
void VkWeightAllocator::fastFree(VkImageMemory* ptr)
{
    // NCNN_LOGE("VkWeightAllocator F %p", ptr->memory);

    if (!ptr->command_refcount)
    {
        vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
        vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);

        delete ptr;
    }
}
class VkStagingAllocatorPrivate
1695
unsigned int size_compare_ratio; // 0~256
1696
std::list<VkBufferMemory*> buffer_budgets;
1699
// Construct a staging allocator; staging memory is always host-visible and
// coherent (fastMalloc maps it unconditionally).
VkStagingAllocator::VkStagingAllocator(const VulkanDevice* _vkdev)
    : VkAllocator(_vkdev), d(new VkStagingAllocatorPrivate)
{
    mappable = true;
    coherent = true;

    d->size_compare_ratio = 192; // 0.75f * 256
}
// Release all budgeted buffers and the private state.
// NOTE(review): body reconstructed (dropped in extraction) following the
// clear()-then-delete pattern used by the sibling allocators in this file.
VkStagingAllocator::~VkStagingAllocator()
{
    clear();

    delete d;
}
// Copying is disabled; this private copy constructor is never invoked.
VkStagingAllocator::VkStagingAllocator(const VkStagingAllocator&)
    : VkAllocator(0), d(0)
{
}
// Assignment is disabled; this private operator is never invoked.
VkStagingAllocator& VkStagingAllocator::operator=(const VkStagingAllocator&)
{
    return *this;
}
void VkStagingAllocator::set_size_compare_ratio(float scr)
1727
if (scr < 0.f || scr > 1.f)
1729
NCNN_LOGE("invalid size compare ratio %f", scr);
1733
d->size_compare_ratio = (unsigned int)(scr * 256);
1736
void VkStagingAllocator::clear()
1738
// NCNN_LOGE("VkStagingAllocator %lu", buffer_budgets.size());
1740
for (std::list<VkBufferMemory*>::iterator it = d->buffer_budgets.begin(); it != d->buffer_budgets.end(); it++)
1742
VkBufferMemory* ptr = *it;
1744
// NCNN_LOGE("VkStagingAllocator F %p", ptr->buffer);
1746
vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
1747
vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
1748
vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
1752
d->buffer_budgets.clear();
1755
// Allocate a host-visible staging buffer, reusing a budgeted buffer whose
// capacity is within [size, size / (size_compare_ratio/256)] when possible.
VkBufferMemory* VkStagingAllocator::fastMalloc(size_t size)
{
    // find free budget
    std::list<VkBufferMemory*>::iterator it = d->buffer_budgets.begin();
    for (; it != d->buffer_budgets.end(); it++)
    {
        VkBufferMemory* ptr = *it;

        size_t capacity = ptr->capacity;

        // size_compare_ratio ~ 100%
        if (capacity >= size && ((capacity * d->size_compare_ratio) >> 8) <= size)
        {
            d->buffer_budgets.erase(it);

            // NCNN_LOGE("VkStagingAllocator M %p %lu reused %lu", ptr->buffer, size, capacity);

            return ptr;
        }
    }

    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    ptr->offset = 0;

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements);

    // setup memory type
    if (buffer_memory_type_index == (uint32_t)-1)
    {
        buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
    }

    ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);

    // ignore memoryRequirements.alignment as we always bind at zero offset
    vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0);

    ptr->capacity = size;

    vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);

    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    // NCNN_LOGE("VkStagingAllocator M %p %lu", ptr->buffer, size);

    return ptr;
}
// Return the buffer to the budget list for later reuse; actual destruction
// happens in clear().
void VkStagingAllocator::fastFree(VkBufferMemory* ptr)
{
    // NCNN_LOGE("VkStagingAllocator F %p", ptr->buffer);

    // return to buffer_budgets
    d->buffer_budgets.push_back(ptr);
}
// "Allocate" a staging image: plain host memory dressed up as a VkImageMemory.
// No real VkImage/VkDeviceMemory is created; only mapped_ptr is usable.
VkImageMemory* VkStagingAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int /* elempack */)
{
    // staging image is mainly used for storing small piece of dynamic parameters
    // we allocate host memory as a fake image, it's simple and good

    const size_t size = w * h * c * elemsize;

    VkImageMemory* ptr = new VkImageMemory;

    // no real vulkan objects behind a staging image
    ptr->image = 0;
    ptr->width = w;
    ptr->height = h;
    ptr->depth = c;
    ptr->format = VK_FORMAT_UNDEFINED;
    ptr->memory = 0;
    ptr->bind_offset = 0;
    ptr->bind_capacity = size;

    ptr->mapped_ptr = malloc(size);

    ptr->imageview = 0;
    ptr->access_flags = 0;
    ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    ptr->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;
    ptr->command_refcount = 0;

    // NCNN_LOGE("VkStagingAllocator M %p %d %d %d %d %d", ptr->image, dims, width, height, depth, format);

    return ptr;
}
// Free the host memory backing a fake staging image.
void VkStagingAllocator::fastFree(VkImageMemory* ptr)
{
    // NCNN_LOGE("VkStagingAllocator F %p", ptr->image);

    free(ptr->mapped_ptr);

    delete ptr;
}
// Placeholder private state for VkWeightStagingAllocator (currently empty;
// kept for ABI-stable pimpl layout).
class VkWeightStagingAllocatorPrivate
{
public:
};
// Construct a one-shot staging allocator for weight upload.
VkWeightStagingAllocator::VkWeightStagingAllocator(const VulkanDevice* _vkdev)
    : VkAllocator(_vkdev), d(new VkWeightStagingAllocatorPrivate)
{
}
// Release private state; buffers are freed eagerly in fastFree, so there is
// nothing else to clean up.
VkWeightStagingAllocator::~VkWeightStagingAllocator()
{
    delete d;
}
// Copying is disabled; this private copy constructor is never invoked.
VkWeightStagingAllocator::VkWeightStagingAllocator(const VkWeightStagingAllocator&)
    : VkAllocator(0), d(0)
{
}
// Assignment is disabled; this private operator is never invoked.
VkWeightStagingAllocator& VkWeightStagingAllocator::operator=(const VkWeightStagingAllocator&)
{
    return *this;
}
// Allocate a fresh host-visible transfer buffer for a one-shot weight upload.
// No pooling: each call creates and maps a new buffer.
VkBufferMemory* VkWeightStagingAllocator::fastMalloc(size_t size)
{
    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    ptr->offset = 0;

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements);

    // setup memory type
    if (buffer_memory_type_index == (uint32_t)-1)
    {
        buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
    }

    ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);

    // ignore memoryRequirements.alignment as we always bind at zero offset
    vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0);

    ptr->capacity = size;

    vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);

    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    // NCNN_LOGE("VkWeightStagingAllocator M %p %lu", ptr->buffer, size);

    return ptr;
}
// Destroy the staging buffer immediately; weight staging buffers are not
// recycled.
void VkWeightStagingAllocator::fastFree(VkBufferMemory* ptr)
{
    // NCNN_LOGE("VkWeightStagingAllocator F %p", ptr->buffer);

    vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
    vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
    vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);

    delete ptr;
}
// Image staging is not supported by this allocator.
VkImageMemory* VkWeightStagingAllocator::fastMalloc(int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/)
{
    return 0;
}
// No-op: image staging is not supported, so there is nothing to free.
void VkWeightStagingAllocator::fastFree(VkImageMemory* /*ptr*/)
{
}
#if NCNN_PLATFORM_API
1937
#if __ANDROID_API__ >= 26
1938
// Wrap an existing AHardwareBuffer for sampling from Vulkan.
// init() resolves the buffer properties and creates the Ycbcr conversion.
VkAndroidHardwareBufferImageAllocator::VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb)
    : VkAllocator(_vkdev), hb(_hb)
{
    samplerYcbcrConversion = 0;

    init();
}
// Destroy the sampler Ycbcr conversion created by init(), if any.
VkAndroidHardwareBufferImageAllocator::~VkAndroidHardwareBufferImageAllocator()
{
    if (samplerYcbcrConversion)
    {
        vkdev->vkDestroySamplerYcbcrConversionKHR(vkdev->vkdevice(), samplerYcbcrConversion, 0);
        samplerYcbcrConversion = 0;
    }
}
// Copying is disabled; this private copy constructor is never invoked.
VkAndroidHardwareBufferImageAllocator::VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&)
    : VkAllocator(0), hb(0)
{
}
// Assignment is disabled; this private operator is never invoked.
VkAndroidHardwareBufferImageAllocator& VkAndroidHardwareBufferImageAllocator::operator=(const VkAndroidHardwareBufferImageAllocator&)
{
    return *this;
}
// Buffer allocation is not supported by this image-only allocator.
VkBufferMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(size_t /*size*/)
{
    return 0;
}
// No-op: buffer allocation is not supported, so there is nothing to free.
void VkAndroidHardwareBufferImageAllocator::fastFree(VkBufferMemory* /*ptr*/)
{
}
// Import the wrapped AHardwareBuffer as an external-format Vulkan image.
// Dimensions and format come from the AHardwareBuffer itself, so all
// parameters are ignored. Returns 0 on any Vulkan failure, cleaning up the
// partially created objects.
VkImageMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/)
{
    VkResult ret;

    VkExternalFormatANDROID externalFormat;
    externalFormat.sType = VK_STRUCTURE_TYPE_EXTERNAL_FORMAT_ANDROID;
    externalFormat.pNext = 0;
    externalFormat.externalFormat = bufferFormatProperties.externalFormat;

    VkExternalMemoryImageCreateInfo externalMemoryImageCreateInfo;
    externalMemoryImageCreateInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO,
    externalMemoryImageCreateInfo.pNext = &externalFormat,
    externalMemoryImageCreateInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID;

    VkImageCreateInfo imageCreateInfo;
    imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
    imageCreateInfo.pNext = &externalMemoryImageCreateInfo;
    imageCreateInfo.flags = 0;
    imageCreateInfo.imageType = VK_IMAGE_TYPE_2D;
    imageCreateInfo.format = VK_FORMAT_UNDEFINED;
    imageCreateInfo.extent.width = bufferDesc.width;
    imageCreateInfo.extent.height = bufferDesc.height;
    imageCreateInfo.extent.depth = 1;
    imageCreateInfo.mipLevels = 1;
    imageCreateInfo.arrayLayers = 1;
    imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT;
    imageCreateInfo.tiling = VK_IMAGE_TILING_OPTIMAL;
    imageCreateInfo.usage = VK_IMAGE_USAGE_SAMPLED_BIT;
    imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
    imageCreateInfo.queueFamilyIndexCount = 0;
    imageCreateInfo.pQueueFamilyIndices = 0;
    imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;

    VkImage image = 0;
    ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateImage failed %d", ret);
        return 0;
    }

    // setup memory type
    if (image_memory_type_index == (uint32_t)-1)
    {
        image_memory_type_index = vkdev->find_memory_index(bufferProperties.memoryTypeBits, 0, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
    }

    VkImportAndroidHardwareBufferInfoANDROID importAndroidHardwareBufferInfo;
    importAndroidHardwareBufferInfo.sType = VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID;
    importAndroidHardwareBufferInfo.pNext = 0;
    importAndroidHardwareBufferInfo.buffer = hb;

    VkMemoryDedicatedAllocateInfo memoryDedicatedAllocateInfo;
    memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO;
    memoryDedicatedAllocateInfo.pNext = &importAndroidHardwareBufferInfo;
    memoryDedicatedAllocateInfo.image = image;
    memoryDedicatedAllocateInfo.buffer = VK_NULL_HANDLE;

    VkMemoryAllocateInfo memoryAllocateInfo;
    memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo;
    memoryAllocateInfo.allocationSize = bufferProperties.allocationSize;
    memoryAllocateInfo.memoryTypeIndex = image_memory_type_index;

    VkDeviceMemory memory = 0;
    ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkAllocateMemory failed %d", ret);
        vkDestroyImage(vkdev->vkdevice(), image, 0);
        return 0;
    }

    VkBindImageMemoryInfo bindImageMemoryInfo;
    bindImageMemoryInfo.sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO;
    bindImageMemoryInfo.pNext = 0;
    bindImageMemoryInfo.image = image;
    bindImageMemoryInfo.memory = memory;
    bindImageMemoryInfo.memoryOffset = 0;
    ret = vkdev->vkBindImageMemory2KHR(vkdev->vkdevice(), 1, &bindImageMemoryInfo);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkBindImageMemory2KHR failed %d", ret);
        vkDestroyImage(vkdev->vkdevice(), image, 0);
        vkFreeMemory(vkdev->vkdevice(), memory, 0);
        return 0;
    }

    VkSamplerYcbcrConversionInfoKHR samplerYcbcrConversionInfo;
    samplerYcbcrConversionInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO_KHR;
    samplerYcbcrConversionInfo.pNext = &externalFormat;
    samplerYcbcrConversionInfo.conversion = samplerYcbcrConversion;

    VkImageViewCreateInfo imageViewCreateInfo;
    imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
    imageViewCreateInfo.pNext = &samplerYcbcrConversionInfo;
    imageViewCreateInfo.flags = 0;
    imageViewCreateInfo.image = image;
    imageViewCreateInfo.viewType = VK_IMAGE_VIEW_TYPE_2D;
    imageViewCreateInfo.format = VK_FORMAT_UNDEFINED;
    imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    imageViewCreateInfo.subresourceRange.baseMipLevel = 0;
    imageViewCreateInfo.subresourceRange.levelCount = 1;
    imageViewCreateInfo.subresourceRange.baseArrayLayer = 0;
    imageViewCreateInfo.subresourceRange.layerCount = 1;

    VkImageView imageview = 0;
    ret = vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateImageView failed %d", ret);
        vkDestroyImage(vkdev->vkdevice(), image, 0);
        vkFreeMemory(vkdev->vkdevice(), memory, 0);
        return 0;
    }

    VkImageMemory* ptr = new VkImageMemory;

    ptr->image = image;
    ptr->memory = memory;
    ptr->imageview = imageview;
    ptr->access_flags = 0;
    ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    return ptr;
}
// Destroy the imported image, its view and its memory handle.
// The underlying AHardwareBuffer remains owned by the caller.
void VkAndroidHardwareBufferImageAllocator::fastFree(VkImageMemory* ptr)
{
    vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
    vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);
    vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);

    delete ptr;
}
int VkAndroidHardwareBufferImageAllocator::init()
2114
AHardwareBuffer_describe(hb, &bufferDesc);
2118
// resolve externalFormat
2119
bufferFormatProperties.sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID;
2120
bufferFormatProperties.pNext = 0;
2122
bufferProperties.sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_PROPERTIES_ANDROID;
2123
bufferProperties.pNext = &bufferFormatProperties;
2125
ret = vkdev->vkGetAndroidHardwareBufferPropertiesANDROID(vkdev->vkdevice(), hb, &bufferProperties);
2126
if (ret != VK_SUCCESS)
2128
NCNN_LOGE("vkGetAndroidHardwareBufferPropertiesANDROID failed %d", ret);
2132
// setup samplerYcbcrConversion
2133
VkExternalFormatANDROID externalFormat;
2134
externalFormat.sType = VK_STRUCTURE_TYPE_EXTERNAL_FORMAT_ANDROID;
2135
externalFormat.pNext = 0;
2136
externalFormat.externalFormat = bufferFormatProperties.externalFormat;
2138
VkSamplerYcbcrConversionCreateInfoKHR samplerYcbcrConversionCreateInfo;
2139
samplerYcbcrConversionCreateInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_CREATE_INFO_KHR;
2140
samplerYcbcrConversionCreateInfo.pNext = &externalFormat;
2141
samplerYcbcrConversionCreateInfo.format = VK_FORMAT_UNDEFINED;
2142
samplerYcbcrConversionCreateInfo.ycbcrModel = bufferFormatProperties.suggestedYcbcrModel;
2143
samplerYcbcrConversionCreateInfo.ycbcrRange = bufferFormatProperties.suggestedYcbcrRange;
2144
samplerYcbcrConversionCreateInfo.components = bufferFormatProperties.samplerYcbcrConversionComponents;
2145
samplerYcbcrConversionCreateInfo.xChromaOffset = bufferFormatProperties.suggestedXChromaOffset;
2146
samplerYcbcrConversionCreateInfo.yChromaOffset = bufferFormatProperties.suggestedYChromaOffset;
2147
samplerYcbcrConversionCreateInfo.chromaFilter = VK_FILTER_NEAREST;
2148
samplerYcbcrConversionCreateInfo.forceExplicitReconstruction = VK_FALSE;
2150
ret = vkdev->vkCreateSamplerYcbcrConversionKHR(vkdev->vkdevice(), &samplerYcbcrConversionCreateInfo, 0, &samplerYcbcrConversion);
2151
if (ret != VK_SUCCESS)
2153
NCNN_LOGE("vkCreateSamplerYcbcrConversionKHR failed %d", ret);
2160
int VkAndroidHardwareBufferImageAllocator::width() const
2162
return bufferDesc.width;
2165
int VkAndroidHardwareBufferImageAllocator::height() const
2167
return bufferDesc.height;
2170
uint64_t VkAndroidHardwareBufferImageAllocator::external_format() const
2172
return bufferFormatProperties.externalFormat;
2174
#endif // __ANDROID_API__ >= 26
2175
#endif // NCNN_PLATFORM_API
2177
#endif // NCNN_VULKAN