I am trying to implement multi-threaded command buffer generation (using a per-thread command pool and secondary command buffers), but there is little performance gain from using multiple threads.
First, I thought that my thread pool code was incorrectly written, but I tried Sascha Willems's thread pool implementation and nothing changed (so I don't think that's the issue).
Second, I searched for multi-threading performance issues and found that accessing the same variables/resources from different threads causes a performance drop, but I still can't figure out the problem in my case.
I also downloaded Sascha Willems's multi-threading code, ran it, and it worked just fine. I modified the number of worker threads, and the performance gain from using multiple threads is clearly visible.
Here are some FPS results for rendering 600 objects (same model). You can see what my problem is:
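| core count | Sascha Willems's result (avg. FPS) | my result (avg. FPS) |
|------------|------------------------------------|----------------------|
| 1          | 45                                 | 30                   |
| 2          | 83                                 | 33                   |
| 4          | 110                                | 40                   |
| 6          | 155                                | 42                   |
| 8          | 162                                | 42                   |
| 10         | 173                                | 40                   |
| 12         | 175                                | 40                   |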
This is where I prepare the thread data:
void prepareThreadData
{
primaryCommandPool = m_device.createCommandPool (
vk::CommandPoolCreateInfo (
vk::CommandPoolCreateFlags(vk::CommandPoolCreateFlagBits::eResetCommandBuffer),
graphicsQueueIdx
)
);
primaryCommandBuffer = m_device.allocateCommandBuffers (
vk::CommandBufferAllocateInfo (
primaryCommandPool,
vk::CommandBufferLevel::ePrimary,
1
)
)[0];
threadData.resize(numberOfThreads);
for (int i = 0; i < numberOfThreads; ++i)
{
threadData[i].commandPool = m_device.createCommandPool (
vk::CommandPoolCreateInfo (
vk::CommandPoolCreateFlags(vk::CommandPoolCreateFlagBits::eResetCommandBuffer),
graphicsQueueIdx
)
);
threadData[i].commandBuffer = m_device.allocateCommandBuffers (
vk::CommandBufferAllocateInfo (
threadData[i].commandPool,
vk::CommandBufferLevel::eSecondary,
numberOfObjectsPerThread
)
);
for (int j = 0; j < numberOfObjectsPerThread; ++j)
{
VertexPushConstant pushConstant = { someRandomPosition()};
threadData[i].pushConstBlock.push_back(pushConstant);
}
}
}
Here is my render loop code, where I give a job to each thread:
// Per-frame loop: acquire a swap-chain image, record one secondary command
// buffer per object on the worker threads, then execute all of them from the
// primary command buffer inside a single render pass and submit.
while (!display.IsWindowClosed())
{
    display.PollEvents();

    // NOTE(review): the result of acquireNextImageKHR is ignored here, as in
    // the original code — confirm whether out-of-date/suboptimal swap chains
    // need handling.
    m_device.acquireNextImageKHR(m_swapChain, std::numeric_limits<uint64_t>::max(), presentCompleteSemaphore, nullptr, &currentBuffer);

    primaryCommandBuffer.begin(vk::CommandBufferBeginInfo());
    primaryCommandBuffer.beginRenderPass(
        vk::RenderPassBeginInfo(
            m_renderPass,
            m_swapChainBuffers[currentBuffer].frameBuffer,
            m_renderArea,
            static_cast<uint32_t>(clearValues.size()),
            clearValues.data()),
        vk::SubpassContents::eSecondaryCommandBuffers);

    // Secondary command buffers recorded for this render pass inherit the
    // render pass and framebuffer from the primary command buffer.
    vk::CommandBufferInheritanceInfo inheritanceInfo = {};
    inheritanceInfo.renderPass = m_renderPass;
    inheritanceInfo.framebuffer = m_swapChainBuffers[currentBuffer].frameBuffer;

    for (int t = 0; t < numberOfThreads; ++t)
    {
        for (int i = 0; i < numberOfObjectsPerThread; ++i)
        {
            // [=] copies inheritanceInfo into the closure, so the pointer
            // passed to begin() below refers to the job's own copy.
            threadPool.threads[t]->addJob([=]
            {
                const std::array<vk::DeviceSize, 1> offsets = { 0 };
                const vk::Viewport viewport = vk::Viewport(0.0f, 0.0f, WIDTH, HEIGHT, 0.0f, 1.0f);
                const vk::Rect2D renderArea = vk::Rect2D(vk::Offset2D(), vk::Extent2D(WIDTH, HEIGHT));

                // Look the command buffer up once instead of on every call.
                const auto& cmd = threadData[t].commandBuffer[i];

                cmd.begin(vk::CommandBufferBeginInfo(vk::CommandBufferUsageFlagBits::eRenderPassContinue, &inheritanceInfo));
                cmd.setViewport(0, viewport);
                cmd.setScissor(0, renderArea);
                cmd.bindPipeline(vk::PipelineBindPoint::eGraphics, m_graphicsPipeline);
                cmd.bindVertexBuffers(VERTEX_BUFFER_BIND, 1, &model.vertexBuffer, offsets.data());
                cmd.bindIndexBuffer(model.indexBuffer, 0, vk::IndexType::eUint32);
                cmd.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eVertex, 0, sizeof(VertexPushConstant), &threadData[t].pushConstBlock[i]);
                cmd.drawIndexed(model.indexCount, 1, 0, 0, 0);
                cmd.end();
            });
        }
    }

    // All secondary command buffers must be fully recorded before execution.
    threadPool.wait();

    // Gather every secondary command buffer in thread-major order.
    std::vector<vk::CommandBuffer> commandBuffers;
    commandBuffers.reserve(static_cast<size_t>(numberOfThreads) * static_cast<size_t>(numberOfObjectsPerThread));
    for (int t = 0; t < numberOfThreads; ++t)
    {
        for (int i = 0; i < numberOfObjectsPerThread; ++i)
        {
            commandBuffers.push_back(threadData[t].commandBuffer[i]);
        }
    }

    primaryCommandBuffer.executeCommands(static_cast<uint32_t>(commandBuffers.size()), commandBuffers.data());
    primaryCommandBuffer.endRenderPass();
    primaryCommandBuffer.end();

    submitQueue(presentCompleteSemaphore, primaryCommandBuffer);
}
If you have any idea what I am missing or what I'm doing wrong, please let me know.
Aucun commentaire:
Enregistrer un commentaire