前置条件
模拟fps为30、1920*1080、一分钟的视频
项目CMakeLists.txt
# Build script for the grayscale-conversion benchmark executable. The scalar,
# std::thread, OpenCL and AVX variants are all compiled into one target.
cmake_minimum_required(VERSION 3.30)
project(testOpenGl)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

add_executable(testOpenGl
        main.cpp
        testOpenCl.cpp
        testOpenCl.h
        TestCpp.cpp
        TestCpp.h
        TestCppThread.cpp
        TestCppThread.h
        TestSIMD.cpp
        TestSIMD.h)

# Locate OpenCL and link it through the imported target, which carries both
# the include directories and the library.
find_package(OpenCL REQUIRED)
target_link_libraries(testOpenGl PRIVATE OpenCL::OpenCL)

# TestCppThread.cpp uses std::thread, so the platform thread library has to be
# linked explicitly on toolchains where it is not part of the C library.
find_package(Threads REQUIRED)
target_link_libraries(testOpenGl PRIVATE Threads::Threads)

# Probe the compiler for AVX2 / AVX support and enable the best level found.
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-mavx" COMPILER_SUPPORTS_AVX)
check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2)
if(COMPILER_SUPPORTS_AVX2)
    target_compile_options(testOpenGl PRIVATE -mavx2)
elseif(COMPILER_SUPPORTS_AVX)
    target_compile_options(testOpenGl PRIVATE -mavx)
else()
    message(FATAL_ERROR "AVX or AVX2 is not supported by compiler")
endif()
C++代码
- //
- // Created by lai on 2025/1/17.
- //
- #include "TestCpp.h"
- #include <iostream>
- #include <vector>
- #include <random>
- #include <chrono>
- // 灰度转换函数
/**
 * Converts an interleaved 8-bit RGB frame to 8-bit grayscale using the
 * weights 0.299 / 0.587 / 0.114; the result is truncated toward zero.
 *
 * @param input   RGB bytes, three per pixel, in R, G, B order.
 * @param output  Destination buffer, one byte per pixel. It is not resized.
 * @param width   Frame width in pixels.
 * @param height  Frame height in pixels.
 *
 * Non-positive dimensions make the call a no-op. The number of pixels
 * processed is limited to what both buffers can hold, so an undersized
 * buffer never causes an out-of-bounds access.
 */
void to_gray(const std::vector<unsigned char>& input, std::vector<unsigned char>& output, int width, int height) {
    using Index = std::vector<unsigned char>::size_type;

    if (width <= 0 || height <= 0) {
        return;
    }

    // Do the arithmetic in the container's size type so that large frames
    // cannot overflow an int (neither width * height nor i * 3).
    Index pixelCount = static_cast<Index>(width) * static_cast<Index>(height);

    const Index readablePixels = input.size() / 3;
    if (readablePixels < pixelCount) {
        pixelCount = readablePixels;
    }
    if (output.size() < pixelCount) {
        pixelCount = output.size();
    }

    for (Index i = 0; i < pixelCount; ++i) {
        const Index offset = i * 3;  // start of this pixel's RGB triple
        const unsigned char r = input[offset];
        const unsigned char g = input[offset + 1];
        const unsigned char b = input[offset + 2];
        // Weighted luma sum
        output[i] = static_cast<unsigned char>(0.299f * r + 0.587f * g + 0.114f * b);
    }
}
- void TestCpp::runTest() {
- const int width = 1920; // 视频宽度
- const int height = 1080; // 视频高度
- const int fps = 30; // 帧率
- const int duration = 60; // 视频持续时间(秒)
- const int frameCount = fps * duration; // 总帧数
- // 模拟视频帧数据:随机生成每帧的 RGB 数据
- std::vector<unsigned char> inputFrame(width * height * 3);
- std::vector<unsigned char> outputFrame(width * height);
- std::random_device rd;
- std::mt19937 gen(rd());
- std::uniform_int_distribution<> dis(0, 255);
- // 开始处理
- auto startTime = std::chrono::high_resolution_clock::now();
- for (int frame = 0; frame < frameCount; ++frame) {
- // 随机生成模拟的 RGB 数据
- for (auto& pixel : inputFrame) {
- pixel = dis(gen);
- }
- // 调用灰度转换函数
- to_gray(inputFrame, outputFrame, width, height);
- // 打印进度
- if (frame % 30 == 0) {
- std::cout << "Processed frame: " << frame + 1 << "/" << frameCount << std::endl;
- }
- }
- auto endTime = std::chrono::high_resolution_clock::now();
- double elapsedTime = std::chrono::duration<double>(endTime - startTime).count();
- // 打印处理时间
- std::cout << "Processed " << frameCount << " frames in " << elapsedTime << " seconds." << std::endl;
- std::cout << "Average time per frame: " << (elapsedTime / frameCount) << " seconds." << std::endl;
- }
C++多线程
- //
- // Created by lai on 2025/1/17.
- //
- #include "TestCppThread.h"
- #include <iostream>
- #include <vector>
- #include <random>
- #include <chrono>
- #include <thread>
- // 灰度转换函数,每个线程处理一部分图像
/**
 * Converts the pixels in the half-open range [start, end) of an interleaved
 * 8-bit RGB frame to grayscale (weights 0.299 / 0.587 / 0.114, truncated).
 * Intended to be run by one worker thread on its own slice of the frame.
 *
 * @param input   RGB bytes, three per pixel, in R, G, B order.
 * @param output  Destination buffer, one byte per pixel. It is not resized.
 * @param width   Frame width; kept for interface compatibility, not used.
 * @param height  Frame height; kept for interface compatibility, not used.
 * @param start   Index of the first pixel to convert.
 * @param end     Index one past the last pixel to convert.
 *
 * The range is clamped to the pixels both buffers actually hold, so a
 * negative start or an oversized end cannot cause an out-of-bounds access.
 */
void to_gray_chunk(const std::vector<unsigned char>& input, std::vector<unsigned char>& output, int width, int height, int start, int end) {
    using Index = std::vector<unsigned char>::size_type;
    (void)width;
    (void)height;

    if (end <= 0 || end <= start) {
        return;
    }

    // Largest pixel index (exclusive) that is valid in both buffers.
    Index limit = input.size() / 3;
    if (output.size() < limit) {
        limit = output.size();
    }

    const Index first = start < 0 ? 0 : static_cast<Index>(start);
    Index last = static_cast<Index>(end);
    if (limit < last) {
        last = limit;
    }

    for (Index i = first; i < last; ++i) {
        const Index offset = i * 3;  // start of this pixel's RGB triple
        const unsigned char r = input[offset];
        const unsigned char g = input[offset + 1];
        const unsigned char b = input[offset + 2];
        // Weighted luma sum
        output[i] = static_cast<unsigned char>(0.299f * r + 0.587f * g + 0.114f * b);
    }
}
- void TestCppThread::runTest() {
- const int width = 1920; // 视频宽度
- const int height = 1080; // 视频高度
- const int fps = 30; // 帧率
- const int duration = 60; // 视频持续时间(秒)
- const int frameCount = fps * duration; // 总帧数
- const int numThreads = std::thread::hardware_concurrency(); // 获取可用线程数
- // 模拟视频帧数据:随机生成每帧的 RGB 数据
- std::vector<unsigned char> inputFrame(width * height * 3);
- std::vector<unsigned char> outputFrame(width * height);
- std::random_device rd;
- std::mt19937 gen(rd());
- std::uniform_int_distribution<> dis(0, 255);
- // 开始处理
- auto startTime = std::chrono::high_resolution_clock::now();
- for (int frame = 0; frame < frameCount; ++frame) {
- // 随机生成模拟的 RGB 数据
- for (auto& pixel : inputFrame) {
- pixel = dis(gen);
- }
- // 启动多个线程来处理图像
- std::vector<std::thread> threads;
- int chunkSize = width * height / numThreads; // 每个线程处理的像素块大小
- for (int t = 0; t < numThreads; ++t) {
- int start = t * chunkSize;
- int end = (t == numThreads - 1) ? (width * height) : (start + chunkSize); // 最后一个线程处理剩余的像素
- threads.emplace_back(to_gray_chunk, std::cref(inputFrame), std::ref(outputFrame), width, height, start, end);
- }
- // 等待所有线程完成
- for (auto& t : threads) {
- t.join();
- }
- // 打印进度
- if (frame % 30 == 0) {
- std::cout << "Processed frame: " << frame + 1 << "/" << frameCount << std::endl;
- }
- }
- auto endTime = std::chrono::high_resolution_clock::now();
- double elapsedTime = std::chrono::duration<double>(endTime - startTime).count();
- // 打印处理时间
- std::cout << "Processed " << frameCount << " frames in " << elapsedTime << " seconds." << std::endl;
- std::cout << "Average time per frame: " << (elapsedTime / frameCount) << " seconds." << std::endl;
- }
CPU版本的OpenCL
cmake中添加
# Locate OpenCL and link it through the imported target, which carries both
# the include directories and the library.
find_package(OpenCL REQUIRED)
target_link_libraries(testOpenGl PRIVATE OpenCL::OpenCL)
测试代码
- //
- // Created by lai on 2025/1/16.
- //
- #include "testOpenCl.h"
- #include <chrono>
- #include <CL/cl.h>
- #include <iostream>
- #include <vector>
- #include <random>
- // OpenCL 内核代码
// OpenCL C source of the grayscale kernel. Each work-item converts the single
// RGB pixel selected by its global id; ids at or beyond width * height do
// nothing. The text is handed to clCreateProgramWithSource as-is, so it must
// contain nothing but valid OpenCL C.
const char* kernelSource = R"(
__kernel void to_gray(
    __global unsigned char* input,
    __global unsigned char* output,
    const int width,
    const int height)
{
    int id = get_global_id(0); // 每个线程处理一个像素
    if (id < width * height) {
        int offset = id * 3; // RGB 分量
        unsigned char r = input[offset];
        unsigned char g = input[offset + 1];
        unsigned char b = input[offset + 2];
        // 灰度公式
        output[id] = (unsigned char)(0.299f * r + 0.587f * g + 0.114f * b);
    }
}
)";
- void TestOpenCl::runTests() {
- const int width = 1920; // 视频宽度
- const int height = 1080; // 视频高度
- const int fps = 30; // 帧率
- const int duration = 60; // 视频持续时间(秒)
- const int frameCount = fps * duration; // 总帧数
- // 模拟视频帧数据:随机生成每帧的 RGB 数据
- std::vector<unsigned char> inputFrame(width * height * 3);
- std::vector<unsigned char> outputFrame(width * height);
- std::random_device rd;
- std::mt19937 gen(rd());
- std::uniform_int_distribution<> dis(0, 255);
- // 初始化 OpenCL
- cl_int err;
- cl_platform_id platform;
- clGetPlatformIDs(1, &platform, nullptr);
- cl_device_id device;
- clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, nullptr);
- cl_context context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
- cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err);
- cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, nullptr, &err);
- clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr);
- cl_kernel kernel = clCreateKernel(program, "to_gray", &err);
- // 创建 OpenCL 缓冲区
- cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, inputFrame.size(), nullptr, &err);
- cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, outputFrame.size(), nullptr, &err);
- // 开始处理
- auto startTime = std::chrono::high_resolution_clock::now();
- for (int frame = 0; frame < frameCount; ++frame) {
- // 随机生成模拟的 RGB 数据
- for (auto& pixel : inputFrame) {
- pixel = dis(gen);
- }
- // 写入数据到 OpenCL 缓冲区
- clEnqueueWriteBuffer(queue, inputBuffer, CL_TRUE, 0, inputFrame.size(), inputFrame.data(), 0, nullptr, nullptr);
- // 设置内核参数
- clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer);
- clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffer);
- clSetKernelArg(kernel, 2, sizeof(int), &width);
- clSetKernelArg(kernel, 3, sizeof(int), &height);
- // 定义工作区大小
- size_t globalSize = width * height;
- // 执行内核
- clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, &globalSize, nullptr, 0, nullptr, nullptr);
- // 读取处理后的灰度数据
- clEnqueueReadBuffer(queue, outputBuffer, CL_TRUE, 0, outputFrame.size(), outputFrame.data(), 0, nullptr, nullptr);
- // 打印进度
- if (frame % 30 == 0) {
- std::cout << "Processed frame: " << frame + 1 << "/" << frameCount << std::endl;
- }
- }
- auto endTime = std::chrono::high_resolution_clock::now();
- double elapsedTime = std::chrono::duration<double>(endTime - startTime).count();
- // 打印处理时间
- std::cout << "Processed " << frameCount << " frames in " << elapsedTime << " seconds." << std::endl;
- std::cout << "Average time per frame: " << (elapsedTime / frameCount) << " seconds." << std::endl;
- // 释放 OpenCL 资源
- clReleaseMemObject(inputBuffer);
- clReleaseMemObject(outputBuffer);
- clReleaseKernel(kernel);
- clReleaseProgram(program);
- clReleaseCommandQueue(queue);
- clReleaseContext(context);
- }
内存对齐的SIMD指令集
cmake添加
# Probe the compiler for AVX and AVX2 support.
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-mavx" COMPILER_SUPPORTS_AVX)
check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2)

# Abort configuration when neither instruction set is accepted.
if(NOT COMPILER_SUPPORTS_AVX2 AND NOT COMPILER_SUPPORTS_AVX)
    message(FATAL_ERROR "AVX or AVX2 is not supported by compiler")
endif()

# Prefer AVX2 and fall back to plain AVX.
if(COMPILER_SUPPORTS_AVX2)
    target_compile_options(testOpenGl PRIVATE -mavx2)
else()
    target_compile_options(testOpenGl PRIVATE -mavx)
endif()
- //
- // Created by lai on 2025/1/17.
- //
- #include "TestSIMD.h"
- #include <iostream>
- #include <vector>
- #include <random>
- #include <chrono>
- #include <immintrin.h> // SIMD 指令集
- #include <cstdlib> // 用于posix_memalign
/**
 * Converts an interleaved 8-bit RGB frame to 8-bit grayscale (weights
 * 0.299 / 0.587 / 0.114, truncated toward zero).
 *
 * When the translation unit is compiled with AVX enabled (__AVX__ defined),
 * pixels are processed eight at a time; any remaining pixels, or all of them
 * when AVX is not enabled, go through the scalar loop.
 *
 * @param input   width * height * 3 RGB bytes in R, G, B order.
 * @param output  width * height destination bytes.
 * @param width   Frame width in pixels.
 * @param height  Frame height in pixels.
 *
 * Null pointers or non-positive dimensions make the call a no-op. The
 * function reads exactly width * height * 3 bytes and writes exactly
 * width * height bytes; neither buffer needs any particular alignment.
 */
void to_gray_simd(const unsigned char* input, unsigned char* output, int width, int height) {
    if (input == nullptr || output == nullptr || width <= 0 || height <= 0) {
        return;
    }

    const std::size_t pixelCount = static_cast<std::size_t>(width) * static_cast<std::size_t>(height);
    std::size_t i = 0;

#if defined(__AVX__)
    const __m256 scale_r = _mm256_set1_ps(0.299f);  // red weight
    const __m256 scale_g = _mm256_set1_ps(0.587f);  // green weight
    const __m256 scale_b = _mm256_set1_ps(0.114f);  // blue weight

    for (; i + 8 <= pixelCount; i += 8) {
        const unsigned char* p = input + i * 3;

        // De-interleave: gather the R, G and B byte of each of the 8 pixels
        // into its own vector of 32-bit integers. Only the 24 bytes that
        // belong to these pixels are read.
        const __m256i r_i = _mm256_setr_epi32(p[0], p[3], p[6], p[9], p[12], p[15], p[18], p[21]);
        const __m256i g_i = _mm256_setr_epi32(p[1], p[4], p[7], p[10], p[13], p[16], p[19], p[22]);
        const __m256i b_i = _mm256_setr_epi32(p[2], p[5], p[8], p[11], p[14], p[17], p[20], p[23]);

        const __m256 r_f = _mm256_cvtepi32_ps(r_i);
        const __m256 g_f = _mm256_cvtepi32_ps(g_i);
        const __m256 b_f = _mm256_cvtepi32_ps(b_i);

        // Same operation order as the scalar expression: (r + g) + b.
        const __m256 gray_f = _mm256_add_ps(
            _mm256_add_ps(_mm256_mul_ps(r_f, scale_r), _mm256_mul_ps(g_f, scale_g)),
            _mm256_mul_ps(b_f, scale_b));

        // Truncate toward zero, matching the scalar static_cast.
        const __m256i gray_i = _mm256_cvttps_epi32(gray_f);

        // Narrow 8 x int32 -> 8 x int16 -> 8 x uint8 and store exactly
        // 8 bytes, one per pixel.
        const __m128i low = _mm256_castsi256_si128(gray_i);
        const __m128i high = _mm256_extractf128_si256(gray_i, 1);
        const __m128i gray16 = _mm_packs_epi32(low, high);
        const __m128i gray8 = _mm_packus_epi16(gray16, gray16);
        _mm_storel_epi64(reinterpret_cast<__m128i*>(output + i), gray8);
    }
#endif

    // Scalar path for the tail (or for everything when AVX is not enabled).
    for (; i < pixelCount; ++i) {
        const std::size_t offset = i * 3;
        const unsigned char r = input[offset];
        const unsigned char g = input[offset + 1];
        const unsigned char b = input[offset + 2];
        output[i] = static_cast<unsigned char>(0.299f * r + 0.587f * g + 0.114f * b);
    }
}
- void TestSIMD::runTest() {
- const int width = 1920; // 视频宽度
- const int height = 1080; // 视频高度
- const int fps = 30; // 帧率
- const int duration = 60; // 视频持续时间(秒)
- const int frameCount = fps * duration; // 总帧数
- size_t size = width * height * 3 * sizeof(unsigned char);
- // 模拟视频帧数据:随机生成每帧的 RGB 数据
- // 使用posix_memalign分配对齐内存
- unsigned char* inputFrame;
- unsigned char* outputFrame;
- int alignment = 32; // 使用32字节对齐
- int resultInput = posix_memalign((void**)&inputFrame, alignment, size);
- int resultOutput = posix_memalign((void**)&outputFrame, alignment, size);
- if (resultInput != 0 || resultOutput != 0) {
- std::cerr << "memory allocation failed" << std::endl;
- return;
- }
- std::random_device rd;
- std::mt19937 gen(rd());
- std::uniform_int_distribution<> dis(0, 255);
- // 开始处理
- auto startTime = std::chrono::high_resolution_clock::now();
- for (int frame = 0; frame < frameCount; ++frame) {
- // 随机生成模拟的 RGB 数据
- for (int i = 0; i < width * height * 3; ++i) {
- inputFrame[i] = dis(gen);
- }
- // 使用 SIMD 转换灰度
- to_gray_simd(inputFrame, outputFrame, width, height);
- // 打印进度
- if (frame % 30 == 0) {
- std::cout << "Processed frame: " << frame + 1 << "/" << frameCount << std::endl;
- }
- }
- auto endTime = std::chrono::high_resolution_clock::now();
- double elapsedTime = std::chrono::duration<double>(endTime - startTime).count();
- // 打印处理时间
- std::cout << "Processed " << frameCount << " frames in " << elapsedTime << " seconds." << std::endl;
- std::cout << "Average time per frame: " << (elapsedTime / frameCount) << " seconds." << std::endl;
- }
结论
- C++
- Processed 1800 frames in 251.789 seconds.
- Average time per frame: 0.139883 seconds.
- C++ thread
- Processed 1800 frames in 229.571 seconds.
- Average time per frame: 0.12754 seconds.
- CPU版本POCL的OPENCL
- Processed 1800 frames in 233.25 seconds.
- Average time per frame: 0.129583 seconds.
- SIMD 内存对齐以后
- Processed 1800 frames in 191.015 seconds.
- Average time per frame: 0.106119 seconds.
SIMD的性能明显优于其他几项,但是还需要测试GPU版本的OpenCL和多线程指令集优化对性能的提升
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息从访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。 |