cudaMalloc/cudaFreeをCUDA DLLの外側から呼び出す

2022年6月24日公開
2022年6月24日

mallocしたメモリはfreeするまで解放されない。それは知っている。

知っているが、CUDAプログラムをdll化した場合、まさかdll内関数から出た瞬間に自動解放されたりしないだろうな・・・？

という不安が頭をよぎったのでテスト。

以下、gpu_my_allocやdevice_to_hostなどは全て.cuファイル（mytest.cu）内に書かれている。

C++側から別個に呼び出してもちゃんと動作するらしい。

C++側

#include <cstdio>
#include <iostream>

#pragma warning(disable:4996)

#include "../CudaRuntime1/mytest.h"

#pragma comment(lib,"CudaRuntime1.lib")

void pnmP3_Write(const char* const fname, const int vmax, const int width, const int height, const unsigned char* const p);

// Round-trip test of the CUDA DLL: builds a solid blue 100x50 RGB image on the
// host, uploads it, inverts it on the GPU, downloads the result and writes it
// to "test.ppm". Each DLL entry point is called separately from this side.
// Returns 0 on success, 1 if no device buffer was obtained.
int main()
{
  data_t dat;

  dat.width = 100;
  dat.height = 50;

  // Pixel count is computed in size_t so the loop index and the bound have
  // the same (unsigned) type.
  const size_t pixel_count =
    static_cast<size_t>(dat.width) * static_cast<size_t>(dat.height);

  // 3 bytes per pixel, stored as RGBRGB...
  unsigned char* c = new unsigned char[pixel_count * 3];

  // Fill the whole image with pure blue (R=0, G=0, B=255).
  for (size_t i = 0; i < pixel_count; i++) {
    c[i * 3 + 0] = 0;
    c[i * 3 + 1] = 0;
    c[i * 3 + 2] = 255;
  }

  dat.rgbdata = c;

  // Allocate the GPU-side buffer.
  void* device = gpu_my_alloc(dat.width, dat.height);
  if (device == nullptr) {
    // A null pointer can never be a usable device buffer; stop before
    // handing it to the transfer and kernel functions.
    std::cerr << "gpu_my_alloc returned a null pointer" << std::endl;
    delete[] c;
    return 1;
  }

  // Transfer the image to the GPU.
  host_to_device(&dat, device);

  // Run the inversion.
  func_inverse(dat.width, dat.height, device);

  // Copy the result back to the CPU side (into dat.rgbdata).
  device_to_host(device, &dat);

  // Release the GPU-side buffer.
  gpu_my_free(device);

  pnmP3_Write("test.ppm", 255, dat.width, dat.height, dat.rgbdata);

  // Release the host buffer, which was previously leaked.
  dat.rgbdata = nullptr;
  delete[] c;

  return 0;
}

/////////////////////////////////////////////
// Image file output /////////////////////////
//! @brief Writes a PPM file (1 byte per RGB channel, color, ASCII "P3").
//! @param [in] fname  Output file name
//! @param [in] vmax   Maximum value among all RGB components
//! @param [in] width  Image width in pixels
//! @param [in] height Image height in pixels
//! @param [in] p      Address of the image memory (RGBRGBRGB...)
//! @details Given memory laid out as RGBRGBRGB..., writes it as RGB text to
//!          the file named fname. If the file cannot be opened, or a
//!          non-empty image has no pixel memory, a message is printed to
//!          stderr and nothing is written. Non-positive dimensions produce a
//!          header with no pixel rows.
void pnmP3_Write(const char* const fname, const int vmax, const int width, const int height, const unsigned char* const p) { // PPM ASCII

  if (fname == nullptr) {
    fprintf(stderr, "pnmP3_Write: file name is null\n");
    return;
  }

  const bool has_pixels = (width > 0 && height > 0);
  if (has_pixels && p == nullptr) {
    fprintf(stderr, "pnmP3_Write: pixel buffer is null\n");
    return;
  }

  FILE* fp = fopen(fname, "wb");
  if (fp == nullptr) {
    // Previously the null FILE* was passed straight to fprintf.
    fprintf(stderr, "pnmP3_Write: cannot open '%s' for writing\n", fname);
    return;
  }

  fprintf(fp, "P3\n%d %d\n%d\n", width, height, vmax);

  // k is the running pixel index; signed loop bounds keep a negative
  // width/height from turning into a huge unsigned iteration count.
  size_t k = 0;
  for (int i = 0; i < height; i++) {
    for (int j = 0; j < width; j++) {
      fprintf(fp, "%d %d %d ", p[k * 3 + 0], p[k * 3 + 1], p[k * 3 + 2]);
      k++;
    }
    fprintf(fp, "\n");
  }

  fclose(fp);
}

CUDA側

mytest.h

// DLL_PORT marks the functions shared between the DLL and its users.
// When __DLL_EXPORT_DO is defined, DLL_PORT expands to an extern "C"
// dllexport declaration; otherwise it expands to an extern "C" dllimport
// declaration.
// NOTE(review): presumably __DLL_EXPORT_DO is defined only when building the
// DLL project itself — confirm in the project settings.
#ifdef __DLL_EXPORT_DO
#define DLL_PORT extern "C" _declspec(dllexport)
#else
#define DLL_PORT extern "C" _declspec(dllimport)
#endif

// Host-side image description passed to host_to_device / device_to_host.
struct data_t {
  int width;              // image width in pixels
  int height;             // image height in pixels
  unsigned char* rgbdata; // host pixel buffer; the transfer functions copy width * height * 3 bytes to/from it
};

// Allocate GPU-side memory for a width x height image (3 bytes per pixel).
// Returns the device pointer to pass to the functions below.
DLL_PORT void* gpu_my_alloc(int width, int height);

// Run the processing: launches the kernel that replaces every channel byte v
// of the image in `device` with 255 - v.
DLL_PORT void func_inverse(int width,int height, void* device);

// Transfer data to the GPU: copies host->width * host->height * 3 bytes from
// host->rgbdata into `device`.
DLL_PORT void host_to_device(data_t* host, void* device);

// Return the result to the CPU side: copies host->width * host->height * 3
// bytes from `device` into host->rgbdata.
DLL_PORT void device_to_host(void* device, data_t* host);

// Release GPU-side memory previously returned by gpu_my_alloc.
DLL_PORT void gpu_my_free(void* gpuptr);

mytest.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include "mytest.h"

// Kernel argument bundle for thread_inverse, passed by value at launch.
struct gpudata {
  int width;        // image width in pixels; upper bound for the x coordinate
  int height;       // image height in pixels; upper bound for the y coordinate
  unsigned char* c; // pixel buffer, indexed as (y * width + x) * 3; func_inverse sets it from its `device` argument
};

// Inverts one RGB pixel in place: each of the three bytes at c[0..2] becomes
// 255 - value.
//
// c      : address of the pixel's first channel byte (3 bytes are written)
// width  : unused; the kernel in this file passes the pixel's x coordinate
// height : unused; the kernel in this file passes the pixel's y coordinate
//
// Qualified __host__ __device__ so the same per-pixel logic can also be
// called from CPU code (e.g. to compute a reference result); device-side
// callers are unaffected.
__host__ __device__
void color_inverse(unsigned char* c, int width, int height) {
  (void)width;
  (void)height;

  for (int ch = 0; ch < 3; ++ch) {
    c[ch] = static_cast<unsigned char>(255 - c[ch]);
  }
}

// Kernel: one thread per pixel. Uses the x and y components of the block and
// thread indices to pick the pixel, and inverts it via color_inverse.
// Threads whose coordinate falls outside data.width x data.height do nothing.
__global__ void thread_inverse(gpudata data) {
  // 2D coordinate of the pixel this thread is responsible for.
  const size_t px = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t py = blockIdx.y * blockDim.y + threadIdx.y;

  // The grid may be larger than the image; surplus threads exit here.
  if (px >= data.width || py >= data.height) {
    return;
  }

  // Byte offset of the pixel's first channel: rows are data.width pixels
  // long and every pixel occupies 3 bytes.
  const size_t offset = (py * data.width + px) * 3;

  color_inverse(data.c + offset, px, py);
}

// Launches thread_inverse over a width x height image stored in `device`.
//
// width, height : image size in pixels; nothing is launched if either is <= 0
// device        : pixel buffer handed to the kernel (3 bytes per pixel)
//
// The launch is not followed by a synchronization here. A launch failure is
// reported on stderr because this function has no return value.
void func_inverse(int width, int height, void* device){

  if (width <= 0 || height <= 0) {
    return; // empty image: nothing to invert
  }
  if (device == nullptr) {
    fprintf(stderr, "func_inverse: device buffer is null\n");
    return;
  }

  // 16 * 16 == 256 threads per block
  const int blockw = 16;
  const int blockh = 16;
  const dim3 block(blockw, blockh);

  // Ceiling division: just enough blocks to cover the image, without the
  // extra block that `width / blockw + 1` adds when the size is an exact
  // multiple of the block size. (width - 1) cannot overflow since width >= 1.
  const int gridw = (width - 1) / blockw + 1;
  const int gridh = (height - 1) / blockh + 1;
  const dim3 grid(gridw, gridh);

  gpudata gpud;
  gpud.width = width;
  gpud.height = height;
  gpud.c = (unsigned char*)device;

  thread_inverse<<<grid, block>>>(gpud); // call the GPU-side function

  // A kernel launch returns no status; a bad launch configuration only
  // shows up through cudaGetLastError().
  const cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "func_inverse: kernel launch failed: %s\n", cudaGetErrorString(err));
  }
}

// Copies the host image into the GPU buffer.
//
// host   : image description; host->width * host->height * 3 bytes are read
//          from host->rgbdata
// device : destination buffer on the GPU
//
// Nothing is copied for a non-positive size. Null pointers and cudaMemcpy
// failures are reported on stderr because this function has no return value.
void host_to_device(data_t* host, void* device) {
  if (host == nullptr) {
    fprintf(stderr, "host_to_device: host is null\n");
    return;
  }
  if (host->width <= 0 || host->height <= 0) {
    return; // empty image: nothing to transfer
  }
  if (host->rgbdata == nullptr || device == nullptr) {
    fprintf(stderr, "host_to_device: null buffer\n");
    return;
  }

  // Widen before multiplying so large images do not overflow int.
  const size_t bytes = (size_t)host->width * (size_t)host->height * 3;

  // Transfer the data to be processed to the GPU.
  const cudaError_t err = cudaMemcpy(
    device,
    host->rgbdata,
    bytes,
    cudaMemcpyHostToDevice);

  if (err != cudaSuccess) {
    fprintf(stderr, "host_to_device: cudaMemcpy failed: %s\n", cudaGetErrorString(err));
  }
}

// Copies the GPU buffer back into the host image.
//
// device : source buffer on the GPU
// host   : image description; host->width * host->height * 3 bytes are
//          written to host->rgbdata
//
// Nothing is copied for a non-positive size. Null pointers and cudaMemcpy
// failures are reported on stderr because this function has no return value.
void device_to_host(void* device, data_t* host) {
  if (host == nullptr) {
    fprintf(stderr, "device_to_host: host is null\n");
    return;
  }
  if (host->width <= 0 || host->height <= 0) {
    return; // empty image: nothing to transfer
  }
  if (host->rgbdata == nullptr || device == nullptr) {
    fprintf(stderr, "device_to_host: null buffer\n");
    return;
  }

  // Widen before multiplying so large images do not overflow int.
  const size_t bytes = (size_t)host->width * (size_t)host->height * 3;

  // Fetch the execution result from the GPU.
  const cudaError_t err = cudaMemcpy(
    host->rgbdata,
    device,
    bytes,
    cudaMemcpyDeviceToHost);

  if (err != cudaSuccess) {
    fprintf(stderr, "device_to_host: cudaMemcpy failed: %s\n", cudaGetErrorString(err));
  }
}


// Allocates GPU memory for a width x height image with 3 bytes per pixel.
//
// Returns the device pointer, or nullptr when width or height is <= 0 or when
// cudaMalloc fails (the failure is also reported on stderr). Release the
// pointer with gpu_my_free.
void* gpu_my_alloc(int width, int height) {
  if (width <= 0 || height <= 0) {
    return nullptr;
  }

  // Widen before multiplying so large images do not overflow int.
  const size_t bytes = (size_t)width * (size_t)height * 3;

  // Start from nullptr so a failed allocation can never hand an
  // uninitialized pointer back to the caller.
  void* g_gpu = nullptr;
  const cudaError_t err = cudaMalloc(&g_gpu, bytes); // allocate GPU-side memory
  if (err != cudaSuccess) {
    fprintf(stderr, "gpu_my_alloc: cudaMalloc failed: %s\n", cudaGetErrorString(err));
    return nullptr;
  }

  return g_gpu;
}

// Releases GPU memory by passing gpuptr to cudaFree.
// A cudaFree failure is reported on stderr because this function has no
// return value.
void gpu_my_free(void* gpuptr) {
  const cudaError_t err = cudaFree(gpuptr); // release the GPU-side memory
  if (err != cudaSuccess) {
    fprintf(stderr, "gpu_my_free: cudaFree failed: %s\n", cudaGetErrorString(err));
  }
}

コメントを残す / コメントをキャンセル

この記事のトラックバックURL：