1
#include <cutil_inline.h>
2
#include <cv.h>
3
#include <cstdio>
4
#include <iostream>
5
#include <cutil.h>
6
#include <ctime>
7
#include <cstdlib>
8
#include <highgui.h>
9
#include <windows.h>
10
11
#pragma comment(lib, "cuda.lib")
12
#pragma comment(lib, "cudart.lib")
13
#pragma comment(lib, "cutil32.lib")
14
#pragma comment(lib, "cv.lib")
15
#pragma comment(lib, "cxcore.lib")
16
#pragma comment(lib, "highgui.lib")
17
18
using namespace std;
19
20
// Blends two images pixel by pixel: C = w*A + (1-w)*B with a fixed weight w = 0.2.
//
// Launch layout: a 2D grid of 2D blocks, one thread per pixel. Threads whose
// (x, y) falls outside width x height return without touching memory, so the
// grid may be rounded up past the image size.
//
// d_dataC  output buffer, at least width*height uchar4 elements, row-major
// d_dataA  first input,   at least width*height uchar4 elements, row-major
// d_dataB  second input,  at least width*height uchar4 elements, row-major
// width    row length in pixels
// height   number of rows
//
// The .x/.y/.z components are blended independently and truncated to
// unsigned char; the .w component of every output pixel is set to 0.
__global__ void main_kernel(uchar4* d_dataC, uchar4* d_dataA, uchar4* d_dataB, int width, int height)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Only the upper bounds need checking. The previous `x > 0 && y > 0` guard
    // skipped row 0 and column 0, leaving those output pixels uninitialized.
    if (x >= width || y >= height)
        return;

    const float w = 0.2f;
    const size_t idx = (size_t)y * (size_t)width + (size_t)x;

    // Read each input pixel once instead of once per channel.
    const uchar4 a = d_dataA[idx];
    const uchar4 b = d_dataB[idx];

    uchar4 c;
    c.x = (unsigned char)(w * a.x + (1.0f - w) * b.x);
    c.y = (unsigned char)(w * a.y + (1.0f - w) * b.y);
    c.z = (unsigned char)(w * a.z + (1.0f - w) * b.z);
    c.w = 0;

    d_dataC[idx] = c;
}
34
35
// Input image paths passed to cvLoadImage.
// Declared const because they point at string literals.
const char *filenameA = "01.JPG";
const char *filenameB = "02.JPG";

// Copies an image with 3 interleaved 8-bit channels into a tightly packed
// uchar4 buffer (channel 0 -> .x, 1 -> .y, 2 -> .z, .w = 0).
// Returns a malloc'd buffer of width*height elements that the caller must
// free(), or NULL if the allocation fails.
static uchar4 *packImageToUchar4(const IplImage *img)
{
    const size_t pixelCount = (size_t)img->width * (size_t)img->height;
    uchar4 *packed = (uchar4 *)malloc(pixelCount * sizeof(uchar4));
    if (packed == NULL)
        return NULL;

    for (int row = 0; row < img->height; ++row)
    {
        // Rows are addressed through widthStep (bytes per row in the IplImage),
        // not width*3, exactly as the original indexing did.
        const unsigned char *src = (const unsigned char *)(img->imageData + img->widthStep * row);
        uchar4 *dst = packed + (size_t)row * (size_t)img->width;
        for (int col = 0; col < img->width; ++col)
        {
            dst[col].x = src[col * 3];
            dst[col].y = src[col * 3 + 1];
            dst[col].z = src[col * 3 + 2];
            dst[col].w = 0;
        }
    }
    return packed;
}

// Writes the .x/.y/.z components of a packed uchar4 buffer back into the
// three interleaved channels of img. The buffer must hold width*height pixels.
static void unpackUchar4ToImage(const uchar4 *packed, IplImage *img)
{
    for (int row = 0; row < img->height; ++row)
    {
        unsigned char *dst = (unsigned char *)(img->imageData + img->widthStep * row);
        const uchar4 *src = packed + (size_t)row * (size_t)img->width;
        for (int col = 0; col < img->width; ++col)
        {
            dst[col * 3]     = src[col].x;
            dst[col * 3 + 1] = src[col].y;
            dst[col * 3 + 2] = src[col].z;
        }
    }
}

// Loads the two input images, blends them on the GPU with main_kernel,
// prints the kernel time, and shows the result in a window until a key is
// pressed. Returns 0 on success and 1 if the inputs cannot be loaded, do not
// match, or a host allocation fails.
int main()
{
    IplImage *imageA = cvLoadImage(filenameA);
    IplImage *imageB = cvLoadImage(filenameB);

    // cvLoadImage returns NULL when the file is missing or unreadable.
    if (imageA == NULL || imageB == NULL)
    {
        fprintf(stderr, "failed to load %s and/or %s\n", filenameA, filenameB);
        if (imageA != NULL) cvReleaseImage(&imageA);
        if (imageB != NULL) cvReleaseImage(&imageB);
        return 1;
    }

    // The kernel indexes A, B and C with A's dimensions, and the packing code
    // reads 3 bytes per pixel, so reject anything else instead of reading or
    // writing out of bounds.
    if (imageA->width != imageB->width || imageA->height != imageB->height ||
        imageA->nChannels != 3 || imageB->nChannels != 3)
    {
        fprintf(stderr, "input images must both be 3-channel and have the same size\n");
        cvReleaseImage(&imageA);
        cvReleaseImage(&imageB);
        return 1;
    }

    const int width = imageA->width;
    const int height = imageA->height;
    const size_t numBytes = (size_t)width * (size_t)height * sizeof(uchar4);

    // Acquire every host-side resource up front so one failure path can
    // release them all.
    IplImage *imageC = cvCreateImage(cvGetSize(imageA), imageA->depth, imageA->nChannels);
    uchar4 *h_image_dataA = packImageToUchar4(imageA);
    uchar4 *h_image_dataB = packImageToUchar4(imageB);
    uchar4 *h_image_dataC = (uchar4 *)malloc(numBytes);
    if (imageC == NULL || h_image_dataA == NULL || h_image_dataB == NULL || h_image_dataC == NULL)
    {
        fprintf(stderr, "out of host memory\n");
        free(h_image_dataA);
        free(h_image_dataB);
        free(h_image_dataC);
        if (imageC != NULL) cvReleaseImage(&imageC);
        cvReleaseImage(&imageA);
        cvReleaseImage(&imageB);
        return 1;
    }

    uchar4 *d_dataA = NULL;
    uchar4 *d_dataB = NULL;
    uchar4 *d_dataC = NULL;
    CUDA_SAFE_CALL( cudaMalloc( (void**)&d_dataA, numBytes ) );
    CUDA_SAFE_CALL( cudaMalloc( (void**)&d_dataB, numBytes ) );
    CUDA_SAFE_CALL( cudaMalloc( (void**)&d_dataC, numBytes ) );

    CUDA_SAFE_CALL( cudaMemcpy( d_dataA, h_image_dataA, numBytes, cudaMemcpyHostToDevice ) );
    CUDA_SAFE_CALL( cudaMemcpy( d_dataB, h_image_dataB, numBytes, cudaMemcpyHostToDevice ) );

    // Time the kernel (launch + completion) with two independent clocks.
    LARGE_INTEGER frequency;
    LARGE_INTEGER start_time;
    LARGE_INTEGER end_time;
    QueryPerformanceFrequency(&frequency);
    const DWORD start = GetTickCount();
    QueryPerformanceCounter(&start_time);

    dim3 dimBlock(16, 16, 1);
    // Ceil-divide so partial tiles at the right/bottom edges are covered.
    dim3 dimGrid( (width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y );
    main_kernel<<<dimGrid, dimBlock, 0>>>(d_dataC, d_dataA, d_dataB, width, height);
    // A bad launch configuration is only reported through cudaGetLastError.
    CUDA_SAFE_CALL( cudaGetLastError() );
    // Block until the kernel finishes so the timestamps bracket real work.
    // (cudaThreadSynchronize is kept to match the cutil-era toolkit this file
    // targets; newer toolkits name it cudaDeviceSynchronize.)
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    QueryPerformanceCounter(&end_time);
    // Counter ticks must be scaled by the counter frequency to get time.
    const double diff_time = (double)(end_time.QuadPart - start_time.QuadPart) * 1000.0
                             / (double)frequency.QuadPart;
    printf( "time=%f ms\n", diff_time);

    // GetTickCount is in milliseconds; unsigned subtraction survives wrap-around.
    const double end = (double)(GetTickCount() - start) / 1000.0;
    printf("time used:%f s\n", end);

    CUDA_SAFE_CALL( cudaMemcpy( h_image_dataC, d_dataC, numBytes, cudaMemcpyDeviceToHost ) );
    unpackUchar4ToImage(h_image_dataC, imageC);

    cvNamedWindow("test", CV_WINDOW_AUTOSIZE);
    cvShowImage("test", imageC);

    cvWaitKey(0);
    cvDestroyAllWindows();

    CUDA_SAFE_CALL( cudaFree(d_dataA) );
    CUDA_SAFE_CALL( cudaFree(d_dataB) );
    CUDA_SAFE_CALL( cudaFree(d_dataC) );

    free(h_image_dataA);
    free(h_image_dataB);
    free(h_image_dataC);
    cvReleaseImage(&imageA);
    cvReleaseImage(&imageB);
    cvReleaseImage(&imageC);
    return 0;
}
#include <cutil_inline.h>2
#include <cv.h>3
#include <cstdio>4
#include <iostream>5
#include <cutil.h>6
#include <ctime>7
#include <cstdlib>8
#include <highgui.h>9
#include <windows.h>10

11
#pragma comment(lib, "cuda.lib")12
#pragma comment(lib, "cudart.lib")13
#pragma comment(lib, "cutil32.lib")14
#pragma comment(lib, "cv.lib")15
#pragma comment(lib, "cxcore.lib")16
#pragma comment(lib, "highgui.lib")17

18
using namespace std;19

20
// Blends two images pixel by pixel: C = w*A + (1-w)*B with a fixed weight w = 0.2.
//
// Launch layout: a 2D grid of 2D blocks, one thread per pixel. Threads whose
// (x, y) falls outside width x height return without touching memory, so the
// grid may be rounded up past the image size.
//
// d_dataC  output buffer, at least width*height uchar4 elements, row-major
// d_dataA  first input,   at least width*height uchar4 elements, row-major
// d_dataB  second input,  at least width*height uchar4 elements, row-major
// width    row length in pixels
// height   number of rows
//
// The .x/.y/.z components are blended independently and truncated to
// unsigned char; the .w component of every output pixel is set to 0.
__global__ void main_kernel(uchar4* d_dataC, uchar4* d_dataA, uchar4* d_dataB, int width, int height)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Only the upper bounds need checking. The previous `x > 0 && y > 0` guard
    // skipped row 0 and column 0, leaving those output pixels uninitialized.
    if (x >= width || y >= height)
        return;

    const float w = 0.2f;
    const size_t idx = (size_t)y * (size_t)width + (size_t)x;

    // Read each input pixel once instead of once per channel.
    const uchar4 a = d_dataA[idx];
    const uchar4 b = d_dataB[idx];

    uchar4 c;
    c.x = (unsigned char)(w * a.x + (1.0f - w) * b.x);
    c.y = (unsigned char)(w * a.y + (1.0f - w) * b.y);
    c.z = (unsigned char)(w * a.z + (1.0f - w) * b.z);
    c.w = 0;

    d_dataC[idx] = c;
}

35
// Input image paths passed to cvLoadImage.
// Declared const because they point at string literals.
const char *filenameA = "01.JPG";
const char *filenameB = "02.JPG";

// Copies an image with 3 interleaved 8-bit channels into a tightly packed
// uchar4 buffer (channel 0 -> .x, 1 -> .y, 2 -> .z, .w = 0).
// Returns a malloc'd buffer of width*height elements that the caller must
// free(), or NULL if the allocation fails.
static uchar4 *packImageToUchar4(const IplImage *img)
{
    const size_t pixelCount = (size_t)img->width * (size_t)img->height;
    uchar4 *packed = (uchar4 *)malloc(pixelCount * sizeof(uchar4));
    if (packed == NULL)
        return NULL;

    for (int row = 0; row < img->height; ++row)
    {
        // Rows are addressed through widthStep (bytes per row in the IplImage),
        // not width*3, exactly as the original indexing did.
        const unsigned char *src = (const unsigned char *)(img->imageData + img->widthStep * row);
        uchar4 *dst = packed + (size_t)row * (size_t)img->width;
        for (int col = 0; col < img->width; ++col)
        {
            dst[col].x = src[col * 3];
            dst[col].y = src[col * 3 + 1];
            dst[col].z = src[col * 3 + 2];
            dst[col].w = 0;
        }
    }
    return packed;
}

// Writes the .x/.y/.z components of a packed uchar4 buffer back into the
// three interleaved channels of img. The buffer must hold width*height pixels.
static void unpackUchar4ToImage(const uchar4 *packed, IplImage *img)
{
    for (int row = 0; row < img->height; ++row)
    {
        unsigned char *dst = (unsigned char *)(img->imageData + img->widthStep * row);
        const uchar4 *src = packed + (size_t)row * (size_t)img->width;
        for (int col = 0; col < img->width; ++col)
        {
            dst[col * 3]     = src[col].x;
            dst[col * 3 + 1] = src[col].y;
            dst[col * 3 + 2] = src[col].z;
        }
    }
}

// Loads the two input images, blends them on the GPU with main_kernel,
// prints the kernel time, and shows the result in a window until a key is
// pressed. Returns 0 on success and 1 if the inputs cannot be loaded, do not
// match, or a host allocation fails.
int main()
{
    IplImage *imageA = cvLoadImage(filenameA);
    IplImage *imageB = cvLoadImage(filenameB);

    // cvLoadImage returns NULL when the file is missing or unreadable.
    if (imageA == NULL || imageB == NULL)
    {
        fprintf(stderr, "failed to load %s and/or %s\n", filenameA, filenameB);
        if (imageA != NULL) cvReleaseImage(&imageA);
        if (imageB != NULL) cvReleaseImage(&imageB);
        return 1;
    }

    // The kernel indexes A, B and C with A's dimensions, and the packing code
    // reads 3 bytes per pixel, so reject anything else instead of reading or
    // writing out of bounds.
    if (imageA->width != imageB->width || imageA->height != imageB->height ||
        imageA->nChannels != 3 || imageB->nChannels != 3)
    {
        fprintf(stderr, "input images must both be 3-channel and have the same size\n");
        cvReleaseImage(&imageA);
        cvReleaseImage(&imageB);
        return 1;
    }

    const int width = imageA->width;
    const int height = imageA->height;
    const size_t numBytes = (size_t)width * (size_t)height * sizeof(uchar4);

    // Acquire every host-side resource up front so one failure path can
    // release them all.
    IplImage *imageC = cvCreateImage(cvGetSize(imageA), imageA->depth, imageA->nChannels);
    uchar4 *h_image_dataA = packImageToUchar4(imageA);
    uchar4 *h_image_dataB = packImageToUchar4(imageB);
    uchar4 *h_image_dataC = (uchar4 *)malloc(numBytes);
    if (imageC == NULL || h_image_dataA == NULL || h_image_dataB == NULL || h_image_dataC == NULL)
    {
        fprintf(stderr, "out of host memory\n");
        free(h_image_dataA);
        free(h_image_dataB);
        free(h_image_dataC);
        if (imageC != NULL) cvReleaseImage(&imageC);
        cvReleaseImage(&imageA);
        cvReleaseImage(&imageB);
        return 1;
    }

    uchar4 *d_dataA = NULL;
    uchar4 *d_dataB = NULL;
    uchar4 *d_dataC = NULL;
    CUDA_SAFE_CALL( cudaMalloc( (void**)&d_dataA, numBytes ) );
    CUDA_SAFE_CALL( cudaMalloc( (void**)&d_dataB, numBytes ) );
    CUDA_SAFE_CALL( cudaMalloc( (void**)&d_dataC, numBytes ) );

    CUDA_SAFE_CALL( cudaMemcpy( d_dataA, h_image_dataA, numBytes, cudaMemcpyHostToDevice ) );
    CUDA_SAFE_CALL( cudaMemcpy( d_dataB, h_image_dataB, numBytes, cudaMemcpyHostToDevice ) );

    // Time the kernel (launch + completion) with two independent clocks.
    LARGE_INTEGER frequency;
    LARGE_INTEGER start_time;
    LARGE_INTEGER end_time;
    QueryPerformanceFrequency(&frequency);
    const DWORD start = GetTickCount();
    QueryPerformanceCounter(&start_time);

    dim3 dimBlock(16, 16, 1);
    // Ceil-divide so partial tiles at the right/bottom edges are covered.
    dim3 dimGrid( (width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y );
    main_kernel<<<dimGrid, dimBlock, 0>>>(d_dataC, d_dataA, d_dataB, width, height);
    // A bad launch configuration is only reported through cudaGetLastError.
    CUDA_SAFE_CALL( cudaGetLastError() );
    // Block until the kernel finishes so the timestamps bracket real work.
    // (cudaThreadSynchronize is kept to match the cutil-era toolkit this file
    // targets; newer toolkits name it cudaDeviceSynchronize.)
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    QueryPerformanceCounter(&end_time);
    // Counter ticks must be scaled by the counter frequency to get time.
    const double diff_time = (double)(end_time.QuadPart - start_time.QuadPart) * 1000.0
                             / (double)frequency.QuadPart;
    printf( "time=%f ms\n", diff_time);

    // GetTickCount is in milliseconds; unsigned subtraction survives wrap-around.
    const double end = (double)(GetTickCount() - start) / 1000.0;
    printf("time used:%f s\n", end);

    CUDA_SAFE_CALL( cudaMemcpy( h_image_dataC, d_dataC, numBytes, cudaMemcpyDeviceToHost ) );
    unpackUchar4ToImage(h_image_dataC, imageC);

    cvNamedWindow("test", CV_WINDOW_AUTOSIZE);
    cvShowImage("test", imageC);

    cvWaitKey(0);
    cvDestroyAllWindows();

    CUDA_SAFE_CALL( cudaFree(d_dataA) );
    CUDA_SAFE_CALL( cudaFree(d_dataB) );
    CUDA_SAFE_CALL( cudaFree(d_dataC) );

    free(h_image_dataA);
    free(h_image_dataB);
    free(h_image_dataC);
    cvReleaseImage(&imageA);
    cvReleaseImage(&imageB);
    cvReleaseImage(&imageC);
    return 0;
}

