Actual source code: sveccuda.cu
slepc-3.12.2 2020-01-13

/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2019, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV implemented as a single Vec (CUDA version)
*/

#include <slepc/private/bvimpl.h>
#include "../src/sys/classes/bv/impls/svec/svec.h"
#include <petsccublas.h>

#if defined(PETSC_USE_COMPLEX)
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#endif

#define BLOCKSIZE 64
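
/* BLOCKSIZE is the default number of rows per block in BVMultInPlace_Svec_CUDA;
   it is recomputed from the free GPU memory when the full work matrix does not
   fit on the device */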

/*
    B := alpha*A + beta*B

    A,B are nxk (ld=n)
*/
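/* Implemented as a cuBLAS scal (B := beta*B) followed by an axpy (B += alpha*A),
   treating the n x k matrices as flat vectors of length n*k */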
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  PetscErrorCode ierr;
  PetscBLASInt   m,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(n_*k_,&m);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (beta!=(PetscScalar)1.0) {
    cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuFlops(1.0*m);CHKERRQ(ierr);
  }
  cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*m);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    C := alpha*A*B + beta*C
*/
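/* The active columns of X are multiplied by the block of Q with rows
   X->l..X->k-1 and columns Y->l..Y->k-1; Q lives on the host, so its array is
   copied to the device on each call. When Q is NULL the operation reduces to
   an AXPY on the whole active part */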
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*q,*d_q,*d_B,*d_C;
  PetscInt          ldq,mq;
  PetscBLASInt      m,n,k,ldq_;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  if (!Y->n) PetscFunctionReturn(0);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArray(y->v,&d_py);CHKERRQ(ierr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_C = d_py+(Y->nc+Y->l)*Y->n;
  if (Q) {
    ierr = PetscBLASIntCast(Y->n,&m);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(Y->k-Y->l,&n);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = MatGetSize(Q,&ldq,&mq);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
    ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
    err = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar));CHKERRQ(ierr);
    d_B = d_q+Y->l*ldq+X->l;
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq_,&beta,d_C,m);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
    err = cudaFree(d_q);CHKERRCUDA(err);
    ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  } else {
    ierr = BVAXPY_BLAS_CUDA(Y,Y->n,Y->k-Y->l,alpha,d_A,beta,d_C);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    y := alpha*A*x + beta*y
*/
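/* The coefficients q are copied to the device, or taken directly from the
   internal X->buffer (already on the device) when q is NULL; a single GEMV
   with the active columns of X then produces y */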
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscBLASInt      n,k,one=1;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(X->n,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArray(y,&d_py);CHKERRQ(ierr);
  }
  if (!q) {
    ierr = VecCUDAGetArray(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));
    cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);
    ierr = PetscLogCpuToGpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_q;
  d_y = d_py;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDARestoreArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDARestoreArray(y,&d_py);CHKERRQ(ierr);
  }
  if (!q) {
    ierr = VecCUDARestoreArray(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    cudaFree(d_q);
  }
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    A(:,s:e-1) := A*B(:,s:e-1)
*/
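/* GEMM cannot overwrite its own input, so the result is computed in a work
   array and copied back column by column. If an m*n work array does not fit
   in free GPU memory, the rows of A are processed in blocks of
   bs = freemem/(m*sizeof(PetscScalar)) rows, handling a remainder block of
   m % bs rows first; each block gets its own GEMM and copy-back */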
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       j,ldq,nq;
  PetscBLASInt   m,n,k,l,ldq_,bs=BLOCKSIZE;
  cublasStatus_t cberr;
  size_t         freemem,totmem;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(0);
  ierr = PetscBLASIntCast(V->n,&m);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(e-s,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(V->k-V->l,&k);CHKERRQ(ierr);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
  cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));
  cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);
  ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* try to allocate the whole matrix */
  cudaMemGetInfo(&freemem,&totmem);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));
    d_A = d_pv+(V->nc+V->l)*m;
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
    for (j=0;j<n;j++) {
      ierr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(ierr);
    }
  } else {
    bs = freemem/(m*sizeof(PetscScalar));
    cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));
    l = m % bs;
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,l);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        ierr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*l),l*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(ierr);
      }
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,bs);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        ierr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*bs),bs*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(ierr);
      }
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
  cudaFree(d_q);
  cudaFree(d_work);
  ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    A(:,s:e-1) := A*B(s:e-1,:)'
*/
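/* Unlike BVMultInPlace_Svec_CUDA, the whole m*n work array is allocated
   directly; Q is applied conjugate-transposed (CUBLAS_OP_C), reading rows
   s..e-1 of its active block */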
PetscErrorCode BVMultInPlaceTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       j,ldq,nq;
  PetscBLASInt   m,n,k,ldq_;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(0);
  ierr = PetscBLASIntCast(V->n,&m);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(e-s,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(V->k-V->l,&k);CHKERRQ(ierr);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));
  cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);
  ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));
  d_A = d_pv+(V->nc+V->l)*m;
  d_B = d_q+V->l*ldq+s;
  cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
  for (j=0;j<n;j++) {
    cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
  cudaFree(d_q);
  cudaFree(d_work);
  ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    C := A'*B
*/
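/* The GEMM result is copied back to the host and, in the parallel case,
   combined across processes with MPI_Allreduce: when the leading dimension of
   M equals m the reduction goes straight into M, otherwise it goes into a
   scratch array and is copied into M column by column. In the sequential case
   the columns are copied directly from X->work */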
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          j,ldm;
  PetscBLASInt      m,n,k,ldm_;
  PetscMPIInt       len;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(Y->k-Y->l,&m);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->n,&k);CHKERRQ(ierr);
  ierr = MatGetSize(M,&ldm,NULL);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(ldm,&ldm_);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = MatDenseGetArray(M,&pm);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;
  if (x->mpi) {
    if (ldm==m) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      if (k) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm_);CHKERRCUBLAS(cberr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
        ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPI_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    } else {
      ierr = BVAllocateWork_Private(X,2*m*n);CHKERRQ(ierr);
      CC = X->work+m*n;
      if (k) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
        ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPI_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
      for (j=0;j<n;j++) {
        ierr = PetscArraycpy(C+j*ldm,CC+j*m,m);CHKERRQ(ierr);
      }
    }
  } else {
    if (k) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      for (j=0;j<n;j++) {
        ierr = PetscArraycpy(C+j*ldm,X->work+j*m,m);CHKERRQ(ierr);
      }
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  cudaFree(d_work);
  ierr = MatDenseRestoreArray(M,&pm);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#if defined(PETSC_USE_COMPLEX)
struct conjugate
{
  __host__ __device__
  PetscScalar operator()(PetscScalar x)
  {
    return PetscConj(x);
  }
};
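
/* Conjugate the first n entries of a device array in place with Thrust; used
   to fix up the result after computing y' := x'*A with CUBLAS_OP_C, which
   yields conj(y) */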
PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
{
  PetscErrorCode                  ierr;
  thrust::device_ptr<PetscScalar> ptr;

  PetscFunctionBegin;
  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscFunctionReturn(0);
}
#endif

/*
    y := A'*x computed as y' := x'*A
*/
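/* The dot products are obtained from a single 1 x k GEMM of the row vector x'
   against the active columns of A; in complex arithmetic CUBLAS_OP_C produces
   conj(y), which ConjugateCudaArray repairs. If q is NULL the result is
   returned in the internal X->buffer, and if X has an inner-product matrix,
   y is first replaced by X->Bx = B*y */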
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscBLASInt      n,k,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(X->n,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  if (!q) {
    ierr = VecCUDAGetArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
  } else {
    cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (x->mpi) {
    ierr = BVAllocateWork_Private(X,k);CHKERRQ(ierr);
    if (n) {
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      ierr = PetscArrayzero(X->work,k);CHKERRQ(ierr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
      ierr = VecGetArray(X->buffer,&qq);CHKERRQ(ierr);
    } else {
      cudaFree(d_work);
    }
    ierr = PetscMPIIntCast(k,&len);CHKERRQ(ierr);
    ierr = MPI_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    if (!q) { ierr = VecRestoreArray(X->buffer,&qq);CHKERRQ(ierr); }
  } else {
    if (n) {
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
    } else {
      cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
      cudaFree(d_work);
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    y := A'*x computed as y' := x'*A
*/
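/* Local version of BVDotVec_Svec_CUDA: same kernel but without the MPI
   reduction, so each process returns the dot products of its local rows only */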
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscBLASInt      n,k,one=1;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(X->n,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (n) {
    cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
    ierr = ConjugateCudaArray(d_y,k);CHKERRQ(ierr);
#else
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
#endif
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
    ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
    cudaFree(d_y);
  }
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    Scale n scalars
*/
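/* If j<0 all active columns are scaled, otherwise only the j-th column;
   alpha=0 is handled with cudaMemset and alpha=1 is a no-op */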
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array,*d_A;
  PetscBLASInt   n,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(ctx->v,&d_array);CHKERRQ(ierr);
  if (j<0) {
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    ierr = PetscBLASIntCast((bv->k-bv->l)*bv->n,&n);CHKERRQ(ierr);
  } else {
    d_A = d_array+(bv->nc+j)*bv->n;
    ierr = PetscBLASIntCast(bv->n,&n);CHKERRQ(ierr);
  }
  if (alpha == (PetscScalar)0.0) {
    cudaMemset(d_A,0,n*sizeof(PetscScalar));
  } else if (alpha != (PetscScalar)1.0) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXscal(cublasv2handle,n,&alpha,d_A,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = PetscLogGpuFlops(1.0*n);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArray(ctx->v,&d_array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
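
/*
    W(:,l:k-1) := A*V(:,l:k-1), applying A one column at a time through the
    work vectors V->cv[1] and W->cv[1], whose device arrays are temporarily
    placed over the corresponding columns of the contiguous storage
*/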
PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  for (j=0;j<V->k-V->l;j++) {
    ierr = VecCUDAPlaceArray(V->cv[1],(PetscScalar *)d_pv+(V->nc+V->l+j)*V->n);CHKERRQ(ierr);
    ierr = VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);CHKERRQ(ierr);
    ierr = MatMult(A,V->cv[1],W->cv[1]);CHKERRQ(ierr);
    ierr = VecCUDAResetArray(V->cv[1]);CHKERRQ(ierr);
    ierr = VecCUDAResetArray(W->cv[1]);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
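
/*
    Copy the active columns of V onto the active columns of W with a single
    device-to-device memcpy (both BVs store their columns contiguously)
*/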
PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_pvc;
  PetscScalar       *d_pw,*d_pwc;
  cudaError_t       err;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  d_pvc = d_pv+(V->nc+V->l)*V->n;
  d_pwc = d_pw+(W->nc+W->l)*W->n;
  err = cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  PetscErrorCode ierr;
  BV_SVEC        *v = (BV_SVEC*)V->data;
  PetscScalar    *d_pv;
  cudaError_t    err;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(v->v,&d_pv);CHKERRQ(ierr);
  err = cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  ierr = VecCUDARestoreArray(v->v,&d_pv);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
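
/*
    For the right part of a split BV (bv->issplit==2) the array pointer is
    simply re-placed on the parent's storage; for a regular BV a new Vec of
    the requested size is created and, if copy is true, the old contents are
    copied device-to-device
*/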
PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew,*d_ptr;
  PetscInt          bs,lsplit;
  Vec               vnew,vpar;
  char              str[50];
  cudaError_t       err;
  BV                parent;

  PetscFunctionBegin;
  if (bv->issplit==2) {
    parent = bv->splitparent;
    lsplit = parent->lsplit;
    vpar = ((BV_SVEC*)parent->data)->v;
    ierr = VecCUDAResetArray(ctx->v);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(vpar,&d_ptr);CHKERRQ(ierr);
    ierr = VecCUDAPlaceArray(ctx->v,d_ptr+lsplit*bv->n);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(vpar,&d_ptr);CHKERRQ(ierr);
  } else if (!bv->issplit) {
    ierr = VecGetBlockSize(bv->t,&bs);CHKERRQ(ierr);
    ierr = VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);CHKERRQ(ierr);
    ierr = VecSetType(vnew,((PetscObject)bv->t)->type_name);CHKERRQ(ierr);
    ierr = VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(vnew,bs);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);CHKERRQ(ierr);
    if (((PetscObject)bv)->name) {
      ierr = PetscSNPrintf(str,50,"%s_0",((PetscObject)bv)->name);CHKERRQ(ierr);
      ierr = PetscObjectSetName((PetscObject)vnew,str);CHKERRQ(ierr);
    }
    if (copy) {
      ierr = VecCUDAGetArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDAGetArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
      err = cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      ierr = VecCUDARestoreArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
    }
    ierr = VecDestroy(&ctx->v);CHKERRQ(ierr);
    ctx->v = vnew;
  }
  PetscFunctionReturn(0);
}
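
/*
    Column j is exposed as a Vec by placing its device pointer in one of the
    two work vectors bv->cv[]; BVRestoreColumn_Svec_CUDA undoes this
*/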
PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_pv;
  PetscInt       l;

  PetscFunctionBegin;
  l = BVAvailableVec;
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscInt       l;

  PetscFunctionBegin;
  l = (j==bv->ci[0])? 0: 1;
  ierr = VecCUDAResetArray(bv->cv[l]);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(ctx->v,NULL);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
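
/*
    Touching the arrays of any modified split part, and then the parent's,
    forces the sync flag to PETSC_CUDA_BOTH so that subsequent accesses see
    consistent host and device copies
*/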
PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  PetscErrorCode    ierr;
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  PetscFunctionBegin;
  /* force sync flag to PETSC_CUDA_BOTH */
  if (L) {
    ierr = PetscObjectStateGet((PetscObject)*L,&lstate);CHKERRQ(ierr);
    if (lstate != bv->lstate) {
      v = ((BV_SVEC*)bv->L->data)->v;
      ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
      change = PETSC_TRUE;
    }
  }
  if (R) {
    ierr = PetscObjectStateGet((PetscObject)*R,&rstate);CHKERRQ(ierr);
    if (rstate != bv->rstate) {
      v = ((BV_SVEC*)bv->R->data)->v;
      ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
      change = PETSC_TRUE;
    }
  }
  if (change) {
    v = ((BV_SVEC*)bv->data)->v;
    ierr = VecCUDAGetArray(v,(PetscScalar **)&d_pv);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(v,(PetscScalar **)&d_pv);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}