Actual source code: baij2.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <../src/mat/impls/dense/seq/dense.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8: #include <immintrin.h>
9: #endif
11: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
12: {
13: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
14: PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival;
15: const PetscInt *idx;
16: PetscInt start, end, *ai, *aj, bs, *nidx2;
17: PetscBT table;
19: m = a->mbs;
20: ai = a->i;
21: aj = a->j;
22: bs = A->rmap->bs;
26: PetscBTCreate(m, &table);
27: PetscMalloc1(m + 1, &nidx);
28: PetscMalloc1(A->rmap->N + 1, &nidx2);
30: for (i = 0; i < is_max; i++) {
31: /* Initialise the two local arrays */
32: isz = 0;
33: PetscBTMemzero(m, table);
35: /* Extract the indices, assume there can be duplicate entries */
36: ISGetIndices(is[i], &idx);
37: ISGetLocalSize(is[i], &n);
39: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40: for (j = 0; j < n; ++j) {
41: ival = idx[j] / bs; /* convert the indices into block indices */
43: if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
44: }
45: ISRestoreIndices(is[i], &idx);
46: ISDestroy(&is[i]);
48: k = 0;
49: for (j = 0; j < ov; j++) { /* for each overlap*/
50: n = isz;
51: for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
52: row = nidx[k];
53: start = ai[row];
54: end = ai[row + 1];
55: for (l = start; l < end; l++) {
56: val = aj[l];
57: if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
58: }
59: }
60: }
61: /* expand the Index Set */
62: for (j = 0; j < isz; j++) {
63: for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k;
64: }
65: ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i);
66: }
67: PetscBTDestroy(&table);
68: PetscFree(nidx);
69: PetscFree(nidx2);
70: return 0;
71: }
73: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
74: {
75: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c;
76: PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
77: PetscInt row, mat_i, *mat_j, tcol, *mat_ilen;
78: const PetscInt *irow, *icol;
79: PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
80: PetscInt *aj = a->j, *ai = a->i;
81: MatScalar *mat_a;
82: Mat C;
83: PetscBool flag;
85: ISGetIndices(isrow, &irow);
86: ISGetIndices(iscol, &icol);
87: ISGetLocalSize(isrow, &nrows);
88: ISGetLocalSize(iscol, &ncols);
90: PetscCalloc1(1 + oldcols, &smap);
91: ssmap = smap;
92: PetscMalloc1(1 + nrows, &lens);
93: for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
94: /* determine lens of each row */
95: for (i = 0; i < nrows; i++) {
96: kstart = ai[irow[i]];
97: kend = kstart + a->ilen[irow[i]];
98: lens[i] = 0;
99: for (k = kstart; k < kend; k++) {
100: if (ssmap[aj[k]]) lens[i]++;
101: }
102: }
103: /* Create and fill new matrix */
104: if (scall == MAT_REUSE_MATRIX) {
105: c = (Mat_SeqBAIJ *)((*B)->data);
108: PetscArraycmp(c->ilen, lens, c->mbs, &flag);
110: PetscArrayzero(c->ilen, c->mbs);
111: C = *B;
112: } else {
113: MatCreate(PetscObjectComm((PetscObject)A), &C);
114: MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE);
115: MatSetType(C, ((PetscObject)A)->type_name);
116: MatSeqBAIJSetPreallocation(C, bs, 0, lens);
117: }
118: c = (Mat_SeqBAIJ *)(C->data);
119: for (i = 0; i < nrows; i++) {
120: row = irow[i];
121: kstart = ai[row];
122: kend = kstart + a->ilen[row];
123: mat_i = c->i[i];
124: mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */
125: mat_a = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
126: mat_ilen = c->ilen + i;
127: for (k = kstart; k < kend; k++) {
128: if ((tcol = ssmap[a->j[k]])) {
129: *mat_j++ = tcol - 1;
130: PetscArraycpy(mat_a, a->a + k * bs2, bs2);
131: mat_a += bs2;
132: (*mat_ilen)++;
133: }
134: }
135: }
136: /* sort */
137: if (c->j && c->a) {
138: MatScalar *work;
139: PetscMalloc1(bs2, &work);
140: for (i = 0; i < nrows; i++) {
141: PetscInt ilen;
142: mat_i = c->i[i];
143: mat_j = c->j + mat_i;
144: mat_a = c->a + mat_i * bs2;
145: ilen = c->ilen[i];
146: PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work);
147: }
148: PetscFree(work);
149: }
151: /* Free work space */
152: ISRestoreIndices(iscol, &icol);
153: PetscFree(smap);
154: PetscFree(lens);
155: MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
156: MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);
158: ISRestoreIndices(isrow, &irow);
159: *B = C;
160: return 0;
161: }
163: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
164: {
165: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
166: IS is1, is2;
167: PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
168: const PetscInt *irow, *icol;
170: ISGetIndices(isrow, &irow);
171: ISGetIndices(iscol, &icol);
172: ISGetLocalSize(isrow, &nrows);
173: ISGetLocalSize(iscol, &ncols);
175: /* Verify if the indices corespond to each element in a block
176: and form the IS with compressed IS */
177: maxmnbs = PetscMax(a->mbs, a->nbs);
178: PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary);
179: PetscArrayzero(vary, a->mbs);
180: for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
182: count = 0;
183: for (i = 0; i < nrows; i++) {
184: j = irow[i] / bs;
185: if ((vary[j]--) == bs) iary[count++] = j;
186: }
187: ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1);
189: PetscArrayzero(vary, a->nbs);
190: for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
192: count = 0;
193: for (i = 0; i < ncols; i++) {
194: j = icol[i] / bs;
195: if ((vary[j]--) == bs) iary[count++] = j;
196: }
197: ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2);
198: ISRestoreIndices(isrow, &irow);
199: ISRestoreIndices(iscol, &icol);
200: PetscFree2(vary, iary);
202: MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B);
203: ISDestroy(&is1);
204: ISDestroy(&is2);
205: return 0;
206: }
208: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
209: {
210: Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data;
211: Mat_SubSppt *submatj = c->submatis1;
213: (*submatj->destroy)(C);
214: MatDestroySubMatrix_Private(submatj);
215: return 0;
216: }
218: /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
219: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
220: {
221: PetscInt i;
222: Mat C;
223: Mat_SeqBAIJ *c;
224: Mat_SubSppt *submatj;
226: for (i = 0; i < n; i++) {
227: C = (*mat)[i];
228: c = (Mat_SeqBAIJ *)C->data;
229: submatj = c->submatis1;
230: if (submatj) {
231: if (--((PetscObject)C)->refct <= 0) {
232: PetscFree(C->factorprefix);
233: (*submatj->destroy)(C);
234: MatDestroySubMatrix_Private(submatj);
235: PetscFree(C->defaultvectype);
236: PetscFree(C->defaultrandtype);
237: PetscLayoutDestroy(&C->rmap);
238: PetscLayoutDestroy(&C->cmap);
239: PetscHeaderDestroy(&C);
240: }
241: } else {
242: MatDestroy(&C);
243: }
244: }
246: /* Destroy Dummy submatrices created for reuse */
247: MatDestroySubMatrices_Dummy(n, mat);
249: PetscFree(*mat);
250: return 0;
251: }
253: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
254: {
255: PetscInt i;
257: if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(n + 1, B);
259: for (i = 0; i < n; i++) MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]);
260: return 0;
261: }
263: /* -------------------------------------------------------*/
264: /* Should check that shapes of vectors and matrices match */
265: /* -------------------------------------------------------*/
267: PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
268: {
269: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
270: PetscScalar *z, sum;
271: const PetscScalar *x;
272: const MatScalar *v;
273: PetscInt mbs, i, n;
274: const PetscInt *idx, *ii, *ridx = NULL;
275: PetscBool usecprow = a->compressedrow.use;
277: VecGetArrayRead(xx, &x);
278: VecGetArrayWrite(zz, &z);
280: if (usecprow) {
281: mbs = a->compressedrow.nrows;
282: ii = a->compressedrow.i;
283: ridx = a->compressedrow.rindex;
284: PetscArrayzero(z, a->mbs);
285: } else {
286: mbs = a->mbs;
287: ii = a->i;
288: }
290: for (i = 0; i < mbs; i++) {
291: n = ii[1] - ii[0];
292: v = a->a + ii[0];
293: idx = a->j + ii[0];
294: ii++;
295: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
296: PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
297: sum = 0.0;
298: PetscSparseDensePlusDot(sum, x, v, idx, n);
299: if (usecprow) {
300: z[ridx[i]] = sum;
301: } else {
302: z[i] = sum;
303: }
304: }
305: VecRestoreArrayRead(xx, &x);
306: VecRestoreArrayWrite(zz, &z);
307: PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt);
308: return 0;
309: }
311: PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
312: {
313: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
314: PetscScalar *z = NULL, sum1, sum2, *zarray;
315: const PetscScalar *x, *xb;
316: PetscScalar x1, x2;
317: const MatScalar *v;
318: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
319: PetscBool usecprow = a->compressedrow.use;
321: VecGetArrayRead(xx, &x);
322: VecGetArrayWrite(zz, &zarray);
324: idx = a->j;
325: v = a->a;
326: if (usecprow) {
327: mbs = a->compressedrow.nrows;
328: ii = a->compressedrow.i;
329: ridx = a->compressedrow.rindex;
330: PetscArrayzero(zarray, 2 * a->mbs);
331: } else {
332: mbs = a->mbs;
333: ii = a->i;
334: z = zarray;
335: }
337: for (i = 0; i < mbs; i++) {
338: n = ii[1] - ii[0];
339: ii++;
340: sum1 = 0.0;
341: sum2 = 0.0;
342: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
343: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
344: for (j = 0; j < n; j++) {
345: xb = x + 2 * (*idx++);
346: x1 = xb[0];
347: x2 = xb[1];
348: sum1 += v[0] * x1 + v[2] * x2;
349: sum2 += v[1] * x1 + v[3] * x2;
350: v += 4;
351: }
352: if (usecprow) z = zarray + 2 * ridx[i];
353: z[0] = sum1;
354: z[1] = sum2;
355: if (!usecprow) z += 2;
356: }
357: VecRestoreArrayRead(xx, &x);
358: VecRestoreArrayWrite(zz, &zarray);
359: PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt);
360: return 0;
361: }
363: PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
364: {
365: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
366: PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
367: const PetscScalar *x, *xb;
368: const MatScalar *v;
369: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
370: PetscBool usecprow = a->compressedrow.use;
372: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
373: #pragma disjoint(*v, *z, *xb)
374: #endif
376: VecGetArrayRead(xx, &x);
377: VecGetArrayWrite(zz, &zarray);
379: idx = a->j;
380: v = a->a;
381: if (usecprow) {
382: mbs = a->compressedrow.nrows;
383: ii = a->compressedrow.i;
384: ridx = a->compressedrow.rindex;
385: PetscArrayzero(zarray, 3 * a->mbs);
386: } else {
387: mbs = a->mbs;
388: ii = a->i;
389: z = zarray;
390: }
392: for (i = 0; i < mbs; i++) {
393: n = ii[1] - ii[0];
394: ii++;
395: sum1 = 0.0;
396: sum2 = 0.0;
397: sum3 = 0.0;
398: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
399: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
400: for (j = 0; j < n; j++) {
401: xb = x + 3 * (*idx++);
402: x1 = xb[0];
403: x2 = xb[1];
404: x3 = xb[2];
406: sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
407: sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
408: sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
409: v += 9;
410: }
411: if (usecprow) z = zarray + 3 * ridx[i];
412: z[0] = sum1;
413: z[1] = sum2;
414: z[2] = sum3;
415: if (!usecprow) z += 3;
416: }
417: VecRestoreArrayRead(xx, &x);
418: VecRestoreArrayWrite(zz, &zarray);
419: PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt);
420: return 0;
421: }
423: PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
424: {
425: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
426: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
427: const PetscScalar *x, *xb;
428: const MatScalar *v;
429: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
430: PetscBool usecprow = a->compressedrow.use;
432: VecGetArrayRead(xx, &x);
433: VecGetArrayWrite(zz, &zarray);
435: idx = a->j;
436: v = a->a;
437: if (usecprow) {
438: mbs = a->compressedrow.nrows;
439: ii = a->compressedrow.i;
440: ridx = a->compressedrow.rindex;
441: PetscArrayzero(zarray, 4 * a->mbs);
442: } else {
443: mbs = a->mbs;
444: ii = a->i;
445: z = zarray;
446: }
448: for (i = 0; i < mbs; i++) {
449: n = ii[1] - ii[0];
450: ii++;
451: sum1 = 0.0;
452: sum2 = 0.0;
453: sum3 = 0.0;
454: sum4 = 0.0;
456: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
457: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
458: for (j = 0; j < n; j++) {
459: xb = x + 4 * (*idx++);
460: x1 = xb[0];
461: x2 = xb[1];
462: x3 = xb[2];
463: x4 = xb[3];
464: sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
465: sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
466: sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
467: sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
468: v += 16;
469: }
470: if (usecprow) z = zarray + 4 * ridx[i];
471: z[0] = sum1;
472: z[1] = sum2;
473: z[2] = sum3;
474: z[3] = sum4;
475: if (!usecprow) z += 4;
476: }
477: VecRestoreArrayRead(xx, &x);
478: VecRestoreArrayWrite(zz, &zarray);
479: PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt);
480: return 0;
481: }
483: PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
484: {
485: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
486: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
487: const PetscScalar *xb, *x;
488: const MatScalar *v;
489: const PetscInt *idx, *ii, *ridx = NULL;
490: PetscInt mbs, i, j, n;
491: PetscBool usecprow = a->compressedrow.use;
493: VecGetArrayRead(xx, &x);
494: VecGetArrayWrite(zz, &zarray);
496: idx = a->j;
497: v = a->a;
498: if (usecprow) {
499: mbs = a->compressedrow.nrows;
500: ii = a->compressedrow.i;
501: ridx = a->compressedrow.rindex;
502: PetscArrayzero(zarray, 5 * a->mbs);
503: } else {
504: mbs = a->mbs;
505: ii = a->i;
506: z = zarray;
507: }
509: for (i = 0; i < mbs; i++) {
510: n = ii[1] - ii[0];
511: ii++;
512: sum1 = 0.0;
513: sum2 = 0.0;
514: sum3 = 0.0;
515: sum4 = 0.0;
516: sum5 = 0.0;
517: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
518: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
519: for (j = 0; j < n; j++) {
520: xb = x + 5 * (*idx++);
521: x1 = xb[0];
522: x2 = xb[1];
523: x3 = xb[2];
524: x4 = xb[3];
525: x5 = xb[4];
526: sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
527: sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
528: sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
529: sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
530: sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
531: v += 25;
532: }
533: if (usecprow) z = zarray + 5 * ridx[i];
534: z[0] = sum1;
535: z[1] = sum2;
536: z[2] = sum3;
537: z[3] = sum4;
538: z[4] = sum5;
539: if (!usecprow) z += 5;
540: }
541: VecRestoreArrayRead(xx, &x);
542: VecRestoreArrayWrite(zz, &zarray);
543: PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt);
544: return 0;
545: }
547: PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
548: {
549: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
550: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
551: const PetscScalar *x, *xb;
552: PetscScalar x1, x2, x3, x4, x5, x6, *zarray;
553: const MatScalar *v;
554: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
555: PetscBool usecprow = a->compressedrow.use;
557: VecGetArrayRead(xx, &x);
558: VecGetArrayWrite(zz, &zarray);
560: idx = a->j;
561: v = a->a;
562: if (usecprow) {
563: mbs = a->compressedrow.nrows;
564: ii = a->compressedrow.i;
565: ridx = a->compressedrow.rindex;
566: PetscArrayzero(zarray, 6 * a->mbs);
567: } else {
568: mbs = a->mbs;
569: ii = a->i;
570: z = zarray;
571: }
573: for (i = 0; i < mbs; i++) {
574: n = ii[1] - ii[0];
575: ii++;
576: sum1 = 0.0;
577: sum2 = 0.0;
578: sum3 = 0.0;
579: sum4 = 0.0;
580: sum5 = 0.0;
581: sum6 = 0.0;
583: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
584: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
585: for (j = 0; j < n; j++) {
586: xb = x + 6 * (*idx++);
587: x1 = xb[0];
588: x2 = xb[1];
589: x3 = xb[2];
590: x4 = xb[3];
591: x5 = xb[4];
592: x6 = xb[5];
593: sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
594: sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
595: sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
596: sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
597: sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
598: sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
599: v += 36;
600: }
601: if (usecprow) z = zarray + 6 * ridx[i];
602: z[0] = sum1;
603: z[1] = sum2;
604: z[2] = sum3;
605: z[3] = sum4;
606: z[4] = sum5;
607: z[5] = sum6;
608: if (!usecprow) z += 6;
609: }
611: VecRestoreArrayRead(xx, &x);
612: VecRestoreArrayWrite(zz, &zarray);
613: PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt);
614: return 0;
615: }
617: PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
618: {
619: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
620: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
621: const PetscScalar *x, *xb;
622: PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray;
623: const MatScalar *v;
624: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
625: PetscBool usecprow = a->compressedrow.use;
627: VecGetArrayRead(xx, &x);
628: VecGetArrayWrite(zz, &zarray);
630: idx = a->j;
631: v = a->a;
632: if (usecprow) {
633: mbs = a->compressedrow.nrows;
634: ii = a->compressedrow.i;
635: ridx = a->compressedrow.rindex;
636: PetscArrayzero(zarray, 7 * a->mbs);
637: } else {
638: mbs = a->mbs;
639: ii = a->i;
640: z = zarray;
641: }
643: for (i = 0; i < mbs; i++) {
644: n = ii[1] - ii[0];
645: ii++;
646: sum1 = 0.0;
647: sum2 = 0.0;
648: sum3 = 0.0;
649: sum4 = 0.0;
650: sum5 = 0.0;
651: sum6 = 0.0;
652: sum7 = 0.0;
654: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
655: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
656: for (j = 0; j < n; j++) {
657: xb = x + 7 * (*idx++);
658: x1 = xb[0];
659: x2 = xb[1];
660: x3 = xb[2];
661: x4 = xb[3];
662: x5 = xb[4];
663: x6 = xb[5];
664: x7 = xb[6];
665: sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
666: sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
667: sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
668: sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
669: sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
670: sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
671: sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
672: v += 49;
673: }
674: if (usecprow) z = zarray + 7 * ridx[i];
675: z[0] = sum1;
676: z[1] = sum2;
677: z[2] = sum3;
678: z[3] = sum4;
679: z[4] = sum5;
680: z[5] = sum6;
681: z[6] = sum7;
682: if (!usecprow) z += 7;
683: }
685: VecRestoreArrayRead(xx, &x);
686: VecRestoreArrayWrite(zz, &zarray);
687: PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt);
688: return 0;
689: }
691: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
692: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
693: {
694: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
695: PetscScalar *z = NULL, *work, *workt, *zarray;
696: const PetscScalar *x, *xb;
697: const MatScalar *v;
698: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
699: const PetscInt *idx, *ii, *ridx = NULL;
700: PetscInt k;
701: PetscBool usecprow = a->compressedrow.use;
703: __m256d a0, a1, a2, a3, a4, a5;
704: __m256d w0, w1, w2, w3;
705: __m256d z0, z1, z2;
706: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
708: VecGetArrayRead(xx, &x);
709: VecGetArrayWrite(zz, &zarray);
711: idx = a->j;
712: v = a->a;
713: if (usecprow) {
714: mbs = a->compressedrow.nrows;
715: ii = a->compressedrow.i;
716: ridx = a->compressedrow.rindex;
717: PetscArrayzero(zarray, bs * a->mbs);
718: } else {
719: mbs = a->mbs;
720: ii = a->i;
721: z = zarray;
722: }
724: if (!a->mult_work) {
725: k = PetscMax(A->rmap->n, A->cmap->n);
726: PetscMalloc1(k + 1, &a->mult_work);
727: }
729: work = a->mult_work;
730: for (i = 0; i < mbs; i++) {
731: n = ii[1] - ii[0];
732: ii++;
733: workt = work;
734: for (j = 0; j < n; j++) {
735: xb = x + bs * (*idx++);
736: for (k = 0; k < bs; k++) workt[k] = xb[k];
737: workt += bs;
738: }
739: if (usecprow) z = zarray + bs * ridx[i];
741: z0 = _mm256_setzero_pd();
742: z1 = _mm256_setzero_pd();
743: z2 = _mm256_setzero_pd();
745: for (j = 0; j < n; j++) {
746: /* first column of a */
747: w0 = _mm256_set1_pd(work[j * 9]);
748: a0 = _mm256_loadu_pd(&v[j * 81]);
749: z0 = _mm256_fmadd_pd(a0, w0, z0);
750: a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
751: z1 = _mm256_fmadd_pd(a1, w0, z1);
752: a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
753: z2 = _mm256_fmadd_pd(a2, w0, z2);
755: /* second column of a */
756: w1 = _mm256_set1_pd(work[j * 9 + 1]);
757: a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
758: z0 = _mm256_fmadd_pd(a0, w1, z0);
759: a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
760: z1 = _mm256_fmadd_pd(a1, w1, z1);
761: a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
762: z2 = _mm256_fmadd_pd(a2, w1, z2);
764: /* third column of a */
765: w2 = _mm256_set1_pd(work[j * 9 + 2]);
766: a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
767: z0 = _mm256_fmadd_pd(a3, w2, z0);
768: a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
769: z1 = _mm256_fmadd_pd(a4, w2, z1);
770: a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
771: z2 = _mm256_fmadd_pd(a5, w2, z2);
773: /* fourth column of a */
774: w3 = _mm256_set1_pd(work[j * 9 + 3]);
775: a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
776: z0 = _mm256_fmadd_pd(a0, w3, z0);
777: a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
778: z1 = _mm256_fmadd_pd(a1, w3, z1);
779: a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
780: z2 = _mm256_fmadd_pd(a2, w3, z2);
782: /* fifth column of a */
783: w0 = _mm256_set1_pd(work[j * 9 + 4]);
784: a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
785: z0 = _mm256_fmadd_pd(a3, w0, z0);
786: a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
787: z1 = _mm256_fmadd_pd(a4, w0, z1);
788: a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
789: z2 = _mm256_fmadd_pd(a5, w0, z2);
791: /* sixth column of a */
792: w1 = _mm256_set1_pd(work[j * 9 + 5]);
793: a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
794: z0 = _mm256_fmadd_pd(a0, w1, z0);
795: a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
796: z1 = _mm256_fmadd_pd(a1, w1, z1);
797: a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
798: z2 = _mm256_fmadd_pd(a2, w1, z2);
800: /* seventh column of a */
801: w2 = _mm256_set1_pd(work[j * 9 + 6]);
802: a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
803: z0 = _mm256_fmadd_pd(a0, w2, z0);
804: a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
805: z1 = _mm256_fmadd_pd(a1, w2, z1);
806: a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
807: z2 = _mm256_fmadd_pd(a2, w2, z2);
809: /* eighth column of a */
810: w3 = _mm256_set1_pd(work[j * 9 + 7]);
811: a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
812: z0 = _mm256_fmadd_pd(a3, w3, z0);
813: a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
814: z1 = _mm256_fmadd_pd(a4, w3, z1);
815: a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
816: z2 = _mm256_fmadd_pd(a5, w3, z2);
818: /* ninth column of a */
819: w0 = _mm256_set1_pd(work[j * 9 + 8]);
820: a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
821: z0 = _mm256_fmadd_pd(a0, w0, z0);
822: a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
823: z1 = _mm256_fmadd_pd(a1, w0, z1);
824: a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
825: z2 = _mm256_fmadd_pd(a2, w0, z2);
826: }
828: _mm256_storeu_pd(&z[0], z0);
829: _mm256_storeu_pd(&z[4], z1);
830: _mm256_maskstore_pd(&z[8], mask1, z2);
832: v += n * bs2;
833: if (!usecprow) z += bs;
834: }
835: VecRestoreArrayRead(xx, &x);
836: VecRestoreArrayWrite(zz, &zarray);
837: PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt);
838: return 0;
839: }
840: #endif
842: PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
843: {
844: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
845: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
846: const PetscScalar *x, *xb;
847: PetscScalar *zarray, xv;
848: const MatScalar *v;
849: const PetscInt *ii, *ij = a->j, *idx;
850: PetscInt mbs, i, j, k, n, *ridx = NULL;
851: PetscBool usecprow = a->compressedrow.use;
853: VecGetArrayRead(xx, &x);
854: VecGetArrayWrite(zz, &zarray);
856: v = a->a;
857: if (usecprow) {
858: mbs = a->compressedrow.nrows;
859: ii = a->compressedrow.i;
860: ridx = a->compressedrow.rindex;
861: PetscArrayzero(zarray, 11 * a->mbs);
862: } else {
863: mbs = a->mbs;
864: ii = a->i;
865: z = zarray;
866: }
868: for (i = 0; i < mbs; i++) {
869: n = ii[i + 1] - ii[i];
870: idx = ij + ii[i];
871: sum1 = 0.0;
872: sum2 = 0.0;
873: sum3 = 0.0;
874: sum4 = 0.0;
875: sum5 = 0.0;
876: sum6 = 0.0;
877: sum7 = 0.0;
878: sum8 = 0.0;
879: sum9 = 0.0;
880: sum10 = 0.0;
881: sum11 = 0.0;
883: for (j = 0; j < n; j++) {
884: xb = x + 11 * (idx[j]);
886: for (k = 0; k < 11; k++) {
887: xv = xb[k];
888: sum1 += v[0] * xv;
889: sum2 += v[1] * xv;
890: sum3 += v[2] * xv;
891: sum4 += v[3] * xv;
892: sum5 += v[4] * xv;
893: sum6 += v[5] * xv;
894: sum7 += v[6] * xv;
895: sum8 += v[7] * xv;
896: sum9 += v[8] * xv;
897: sum10 += v[9] * xv;
898: sum11 += v[10] * xv;
899: v += 11;
900: }
901: }
902: if (usecprow) z = zarray + 11 * ridx[i];
903: z[0] = sum1;
904: z[1] = sum2;
905: z[2] = sum3;
906: z[3] = sum4;
907: z[4] = sum5;
908: z[5] = sum6;
909: z[6] = sum7;
910: z[7] = sum8;
911: z[8] = sum9;
912: z[9] = sum10;
913: z[10] = sum11;
915: if (!usecprow) z += 11;
916: }
918: VecRestoreArrayRead(xx, &x);
919: VecRestoreArrayWrite(zz, &zarray);
920: PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt);
921: return 0;
922: }
924: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
925: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
926: {
927: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
928: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
929: const PetscScalar *x, *xb;
930: PetscScalar *zarray, xv;
931: const MatScalar *v;
932: const PetscInt *ii, *ij = a->j, *idx;
933: PetscInt mbs, i, j, k, n, *ridx = NULL;
934: PetscBool usecprow = a->compressedrow.use;
936: VecGetArrayRead(xx, &x);
937: VecGetArrayWrite(zz, &zarray);
939: v = a->a;
940: if (usecprow) {
941: mbs = a->compressedrow.nrows;
942: ii = a->compressedrow.i;
943: ridx = a->compressedrow.rindex;
944: PetscArrayzero(zarray, 12 * a->mbs);
945: } else {
946: mbs = a->mbs;
947: ii = a->i;
948: z = zarray;
949: }
951: for (i = 0; i < mbs; i++) {
952: n = ii[i + 1] - ii[i];
953: idx = ij + ii[i];
954: sum1 = 0.0;
955: sum2 = 0.0;
956: sum3 = 0.0;
957: sum4 = 0.0;
958: sum5 = 0.0;
959: sum6 = 0.0;
960: sum7 = 0.0;
961: sum8 = 0.0;
962: sum9 = 0.0;
963: sum10 = 0.0;
964: sum11 = 0.0;
965: sum12 = 0.0;
967: for (j = 0; j < n; j++) {
968: xb = x + 12 * (idx[j]);
970: for (k = 0; k < 12; k++) {
971: xv = xb[k];
972: sum1 += v[0] * xv;
973: sum2 += v[1] * xv;
974: sum3 += v[2] * xv;
975: sum4 += v[3] * xv;
976: sum5 += v[4] * xv;
977: sum6 += v[5] * xv;
978: sum7 += v[6] * xv;
979: sum8 += v[7] * xv;
980: sum9 += v[8] * xv;
981: sum10 += v[9] * xv;
982: sum11 += v[10] * xv;
983: sum12 += v[11] * xv;
984: v += 12;
985: }
986: }
987: if (usecprow) z = zarray + 12 * ridx[i];
988: z[0] = sum1;
989: z[1] = sum2;
990: z[2] = sum3;
991: z[3] = sum4;
992: z[4] = sum5;
993: z[5] = sum6;
994: z[6] = sum7;
995: z[7] = sum8;
996: z[8] = sum9;
997: z[9] = sum10;
998: z[10] = sum11;
999: z[11] = sum12;
1000: if (!usecprow) z += 12;
1001: }
1002: VecRestoreArrayRead(xx, &x);
1003: VecRestoreArrayWrite(zz, &zarray);
1004: PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
1005: return 0;
1006: }
1008: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
1009: {
1010: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1011: PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1012: const PetscScalar *x, *xb;
1013: PetscScalar *zarray, *yarray, xv;
1014: const MatScalar *v;
1015: const PetscInt *ii, *ij = a->j, *idx;
1016: PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL;
1017: PetscBool usecprow = a->compressedrow.use;
1019: VecGetArrayRead(xx, &x);
1020: VecGetArrayPair(yy, zz, &yarray, &zarray);
1022: v = a->a;
1023: if (usecprow) {
1024: if (zz != yy) PetscArraycpy(zarray, yarray, 12 * mbs);
1025: mbs = a->compressedrow.nrows;
1026: ii = a->compressedrow.i;
1027: ridx = a->compressedrow.rindex;
1028: } else {
1029: ii = a->i;
1030: y = yarray;
1031: z = zarray;
1032: }
1034: for (i = 0; i < mbs; i++) {
1035: n = ii[i + 1] - ii[i];
1036: idx = ij + ii[i];
1038: if (usecprow) {
1039: y = yarray + 12 * ridx[i];
1040: z = zarray + 12 * ridx[i];
1041: }
1042: sum1 = y[0];
1043: sum2 = y[1];
1044: sum3 = y[2];
1045: sum4 = y[3];
1046: sum5 = y[4];
1047: sum6 = y[5];
1048: sum7 = y[6];
1049: sum8 = y[7];
1050: sum9 = y[8];
1051: sum10 = y[9];
1052: sum11 = y[10];
1053: sum12 = y[11];
1055: for (j = 0; j < n; j++) {
1056: xb = x + 12 * (idx[j]);
1058: for (k = 0; k < 12; k++) {
1059: xv = xb[k];
1060: sum1 += v[0] * xv;
1061: sum2 += v[1] * xv;
1062: sum3 += v[2] * xv;
1063: sum4 += v[3] * xv;
1064: sum5 += v[4] * xv;
1065: sum6 += v[5] * xv;
1066: sum7 += v[6] * xv;
1067: sum8 += v[7] * xv;
1068: sum9 += v[8] * xv;
1069: sum10 += v[9] * xv;
1070: sum11 += v[10] * xv;
1071: sum12 += v[11] * xv;
1072: v += 12;
1073: }
1074: }
1076: z[0] = sum1;
1077: z[1] = sum2;
1078: z[2] = sum3;
1079: z[3] = sum4;
1080: z[4] = sum5;
1081: z[5] = sum6;
1082: z[6] = sum7;
1083: z[7] = sum8;
1084: z[8] = sum9;
1085: z[9] = sum10;
1086: z[10] = sum11;
1087: z[11] = sum12;
1088: if (!usecprow) {
1089: y += 12;
1090: z += 12;
1091: }
1092: }
1093: VecRestoreArrayRead(xx, &x);
1094: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
1095: PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
1096: return 0;
1097: }
1099: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1100: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1101: {
1102: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1103: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1104: const PetscScalar *x, *xb;
1105: PetscScalar x1, x2, x3, x4, *zarray;
1106: const MatScalar *v;
1107: const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL;
1108: PetscInt mbs, i, j, n;
1109: PetscBool usecprow = a->compressedrow.use;
1111: VecGetArrayRead(xx, &x);
1112: VecGetArrayWrite(zz, &zarray);
1114: v = a->a;
1115: if (usecprow) {
1116: mbs = a->compressedrow.nrows;
1117: ii = a->compressedrow.i;
1118: ridx = a->compressedrow.rindex;
1119: PetscArrayzero(zarray, 12 * a->mbs);
1120: } else {
1121: mbs = a->mbs;
1122: ii = a->i;
1123: z = zarray;
1124: }
1126: for (i = 0; i < mbs; i++) {
1127: n = ii[i + 1] - ii[i];
1128: idx = ij + ii[i];
1130: sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1131: for (j = 0; j < n; j++) {
1132: xb = x + 12 * (idx[j]);
1133: x1 = xb[0];
1134: x2 = xb[1];
1135: x3 = xb[2];
1136: x4 = xb[3];
1138: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1139: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1140: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1141: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1142: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1143: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1144: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1145: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1146: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1147: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1148: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1149: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1150: v += 48;
1152: x1 = xb[4];
1153: x2 = xb[5];
1154: x3 = xb[6];
1155: x4 = xb[7];
1157: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1158: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1159: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1160: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1161: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1162: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1163: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1164: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1165: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1166: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1167: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1168: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1169: v += 48;
1171: x1 = xb[8];
1172: x2 = xb[9];
1173: x3 = xb[10];
1174: x4 = xb[11];
1175: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1176: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1177: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1178: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1179: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1180: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1181: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1182: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1183: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1184: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1185: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1186: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1187: v += 48;
1188: }
1189: if (usecprow) z = zarray + 12 * ridx[i];
1190: z[0] = sum1;
1191: z[1] = sum2;
1192: z[2] = sum3;
1193: z[3] = sum4;
1194: z[4] = sum5;
1195: z[5] = sum6;
1196: z[6] = sum7;
1197: z[7] = sum8;
1198: z[8] = sum9;
1199: z[9] = sum10;
1200: z[10] = sum11;
1201: z[11] = sum12;
1202: if (!usecprow) z += 12;
1203: }
1204: VecRestoreArrayRead(xx, &x);
1205: VecRestoreArrayWrite(zz, &zarray);
1206: PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
1207: return 0;
1208: }
1210: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1211: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1212: {
1213: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1214: PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1215: const PetscScalar *x, *xb;
1216: PetscScalar x1, x2, x3, x4, *zarray, *yarray;
1217: const MatScalar *v;
1218: const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL;
1219: PetscInt mbs = a->mbs, i, j, n;
1220: PetscBool usecprow = a->compressedrow.use;
1222: VecGetArrayRead(xx, &x);
1223: VecGetArrayPair(yy, zz, &yarray, &zarray);
1225: v = a->a;
1226: if (usecprow) {
1227: if (zz != yy) PetscArraycpy(zarray, yarray, 12 * mbs);
1228: mbs = a->compressedrow.nrows;
1229: ii = a->compressedrow.i;
1230: ridx = a->compressedrow.rindex;
1231: } else {
1232: ii = a->i;
1233: y = yarray;
1234: z = zarray;
1235: }
1237: for (i = 0; i < mbs; i++) {
1238: n = ii[i + 1] - ii[i];
1239: idx = ij + ii[i];
1241: if (usecprow) {
1242: y = yarray + 12 * ridx[i];
1243: z = zarray + 12 * ridx[i];
1244: }
1245: sum1 = y[0];
1246: sum2 = y[1];
1247: sum3 = y[2];
1248: sum4 = y[3];
1249: sum5 = y[4];
1250: sum6 = y[5];
1251: sum7 = y[6];
1252: sum8 = y[7];
1253: sum9 = y[8];
1254: sum10 = y[9];
1255: sum11 = y[10];
1256: sum12 = y[11];
1258: for (j = 0; j < n; j++) {
1259: xb = x + 12 * (idx[j]);
1260: x1 = xb[0];
1261: x2 = xb[1];
1262: x3 = xb[2];
1263: x4 = xb[3];
1265: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1266: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1267: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1268: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1269: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1270: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1271: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1272: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1273: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1274: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1275: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1276: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1277: v += 48;
1279: x1 = xb[4];
1280: x2 = xb[5];
1281: x3 = xb[6];
1282: x4 = xb[7];
1284: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1285: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1286: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1287: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1288: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1289: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1290: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1291: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1292: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1293: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1294: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1295: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1296: v += 48;
1298: x1 = xb[8];
1299: x2 = xb[9];
1300: x3 = xb[10];
1301: x4 = xb[11];
1302: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1303: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1304: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1305: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1306: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1307: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1308: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1309: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1310: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1311: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1312: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1313: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1314: v += 48;
1315: }
1316: z[0] = sum1;
1317: z[1] = sum2;
1318: z[2] = sum3;
1319: z[3] = sum4;
1320: z[4] = sum5;
1321: z[5] = sum6;
1322: z[6] = sum7;
1323: z[7] = sum8;
1324: z[8] = sum9;
1325: z[9] = sum10;
1326: z[10] = sum11;
1327: z[11] = sum12;
1328: if (!usecprow) {
1329: y += 12;
1330: z += 12;
1331: }
1332: }
1333: VecRestoreArrayRead(xx, &x);
1334: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
1335: PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
1336: return 0;
1337: }
1339: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1340: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1341: {
1342: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1343: PetscScalar *z = NULL, *zarray;
1344: const PetscScalar *x, *work;
1345: const MatScalar *v = a->a;
1346: PetscInt mbs, i, j, n;
1347: const PetscInt *idx = a->j, *ii, *ridx = NULL;
1348: PetscBool usecprow = a->compressedrow.use;
1349: const PetscInt bs = 12, bs2 = 144;
1351: __m256d a0, a1, a2, a3, a4, a5;
1352: __m256d w0, w1, w2, w3;
1353: __m256d z0, z1, z2;
1355: VecGetArrayRead(xx, &x);
1356: VecGetArrayWrite(zz, &zarray);
1358: if (usecprow) {
1359: mbs = a->compressedrow.nrows;
1360: ii = a->compressedrow.i;
1361: ridx = a->compressedrow.rindex;
1362: PetscArrayzero(zarray, bs * a->mbs);
1363: } else {
1364: mbs = a->mbs;
1365: ii = a->i;
1366: z = zarray;
1367: }
1369: for (i = 0; i < mbs; i++) {
1370: z0 = _mm256_setzero_pd();
1371: z1 = _mm256_setzero_pd();
1372: z2 = _mm256_setzero_pd();
1374: n = ii[1] - ii[0];
1375: ii++;
1376: for (j = 0; j < n; j++) {
1377: work = x + bs * (*idx++);
1379: /* first column of a */
1380: w0 = _mm256_set1_pd(work[0]);
1381: a0 = _mm256_loadu_pd(v + 0);
1382: z0 = _mm256_fmadd_pd(a0, w0, z0);
1383: a1 = _mm256_loadu_pd(v + 4);
1384: z1 = _mm256_fmadd_pd(a1, w0, z1);
1385: a2 = _mm256_loadu_pd(v + 8);
1386: z2 = _mm256_fmadd_pd(a2, w0, z2);
1388: /* second column of a */
1389: w1 = _mm256_set1_pd(work[1]);
1390: a3 = _mm256_loadu_pd(v + 12);
1391: z0 = _mm256_fmadd_pd(a3, w1, z0);
1392: a4 = _mm256_loadu_pd(v + 16);
1393: z1 = _mm256_fmadd_pd(a4, w1, z1);
1394: a5 = _mm256_loadu_pd(v + 20);
1395: z2 = _mm256_fmadd_pd(a5, w1, z2);
1397: /* third column of a */
1398: w2 = _mm256_set1_pd(work[2]);
1399: a0 = _mm256_loadu_pd(v + 24);
1400: z0 = _mm256_fmadd_pd(a0, w2, z0);
1401: a1 = _mm256_loadu_pd(v + 28);
1402: z1 = _mm256_fmadd_pd(a1, w2, z1);
1403: a2 = _mm256_loadu_pd(v + 32);
1404: z2 = _mm256_fmadd_pd(a2, w2, z2);
1406: /* fourth column of a */
1407: w3 = _mm256_set1_pd(work[3]);
1408: a3 = _mm256_loadu_pd(v + 36);
1409: z0 = _mm256_fmadd_pd(a3, w3, z0);
1410: a4 = _mm256_loadu_pd(v + 40);
1411: z1 = _mm256_fmadd_pd(a4, w3, z1);
1412: a5 = _mm256_loadu_pd(v + 44);
1413: z2 = _mm256_fmadd_pd(a5, w3, z2);
1415: /* fifth column of a */
1416: w0 = _mm256_set1_pd(work[4]);
1417: a0 = _mm256_loadu_pd(v + 48);
1418: z0 = _mm256_fmadd_pd(a0, w0, z0);
1419: a1 = _mm256_loadu_pd(v + 52);
1420: z1 = _mm256_fmadd_pd(a1, w0, z1);
1421: a2 = _mm256_loadu_pd(v + 56);
1422: z2 = _mm256_fmadd_pd(a2, w0, z2);
1424: /* sixth column of a */
1425: w1 = _mm256_set1_pd(work[5]);
1426: a3 = _mm256_loadu_pd(v + 60);
1427: z0 = _mm256_fmadd_pd(a3, w1, z0);
1428: a4 = _mm256_loadu_pd(v + 64);
1429: z1 = _mm256_fmadd_pd(a4, w1, z1);
1430: a5 = _mm256_loadu_pd(v + 68);
1431: z2 = _mm256_fmadd_pd(a5, w1, z2);
1433: /* seventh column of a */
1434: w2 = _mm256_set1_pd(work[6]);
1435: a0 = _mm256_loadu_pd(v + 72);
1436: z0 = _mm256_fmadd_pd(a0, w2, z0);
1437: a1 = _mm256_loadu_pd(v + 76);
1438: z1 = _mm256_fmadd_pd(a1, w2, z1);
1439: a2 = _mm256_loadu_pd(v + 80);
1440: z2 = _mm256_fmadd_pd(a2, w2, z2);
1442: /* eighth column of a */
1443: w3 = _mm256_set1_pd(work[7]);
1444: a3 = _mm256_loadu_pd(v + 84);
1445: z0 = _mm256_fmadd_pd(a3, w3, z0);
1446: a4 = _mm256_loadu_pd(v + 88);
1447: z1 = _mm256_fmadd_pd(a4, w3, z1);
1448: a5 = _mm256_loadu_pd(v + 92);
1449: z2 = _mm256_fmadd_pd(a5, w3, z2);
1451: /* ninth column of a */
1452: w0 = _mm256_set1_pd(work[8]);
1453: a0 = _mm256_loadu_pd(v + 96);
1454: z0 = _mm256_fmadd_pd(a0, w0, z0);
1455: a1 = _mm256_loadu_pd(v + 100);
1456: z1 = _mm256_fmadd_pd(a1, w0, z1);
1457: a2 = _mm256_loadu_pd(v + 104);
1458: z2 = _mm256_fmadd_pd(a2, w0, z2);
1460: /* tenth column of a */
1461: w1 = _mm256_set1_pd(work[9]);
1462: a3 = _mm256_loadu_pd(v + 108);
1463: z0 = _mm256_fmadd_pd(a3, w1, z0);
1464: a4 = _mm256_loadu_pd(v + 112);
1465: z1 = _mm256_fmadd_pd(a4, w1, z1);
1466: a5 = _mm256_loadu_pd(v + 116);
1467: z2 = _mm256_fmadd_pd(a5, w1, z2);
1469: /* eleventh column of a */
1470: w2 = _mm256_set1_pd(work[10]);
1471: a0 = _mm256_loadu_pd(v + 120);
1472: z0 = _mm256_fmadd_pd(a0, w2, z0);
1473: a1 = _mm256_loadu_pd(v + 124);
1474: z1 = _mm256_fmadd_pd(a1, w2, z1);
1475: a2 = _mm256_loadu_pd(v + 128);
1476: z2 = _mm256_fmadd_pd(a2, w2, z2);
1478: /* twelveth column of a */
1479: w3 = _mm256_set1_pd(work[11]);
1480: a3 = _mm256_loadu_pd(v + 132);
1481: z0 = _mm256_fmadd_pd(a3, w3, z0);
1482: a4 = _mm256_loadu_pd(v + 136);
1483: z1 = _mm256_fmadd_pd(a4, w3, z1);
1484: a5 = _mm256_loadu_pd(v + 140);
1485: z2 = _mm256_fmadd_pd(a5, w3, z2);
1487: v += bs2;
1488: }
1489: if (usecprow) z = zarray + bs * ridx[i];
1490: _mm256_storeu_pd(&z[0], z0);
1491: _mm256_storeu_pd(&z[4], z1);
1492: _mm256_storeu_pd(&z[8], z2);
1493: if (!usecprow) z += bs;
1494: }
1495: VecRestoreArrayRead(xx, &x);
1496: VecRestoreArrayWrite(zz, &zarray);
1497: PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt);
1498: return 0;
1499: }
1500: #endif
1502: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1503: /* Default MatMult for block size 15 */
1504: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1505: {
1506: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1507: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1508: const PetscScalar *x, *xb;
1509: PetscScalar *zarray, xv;
1510: const MatScalar *v;
1511: const PetscInt *ii, *ij = a->j, *idx;
1512: PetscInt mbs, i, j, k, n, *ridx = NULL;
1513: PetscBool usecprow = a->compressedrow.use;
1515: VecGetArrayRead(xx, &x);
1516: VecGetArrayWrite(zz, &zarray);
1518: v = a->a;
1519: if (usecprow) {
1520: mbs = a->compressedrow.nrows;
1521: ii = a->compressedrow.i;
1522: ridx = a->compressedrow.rindex;
1523: PetscArrayzero(zarray, 15 * a->mbs);
1524: } else {
1525: mbs = a->mbs;
1526: ii = a->i;
1527: z = zarray;
1528: }
1530: for (i = 0; i < mbs; i++) {
1531: n = ii[i + 1] - ii[i];
1532: idx = ij + ii[i];
1533: sum1 = 0.0;
1534: sum2 = 0.0;
1535: sum3 = 0.0;
1536: sum4 = 0.0;
1537: sum5 = 0.0;
1538: sum6 = 0.0;
1539: sum7 = 0.0;
1540: sum8 = 0.0;
1541: sum9 = 0.0;
1542: sum10 = 0.0;
1543: sum11 = 0.0;
1544: sum12 = 0.0;
1545: sum13 = 0.0;
1546: sum14 = 0.0;
1547: sum15 = 0.0;
1549: for (j = 0; j < n; j++) {
1550: xb = x + 15 * (idx[j]);
1552: for (k = 0; k < 15; k++) {
1553: xv = xb[k];
1554: sum1 += v[0] * xv;
1555: sum2 += v[1] * xv;
1556: sum3 += v[2] * xv;
1557: sum4 += v[3] * xv;
1558: sum5 += v[4] * xv;
1559: sum6 += v[5] * xv;
1560: sum7 += v[6] * xv;
1561: sum8 += v[7] * xv;
1562: sum9 += v[8] * xv;
1563: sum10 += v[9] * xv;
1564: sum11 += v[10] * xv;
1565: sum12 += v[11] * xv;
1566: sum13 += v[12] * xv;
1567: sum14 += v[13] * xv;
1568: sum15 += v[14] * xv;
1569: v += 15;
1570: }
1571: }
1572: if (usecprow) z = zarray + 15 * ridx[i];
1573: z[0] = sum1;
1574: z[1] = sum2;
1575: z[2] = sum3;
1576: z[3] = sum4;
1577: z[4] = sum5;
1578: z[5] = sum6;
1579: z[6] = sum7;
1580: z[7] = sum8;
1581: z[8] = sum9;
1582: z[9] = sum10;
1583: z[10] = sum11;
1584: z[11] = sum12;
1585: z[12] = sum13;
1586: z[13] = sum14;
1587: z[14] = sum15;
1589: if (!usecprow) z += 15;
1590: }
1592: VecRestoreArrayRead(xx, &x);
1593: VecRestoreArrayWrite(zz, &zarray);
1594: PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1595: return 0;
1596: }
1598: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1599: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1600: {
1601: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1602: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1603: const PetscScalar *x, *xb;
1604: PetscScalar x1, x2, x3, x4, *zarray;
1605: const MatScalar *v;
1606: const PetscInt *ii, *ij = a->j, *idx;
1607: PetscInt mbs, i, j, n, *ridx = NULL;
1608: PetscBool usecprow = a->compressedrow.use;
1610: VecGetArrayRead(xx, &x);
1611: VecGetArrayWrite(zz, &zarray);
1613: v = a->a;
1614: if (usecprow) {
1615: mbs = a->compressedrow.nrows;
1616: ii = a->compressedrow.i;
1617: ridx = a->compressedrow.rindex;
1618: PetscArrayzero(zarray, 15 * a->mbs);
1619: } else {
1620: mbs = a->mbs;
1621: ii = a->i;
1622: z = zarray;
1623: }
1625: for (i = 0; i < mbs; i++) {
1626: n = ii[i + 1] - ii[i];
1627: idx = ij + ii[i];
1628: sum1 = 0.0;
1629: sum2 = 0.0;
1630: sum3 = 0.0;
1631: sum4 = 0.0;
1632: sum5 = 0.0;
1633: sum6 = 0.0;
1634: sum7 = 0.0;
1635: sum8 = 0.0;
1636: sum9 = 0.0;
1637: sum10 = 0.0;
1638: sum11 = 0.0;
1639: sum12 = 0.0;
1640: sum13 = 0.0;
1641: sum14 = 0.0;
1642: sum15 = 0.0;
1644: for (j = 0; j < n; j++) {
1645: xb = x + 15 * (idx[j]);
1646: x1 = xb[0];
1647: x2 = xb[1];
1648: x3 = xb[2];
1649: x4 = xb[3];
1651: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1652: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1653: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1654: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1655: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1656: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1657: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1658: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1659: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1660: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1661: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1662: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1663: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1664: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1665: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1667: v += 60;
1669: x1 = xb[4];
1670: x2 = xb[5];
1671: x3 = xb[6];
1672: x4 = xb[7];
1674: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1675: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1676: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1677: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1678: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1679: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1680: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1681: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1682: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1683: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1684: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1685: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1686: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1687: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1688: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1689: v += 60;
1691: x1 = xb[8];
1692: x2 = xb[9];
1693: x3 = xb[10];
1694: x4 = xb[11];
1695: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1696: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1697: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1698: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1699: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1700: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1701: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1702: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1703: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1704: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1705: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1706: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1707: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1708: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1709: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1710: v += 60;
1712: x1 = xb[12];
1713: x2 = xb[13];
1714: x3 = xb[14];
1715: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1716: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1717: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1718: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1719: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1720: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1721: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1722: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1723: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1724: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1725: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1726: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1727: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1728: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1729: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1730: v += 45;
1731: }
1732: if (usecprow) z = zarray + 15 * ridx[i];
1733: z[0] = sum1;
1734: z[1] = sum2;
1735: z[2] = sum3;
1736: z[3] = sum4;
1737: z[4] = sum5;
1738: z[5] = sum6;
1739: z[6] = sum7;
1740: z[7] = sum8;
1741: z[8] = sum9;
1742: z[9] = sum10;
1743: z[10] = sum11;
1744: z[11] = sum12;
1745: z[12] = sum13;
1746: z[13] = sum14;
1747: z[14] = sum15;
1749: if (!usecprow) z += 15;
1750: }
1752: VecRestoreArrayRead(xx, &x);
1753: VecRestoreArrayWrite(zz, &zarray);
1754: PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1755: return 0;
1756: }
1758: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1759: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1760: {
1761: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1762: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1763: const PetscScalar *x, *xb;
1764: PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1765: const MatScalar *v;
1766: const PetscInt *ii, *ij = a->j, *idx;
1767: PetscInt mbs, i, j, n, *ridx = NULL;
1768: PetscBool usecprow = a->compressedrow.use;
1770: VecGetArrayRead(xx, &x);
1771: VecGetArrayWrite(zz, &zarray);
1773: v = a->a;
1774: if (usecprow) {
1775: mbs = a->compressedrow.nrows;
1776: ii = a->compressedrow.i;
1777: ridx = a->compressedrow.rindex;
1778: PetscArrayzero(zarray, 15 * a->mbs);
1779: } else {
1780: mbs = a->mbs;
1781: ii = a->i;
1782: z = zarray;
1783: }
1785: for (i = 0; i < mbs; i++) {
1786: n = ii[i + 1] - ii[i];
1787: idx = ij + ii[i];
1788: sum1 = 0.0;
1789: sum2 = 0.0;
1790: sum3 = 0.0;
1791: sum4 = 0.0;
1792: sum5 = 0.0;
1793: sum6 = 0.0;
1794: sum7 = 0.0;
1795: sum8 = 0.0;
1796: sum9 = 0.0;
1797: sum10 = 0.0;
1798: sum11 = 0.0;
1799: sum12 = 0.0;
1800: sum13 = 0.0;
1801: sum14 = 0.0;
1802: sum15 = 0.0;
1804: for (j = 0; j < n; j++) {
1805: xb = x + 15 * (idx[j]);
1806: x1 = xb[0];
1807: x2 = xb[1];
1808: x3 = xb[2];
1809: x4 = xb[3];
1810: x5 = xb[4];
1811: x6 = xb[5];
1812: x7 = xb[6];
1813: x8 = xb[7];
1815: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1816: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1817: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1818: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1819: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1820: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1821: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1822: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1823: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1824: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1825: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1826: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1827: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1828: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1829: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1830: v += 120;
1832: x1 = xb[8];
1833: x2 = xb[9];
1834: x3 = xb[10];
1835: x4 = xb[11];
1836: x5 = xb[12];
1837: x6 = xb[13];
1838: x7 = xb[14];
1840: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1841: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1842: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1843: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1844: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1845: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1846: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1847: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1848: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1849: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1850: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1851: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1852: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1853: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1854: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1855: v += 105;
1856: }
1857: if (usecprow) z = zarray + 15 * ridx[i];
1858: z[0] = sum1;
1859: z[1] = sum2;
1860: z[2] = sum3;
1861: z[3] = sum4;
1862: z[4] = sum5;
1863: z[5] = sum6;
1864: z[6] = sum7;
1865: z[7] = sum8;
1866: z[8] = sum9;
1867: z[9] = sum10;
1868: z[10] = sum11;
1869: z[11] = sum12;
1870: z[12] = sum13;
1871: z[13] = sum14;
1872: z[14] = sum15;
1874: if (!usecprow) z += 15;
1875: }
1877: VecRestoreArrayRead(xx, &x);
1878: VecRestoreArrayWrite(zz, &zarray);
1879: PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1880: return 0;
1881: }
1883: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1884: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1885: {
1886: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1887: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1888: const PetscScalar *x, *xb;
1889: PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1890: const MatScalar *v;
1891: const PetscInt *ii, *ij = a->j, *idx;
1892: PetscInt mbs, i, j, n, *ridx = NULL;
1893: PetscBool usecprow = a->compressedrow.use;
1895: VecGetArrayRead(xx, &x);
1896: VecGetArrayWrite(zz, &zarray);
1898: v = a->a;
1899: if (usecprow) {
1900: mbs = a->compressedrow.nrows;
1901: ii = a->compressedrow.i;
1902: ridx = a->compressedrow.rindex;
1903: PetscArrayzero(zarray, 15 * a->mbs);
1904: } else {
1905: mbs = a->mbs;
1906: ii = a->i;
1907: z = zarray;
1908: }
1910: for (i = 0; i < mbs; i++) {
1911: n = ii[i + 1] - ii[i];
1912: idx = ij + ii[i];
1913: sum1 = 0.0;
1914: sum2 = 0.0;
1915: sum3 = 0.0;
1916: sum4 = 0.0;
1917: sum5 = 0.0;
1918: sum6 = 0.0;
1919: sum7 = 0.0;
1920: sum8 = 0.0;
1921: sum9 = 0.0;
1922: sum10 = 0.0;
1923: sum11 = 0.0;
1924: sum12 = 0.0;
1925: sum13 = 0.0;
1926: sum14 = 0.0;
1927: sum15 = 0.0;
1929: for (j = 0; j < n; j++) {
1930: xb = x + 15 * (idx[j]);
1931: x1 = xb[0];
1932: x2 = xb[1];
1933: x3 = xb[2];
1934: x4 = xb[3];
1935: x5 = xb[4];
1936: x6 = xb[5];
1937: x7 = xb[6];
1938: x8 = xb[7];
1939: x9 = xb[8];
1940: x10 = xb[9];
1941: x11 = xb[10];
1942: x12 = xb[11];
1943: x13 = xb[12];
1944: x14 = xb[13];
1945: x15 = xb[14];
1947: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1948: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1949: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1950: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1951: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1952: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1953: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1954: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1955: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1956: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1957: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1958: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1959: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1960: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1961: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1962: v += 225;
1963: }
1964: if (usecprow) z = zarray + 15 * ridx[i];
1965: z[0] = sum1;
1966: z[1] = sum2;
1967: z[2] = sum3;
1968: z[3] = sum4;
1969: z[4] = sum5;
1970: z[5] = sum6;
1971: z[6] = sum7;
1972: z[7] = sum8;
1973: z[8] = sum9;
1974: z[9] = sum10;
1975: z[10] = sum11;
1976: z[11] = sum12;
1977: z[12] = sum13;
1978: z[13] = sum14;
1979: z[14] = sum15;
1981: if (!usecprow) z += 15;
1982: }
1984: VecRestoreArrayRead(xx, &x);
1985: VecRestoreArrayWrite(zz, &zarray);
1986: PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1987: return 0;
1988: }
1990: /*
1991: This will not work with MatScalar == float because it calls the BLAS
1992: */
1993: PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
1994: {
1995: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1996: PetscScalar *z = NULL, *work, *workt, *zarray;
1997: const PetscScalar *x, *xb;
1998: const MatScalar *v;
1999: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2000: const PetscInt *idx, *ii, *ridx = NULL;
2001: PetscInt ncols, k;
2002: PetscBool usecprow = a->compressedrow.use;
2004: VecGetArrayRead(xx, &x);
2005: VecGetArrayWrite(zz, &zarray);
2007: idx = a->j;
2008: v = a->a;
2009: if (usecprow) {
2010: mbs = a->compressedrow.nrows;
2011: ii = a->compressedrow.i;
2012: ridx = a->compressedrow.rindex;
2013: PetscArrayzero(zarray, bs * a->mbs);
2014: } else {
2015: mbs = a->mbs;
2016: ii = a->i;
2017: z = zarray;
2018: }
2020: if (!a->mult_work) {
2021: k = PetscMax(A->rmap->n, A->cmap->n);
2022: PetscMalloc1(k + 1, &a->mult_work);
2023: }
2024: work = a->mult_work;
2025: for (i = 0; i < mbs; i++) {
2026: n = ii[1] - ii[0];
2027: ii++;
2028: ncols = n * bs;
2029: workt = work;
2030: for (j = 0; j < n; j++) {
2031: xb = x + bs * (*idx++);
2032: for (k = 0; k < bs; k++) workt[k] = xb[k];
2033: workt += bs;
2034: }
2035: if (usecprow) z = zarray + bs * ridx[i];
2036: PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2037: v += n * bs2;
2038: if (!usecprow) z += bs;
2039: }
2040: VecRestoreArrayRead(xx, &x);
2041: VecRestoreArrayWrite(zz, &zarray);
2042: PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt);
2043: return 0;
2044: }
2046: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2047: {
2048: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2049: const PetscScalar *x;
2050: PetscScalar *y, *z, sum;
2051: const MatScalar *v;
2052: PetscInt mbs = a->mbs, i, n, *ridx = NULL;
2053: const PetscInt *idx, *ii;
2054: PetscBool usecprow = a->compressedrow.use;
2056: VecGetArrayRead(xx, &x);
2057: VecGetArrayPair(yy, zz, &y, &z);
2059: idx = a->j;
2060: v = a->a;
2061: if (usecprow) {
2062: if (zz != yy) PetscArraycpy(z, y, mbs);
2063: mbs = a->compressedrow.nrows;
2064: ii = a->compressedrow.i;
2065: ridx = a->compressedrow.rindex;
2066: } else {
2067: ii = a->i;
2068: }
2070: for (i = 0; i < mbs; i++) {
2071: n = ii[1] - ii[0];
2072: ii++;
2073: if (!usecprow) {
2074: sum = y[i];
2075: } else {
2076: sum = y[ridx[i]];
2077: }
2078: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2079: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2080: PetscSparseDensePlusDot(sum, x, v, idx, n);
2081: v += n;
2082: idx += n;
2083: if (usecprow) {
2084: z[ridx[i]] = sum;
2085: } else {
2086: z[i] = sum;
2087: }
2088: }
2089: VecRestoreArrayRead(xx, &x);
2090: VecRestoreArrayPair(yy, zz, &y, &z);
2091: PetscLogFlops(2.0 * a->nz);
2092: return 0;
2093: }
2095: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2096: {
2097: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2098: PetscScalar *y = NULL, *z = NULL, sum1, sum2;
2099: const PetscScalar *x, *xb;
2100: PetscScalar x1, x2, *yarray, *zarray;
2101: const MatScalar *v;
2102: PetscInt mbs = a->mbs, i, n, j;
2103: const PetscInt *idx, *ii, *ridx = NULL;
2104: PetscBool usecprow = a->compressedrow.use;
2106: VecGetArrayRead(xx, &x);
2107: VecGetArrayPair(yy, zz, &yarray, &zarray);
2109: idx = a->j;
2110: v = a->a;
2111: if (usecprow) {
2112: if (zz != yy) PetscArraycpy(zarray, yarray, 2 * mbs);
2113: mbs = a->compressedrow.nrows;
2114: ii = a->compressedrow.i;
2115: ridx = a->compressedrow.rindex;
2116: } else {
2117: ii = a->i;
2118: y = yarray;
2119: z = zarray;
2120: }
2122: for (i = 0; i < mbs; i++) {
2123: n = ii[1] - ii[0];
2124: ii++;
2125: if (usecprow) {
2126: z = zarray + 2 * ridx[i];
2127: y = yarray + 2 * ridx[i];
2128: }
2129: sum1 = y[0];
2130: sum2 = y[1];
2131: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2132: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2133: for (j = 0; j < n; j++) {
2134: xb = x + 2 * (*idx++);
2135: x1 = xb[0];
2136: x2 = xb[1];
2138: sum1 += v[0] * x1 + v[2] * x2;
2139: sum2 += v[1] * x1 + v[3] * x2;
2140: v += 4;
2141: }
2142: z[0] = sum1;
2143: z[1] = sum2;
2144: if (!usecprow) {
2145: z += 2;
2146: y += 2;
2147: }
2148: }
2149: VecRestoreArrayRead(xx, &x);
2150: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2151: PetscLogFlops(4.0 * a->nz);
2152: return 0;
2153: }
2155: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2156: {
2157: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2158: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2159: const PetscScalar *x, *xb;
2160: const MatScalar *v;
2161: PetscInt mbs = a->mbs, i, j, n;
2162: const PetscInt *idx, *ii, *ridx = NULL;
2163: PetscBool usecprow = a->compressedrow.use;
2165: VecGetArrayRead(xx, &x);
2166: VecGetArrayPair(yy, zz, &yarray, &zarray);
2168: idx = a->j;
2169: v = a->a;
2170: if (usecprow) {
2171: if (zz != yy) PetscArraycpy(zarray, yarray, 3 * mbs);
2172: mbs = a->compressedrow.nrows;
2173: ii = a->compressedrow.i;
2174: ridx = a->compressedrow.rindex;
2175: } else {
2176: ii = a->i;
2177: y = yarray;
2178: z = zarray;
2179: }
2181: for (i = 0; i < mbs; i++) {
2182: n = ii[1] - ii[0];
2183: ii++;
2184: if (usecprow) {
2185: z = zarray + 3 * ridx[i];
2186: y = yarray + 3 * ridx[i];
2187: }
2188: sum1 = y[0];
2189: sum2 = y[1];
2190: sum3 = y[2];
2191: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2192: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2193: for (j = 0; j < n; j++) {
2194: xb = x + 3 * (*idx++);
2195: x1 = xb[0];
2196: x2 = xb[1];
2197: x3 = xb[2];
2198: sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2199: sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2200: sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2201: v += 9;
2202: }
2203: z[0] = sum1;
2204: z[1] = sum2;
2205: z[2] = sum3;
2206: if (!usecprow) {
2207: z += 3;
2208: y += 3;
2209: }
2210: }
2211: VecRestoreArrayRead(xx, &x);
2212: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2213: PetscLogFlops(18.0 * a->nz);
2214: return 0;
2215: }
2217: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2218: {
2219: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2220: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2221: const PetscScalar *x, *xb;
2222: const MatScalar *v;
2223: PetscInt mbs = a->mbs, i, j, n;
2224: const PetscInt *idx, *ii, *ridx = NULL;
2225: PetscBool usecprow = a->compressedrow.use;
2227: VecGetArrayRead(xx, &x);
2228: VecGetArrayPair(yy, zz, &yarray, &zarray);
2230: idx = a->j;
2231: v = a->a;
2232: if (usecprow) {
2233: if (zz != yy) PetscArraycpy(zarray, yarray, 4 * mbs);
2234: mbs = a->compressedrow.nrows;
2235: ii = a->compressedrow.i;
2236: ridx = a->compressedrow.rindex;
2237: } else {
2238: ii = a->i;
2239: y = yarray;
2240: z = zarray;
2241: }
2243: for (i = 0; i < mbs; i++) {
2244: n = ii[1] - ii[0];
2245: ii++;
2246: if (usecprow) {
2247: z = zarray + 4 * ridx[i];
2248: y = yarray + 4 * ridx[i];
2249: }
2250: sum1 = y[0];
2251: sum2 = y[1];
2252: sum3 = y[2];
2253: sum4 = y[3];
2254: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2255: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2256: for (j = 0; j < n; j++) {
2257: xb = x + 4 * (*idx++);
2258: x1 = xb[0];
2259: x2 = xb[1];
2260: x3 = xb[2];
2261: x4 = xb[3];
2262: sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2263: sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2264: sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2265: sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2266: v += 16;
2267: }
2268: z[0] = sum1;
2269: z[1] = sum2;
2270: z[2] = sum3;
2271: z[3] = sum4;
2272: if (!usecprow) {
2273: z += 4;
2274: y += 4;
2275: }
2276: }
2277: VecRestoreArrayRead(xx, &x);
2278: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2279: PetscLogFlops(32.0 * a->nz);
2280: return 0;
2281: }
2283: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2284: {
2285: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2286: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2287: const PetscScalar *x, *xb;
2288: PetscScalar *yarray, *zarray;
2289: const MatScalar *v;
2290: PetscInt mbs = a->mbs, i, j, n;
2291: const PetscInt *idx, *ii, *ridx = NULL;
2292: PetscBool usecprow = a->compressedrow.use;
2294: VecGetArrayRead(xx, &x);
2295: VecGetArrayPair(yy, zz, &yarray, &zarray);
2297: idx = a->j;
2298: v = a->a;
2299: if (usecprow) {
2300: if (zz != yy) PetscArraycpy(zarray, yarray, 5 * mbs);
2301: mbs = a->compressedrow.nrows;
2302: ii = a->compressedrow.i;
2303: ridx = a->compressedrow.rindex;
2304: } else {
2305: ii = a->i;
2306: y = yarray;
2307: z = zarray;
2308: }
2310: for (i = 0; i < mbs; i++) {
2311: n = ii[1] - ii[0];
2312: ii++;
2313: if (usecprow) {
2314: z = zarray + 5 * ridx[i];
2315: y = yarray + 5 * ridx[i];
2316: }
2317: sum1 = y[0];
2318: sum2 = y[1];
2319: sum3 = y[2];
2320: sum4 = y[3];
2321: sum5 = y[4];
2322: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2323: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2324: for (j = 0; j < n; j++) {
2325: xb = x + 5 * (*idx++);
2326: x1 = xb[0];
2327: x2 = xb[1];
2328: x3 = xb[2];
2329: x4 = xb[3];
2330: x5 = xb[4];
2331: sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2332: sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2333: sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2334: sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2335: sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2336: v += 25;
2337: }
2338: z[0] = sum1;
2339: z[1] = sum2;
2340: z[2] = sum3;
2341: z[3] = sum4;
2342: z[4] = sum5;
2343: if (!usecprow) {
2344: z += 5;
2345: y += 5;
2346: }
2347: }
2348: VecRestoreArrayRead(xx, &x);
2349: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2350: PetscLogFlops(50.0 * a->nz);
2351: return 0;
2352: }
2354: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2355: {
2356: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2357: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2358: const PetscScalar *x, *xb;
2359: PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2360: const MatScalar *v;
2361: PetscInt mbs = a->mbs, i, j, n;
2362: const PetscInt *idx, *ii, *ridx = NULL;
2363: PetscBool usecprow = a->compressedrow.use;
2365: VecGetArrayRead(xx, &x);
2366: VecGetArrayPair(yy, zz, &yarray, &zarray);
2368: idx = a->j;
2369: v = a->a;
2370: if (usecprow) {
2371: if (zz != yy) PetscArraycpy(zarray, yarray, 6 * mbs);
2372: mbs = a->compressedrow.nrows;
2373: ii = a->compressedrow.i;
2374: ridx = a->compressedrow.rindex;
2375: } else {
2376: ii = a->i;
2377: y = yarray;
2378: z = zarray;
2379: }
2381: for (i = 0; i < mbs; i++) {
2382: n = ii[1] - ii[0];
2383: ii++;
2384: if (usecprow) {
2385: z = zarray + 6 * ridx[i];
2386: y = yarray + 6 * ridx[i];
2387: }
2388: sum1 = y[0];
2389: sum2 = y[1];
2390: sum3 = y[2];
2391: sum4 = y[3];
2392: sum5 = y[4];
2393: sum6 = y[5];
2394: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2395: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2396: for (j = 0; j < n; j++) {
2397: xb = x + 6 * (*idx++);
2398: x1 = xb[0];
2399: x2 = xb[1];
2400: x3 = xb[2];
2401: x4 = xb[3];
2402: x5 = xb[4];
2403: x6 = xb[5];
2404: sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2405: sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2406: sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2407: sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2408: sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2409: sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2410: v += 36;
2411: }
2412: z[0] = sum1;
2413: z[1] = sum2;
2414: z[2] = sum3;
2415: z[3] = sum4;
2416: z[4] = sum5;
2417: z[5] = sum6;
2418: if (!usecprow) {
2419: z += 6;
2420: y += 6;
2421: }
2422: }
2423: VecRestoreArrayRead(xx, &x);
2424: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2425: PetscLogFlops(72.0 * a->nz);
2426: return 0;
2427: }
2429: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2430: {
2431: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2432: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2433: const PetscScalar *x, *xb;
2434: PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2435: const MatScalar *v;
2436: PetscInt mbs = a->mbs, i, j, n;
2437: const PetscInt *idx, *ii, *ridx = NULL;
2438: PetscBool usecprow = a->compressedrow.use;
2440: VecGetArrayRead(xx, &x);
2441: VecGetArrayPair(yy, zz, &yarray, &zarray);
2443: idx = a->j;
2444: v = a->a;
2445: if (usecprow) {
2446: if (zz != yy) PetscArraycpy(zarray, yarray, 7 * mbs);
2447: mbs = a->compressedrow.nrows;
2448: ii = a->compressedrow.i;
2449: ridx = a->compressedrow.rindex;
2450: } else {
2451: ii = a->i;
2452: y = yarray;
2453: z = zarray;
2454: }
2456: for (i = 0; i < mbs; i++) {
2457: n = ii[1] - ii[0];
2458: ii++;
2459: if (usecprow) {
2460: z = zarray + 7 * ridx[i];
2461: y = yarray + 7 * ridx[i];
2462: }
2463: sum1 = y[0];
2464: sum2 = y[1];
2465: sum3 = y[2];
2466: sum4 = y[3];
2467: sum5 = y[4];
2468: sum6 = y[5];
2469: sum7 = y[6];
2470: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2471: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2472: for (j = 0; j < n; j++) {
2473: xb = x + 7 * (*idx++);
2474: x1 = xb[0];
2475: x2 = xb[1];
2476: x3 = xb[2];
2477: x4 = xb[3];
2478: x5 = xb[4];
2479: x6 = xb[5];
2480: x7 = xb[6];
2481: sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2482: sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2483: sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2484: sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2485: sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2486: sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2487: sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2488: v += 49;
2489: }
2490: z[0] = sum1;
2491: z[1] = sum2;
2492: z[2] = sum3;
2493: z[3] = sum4;
2494: z[4] = sum5;
2495: z[5] = sum6;
2496: z[6] = sum7;
2497: if (!usecprow) {
2498: z += 7;
2499: y += 7;
2500: }
2501: }
2502: VecRestoreArrayRead(xx, &x);
2503: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2504: PetscLogFlops(98.0 * a->nz);
2505: return 0;
2506: }
2508: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2509: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2510: {
2511: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2512: PetscScalar *z = NULL, *work, *workt, *zarray;
2513: const PetscScalar *x, *xb;
2514: const MatScalar *v;
2515: PetscInt mbs, i, j, n;
2516: PetscInt k;
2517: PetscBool usecprow = a->compressedrow.use;
2518: const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
2520: __m256d a0, a1, a2, a3, a4, a5;
2521: __m256d w0, w1, w2, w3;
2522: __m256d z0, z1, z2;
2523: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
2525: VecCopy(yy, zz);
2526: VecGetArrayRead(xx, &x);
2527: VecGetArray(zz, &zarray);
2529: idx = a->j;
2530: v = a->a;
2531: if (usecprow) {
2532: mbs = a->compressedrow.nrows;
2533: ii = a->compressedrow.i;
2534: ridx = a->compressedrow.rindex;
2535: } else {
2536: mbs = a->mbs;
2537: ii = a->i;
2538: z = zarray;
2539: }
2541: if (!a->mult_work) {
2542: k = PetscMax(A->rmap->n, A->cmap->n);
2543: PetscMalloc1(k + 1, &a->mult_work);
2544: }
2546: work = a->mult_work;
2547: for (i = 0; i < mbs; i++) {
2548: n = ii[1] - ii[0];
2549: ii++;
2550: workt = work;
2551: for (j = 0; j < n; j++) {
2552: xb = x + bs * (*idx++);
2553: for (k = 0; k < bs; k++) workt[k] = xb[k];
2554: workt += bs;
2555: }
2556: if (usecprow) z = zarray + bs * ridx[i];
2558: z0 = _mm256_loadu_pd(&z[0]);
2559: z1 = _mm256_loadu_pd(&z[4]);
2560: z2 = _mm256_set1_pd(z[8]);
2562: for (j = 0; j < n; j++) {
2563: /* first column of a */
2564: w0 = _mm256_set1_pd(work[j * 9]);
2565: a0 = _mm256_loadu_pd(&v[j * 81]);
2566: z0 = _mm256_fmadd_pd(a0, w0, z0);
2567: a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2568: z1 = _mm256_fmadd_pd(a1, w0, z1);
2569: a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2570: z2 = _mm256_fmadd_pd(a2, w0, z2);
2572: /* second column of a */
2573: w1 = _mm256_set1_pd(work[j * 9 + 1]);
2574: a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2575: z0 = _mm256_fmadd_pd(a0, w1, z0);
2576: a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2577: z1 = _mm256_fmadd_pd(a1, w1, z1);
2578: a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2579: z2 = _mm256_fmadd_pd(a2, w1, z2);
2581: /* third column of a */
2582: w2 = _mm256_set1_pd(work[j * 9 + 2]);
2583: a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2584: z0 = _mm256_fmadd_pd(a3, w2, z0);
2585: a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2586: z1 = _mm256_fmadd_pd(a4, w2, z1);
2587: a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2588: z2 = _mm256_fmadd_pd(a5, w2, z2);
2590: /* fourth column of a */
2591: w3 = _mm256_set1_pd(work[j * 9 + 3]);
2592: a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2593: z0 = _mm256_fmadd_pd(a0, w3, z0);
2594: a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2595: z1 = _mm256_fmadd_pd(a1, w3, z1);
2596: a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2597: z2 = _mm256_fmadd_pd(a2, w3, z2);
2599: /* fifth column of a */
2600: w0 = _mm256_set1_pd(work[j * 9 + 4]);
2601: a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2602: z0 = _mm256_fmadd_pd(a3, w0, z0);
2603: a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2604: z1 = _mm256_fmadd_pd(a4, w0, z1);
2605: a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2606: z2 = _mm256_fmadd_pd(a5, w0, z2);
2608: /* sixth column of a */
2609: w1 = _mm256_set1_pd(work[j * 9 + 5]);
2610: a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2611: z0 = _mm256_fmadd_pd(a0, w1, z0);
2612: a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2613: z1 = _mm256_fmadd_pd(a1, w1, z1);
2614: a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2615: z2 = _mm256_fmadd_pd(a2, w1, z2);
2617: /* seventh column of a */
2618: w2 = _mm256_set1_pd(work[j * 9 + 6]);
2619: a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2620: z0 = _mm256_fmadd_pd(a0, w2, z0);
2621: a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2622: z1 = _mm256_fmadd_pd(a1, w2, z1);
2623: a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2624: z2 = _mm256_fmadd_pd(a2, w2, z2);
2626: /* eighth column of a */
2627: w3 = _mm256_set1_pd(work[j * 9 + 7]);
2628: a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2629: z0 = _mm256_fmadd_pd(a3, w3, z0);
2630: a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2631: z1 = _mm256_fmadd_pd(a4, w3, z1);
2632: a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2633: z2 = _mm256_fmadd_pd(a5, w3, z2);
2635: /* ninth column of a */
2636: w0 = _mm256_set1_pd(work[j * 9 + 8]);
2637: a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2638: z0 = _mm256_fmadd_pd(a0, w0, z0);
2639: a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2640: z1 = _mm256_fmadd_pd(a1, w0, z1);
2641: a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2642: z2 = _mm256_fmadd_pd(a2, w0, z2);
2643: }
2645: _mm256_storeu_pd(&z[0], z0);
2646: _mm256_storeu_pd(&z[4], z1);
2647: _mm256_maskstore_pd(&z[8], mask1, z2);
2649: v += n * bs2;
2650: if (!usecprow) z += bs;
2651: }
2652: VecRestoreArrayRead(xx, &x);
2653: VecRestoreArray(zz, &zarray);
2654: PetscLogFlops(162.0 * a->nz);
2655: return 0;
2656: }
2657: #endif
2659: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2660: {
2661: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2662: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2663: const PetscScalar *x, *xb;
2664: PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2665: const MatScalar *v;
2666: PetscInt mbs = a->mbs, i, j, n;
2667: const PetscInt *idx, *ii, *ridx = NULL;
2668: PetscBool usecprow = a->compressedrow.use;
2670: VecGetArrayRead(xx, &x);
2671: VecGetArrayPair(yy, zz, &yarray, &zarray);
2673: idx = a->j;
2674: v = a->a;
2675: if (usecprow) {
2676: if (zz != yy) PetscArraycpy(zarray, yarray, 7 * mbs);
2677: mbs = a->compressedrow.nrows;
2678: ii = a->compressedrow.i;
2679: ridx = a->compressedrow.rindex;
2680: } else {
2681: ii = a->i;
2682: y = yarray;
2683: z = zarray;
2684: }
2686: for (i = 0; i < mbs; i++) {
2687: n = ii[1] - ii[0];
2688: ii++;
2689: if (usecprow) {
2690: z = zarray + 11 * ridx[i];
2691: y = yarray + 11 * ridx[i];
2692: }
2693: sum1 = y[0];
2694: sum2 = y[1];
2695: sum3 = y[2];
2696: sum4 = y[3];
2697: sum5 = y[4];
2698: sum6 = y[5];
2699: sum7 = y[6];
2700: sum8 = y[7];
2701: sum9 = y[8];
2702: sum10 = y[9];
2703: sum11 = y[10];
2704: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2705: PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2706: for (j = 0; j < n; j++) {
2707: xb = x + 11 * (*idx++);
2708: x1 = xb[0];
2709: x2 = xb[1];
2710: x3 = xb[2];
2711: x4 = xb[3];
2712: x5 = xb[4];
2713: x6 = xb[5];
2714: x7 = xb[6];
2715: x8 = xb[7];
2716: x9 = xb[8];
2717: x10 = xb[9];
2718: x11 = xb[10];
2719: sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2720: sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2721: sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2722: sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2723: sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2724: sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2725: sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2726: sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2727: sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2728: sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2729: sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2730: v += 121;
2731: }
2732: z[0] = sum1;
2733: z[1] = sum2;
2734: z[2] = sum3;
2735: z[3] = sum4;
2736: z[4] = sum5;
2737: z[5] = sum6;
2738: z[6] = sum7;
2739: z[7] = sum8;
2740: z[8] = sum9;
2741: z[9] = sum10;
2742: z[10] = sum11;
2743: if (!usecprow) {
2744: z += 11;
2745: y += 11;
2746: }
2747: }
2748: VecRestoreArrayRead(xx, &x);
2749: VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2750: PetscLogFlops(242.0 * a->nz);
2751: return 0;
2752: }
2754: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2755: {
2756: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2757: PetscScalar *z = NULL, *work, *workt, *zarray;
2758: const PetscScalar *x, *xb;
2759: const MatScalar *v;
2760: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2761: PetscInt ncols, k;
2762: const PetscInt *ridx = NULL, *idx, *ii;
2763: PetscBool usecprow = a->compressedrow.use;
2765: VecCopy(yy, zz);
2766: VecGetArrayRead(xx, &x);
2767: VecGetArray(zz, &zarray);
2769: idx = a->j;
2770: v = a->a;
2771: if (usecprow) {
2772: mbs = a->compressedrow.nrows;
2773: ii = a->compressedrow.i;
2774: ridx = a->compressedrow.rindex;
2775: } else {
2776: mbs = a->mbs;
2777: ii = a->i;
2778: z = zarray;
2779: }
2781: if (!a->mult_work) {
2782: k = PetscMax(A->rmap->n, A->cmap->n);
2783: PetscMalloc1(k + 1, &a->mult_work);
2784: }
2785: work = a->mult_work;
2786: for (i = 0; i < mbs; i++) {
2787: n = ii[1] - ii[0];
2788: ii++;
2789: ncols = n * bs;
2790: workt = work;
2791: for (j = 0; j < n; j++) {
2792: xb = x + bs * (*idx++);
2793: for (k = 0; k < bs; k++) workt[k] = xb[k];
2794: workt += bs;
2795: }
2796: if (usecprow) z = zarray + bs * ridx[i];
2797: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2798: v += n * bs2;
2799: if (!usecprow) z += bs;
2800: }
2801: VecRestoreArrayRead(xx, &x);
2802: VecRestoreArray(zz, &zarray);
2803: PetscLogFlops(2.0 * a->nz * bs2);
2804: return 0;
2805: }
2807: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2808: {
2809: PetscScalar zero = 0.0;
2811: VecSet(zz, zero);
2812: MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz);
2813: return 0;
2814: }
2816: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2817: {
2818: PetscScalar zero = 0.0;
2820: VecSet(zz, zero);
2821: MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz);
2822: return 0;
2823: }
2825: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2826: {
2827: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2828: PetscScalar *z, x1, x2, x3, x4, x5;
2829: const PetscScalar *x, *xb = NULL;
2830: const MatScalar *v;
2831: PetscInt mbs, i, rval, bs = A->rmap->bs, j, n;
2832: const PetscInt *idx, *ii, *ib, *ridx = NULL;
2833: Mat_CompressedRow cprow = a->compressedrow;
2834: PetscBool usecprow = cprow.use;
2836: if (yy != zz) VecCopy(yy, zz);
2837: VecGetArrayRead(xx, &x);
2838: VecGetArray(zz, &z);
2840: idx = a->j;
2841: v = a->a;
2842: if (usecprow) {
2843: mbs = cprow.nrows;
2844: ii = cprow.i;
2845: ridx = cprow.rindex;
2846: } else {
2847: mbs = a->mbs;
2848: ii = a->i;
2849: xb = x;
2850: }
2852: switch (bs) {
2853: case 1:
2854: for (i = 0; i < mbs; i++) {
2855: if (usecprow) xb = x + ridx[i];
2856: x1 = xb[0];
2857: ib = idx + ii[0];
2858: n = ii[1] - ii[0];
2859: ii++;
2860: for (j = 0; j < n; j++) {
2861: rval = ib[j];
2862: z[rval] += PetscConj(*v) * x1;
2863: v++;
2864: }
2865: if (!usecprow) xb++;
2866: }
2867: break;
2868: case 2:
2869: for (i = 0; i < mbs; i++) {
2870: if (usecprow) xb = x + 2 * ridx[i];
2871: x1 = xb[0];
2872: x2 = xb[1];
2873: ib = idx + ii[0];
2874: n = ii[1] - ii[0];
2875: ii++;
2876: for (j = 0; j < n; j++) {
2877: rval = ib[j] * 2;
2878: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2879: z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2880: v += 4;
2881: }
2882: if (!usecprow) xb += 2;
2883: }
2884: break;
2885: case 3:
2886: for (i = 0; i < mbs; i++) {
2887: if (usecprow) xb = x + 3 * ridx[i];
2888: x1 = xb[0];
2889: x2 = xb[1];
2890: x3 = xb[2];
2891: ib = idx + ii[0];
2892: n = ii[1] - ii[0];
2893: ii++;
2894: for (j = 0; j < n; j++) {
2895: rval = ib[j] * 3;
2896: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2897: z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2898: z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2899: v += 9;
2900: }
2901: if (!usecprow) xb += 3;
2902: }
2903: break;
2904: case 4:
2905: for (i = 0; i < mbs; i++) {
2906: if (usecprow) xb = x + 4 * ridx[i];
2907: x1 = xb[0];
2908: x2 = xb[1];
2909: x3 = xb[2];
2910: x4 = xb[3];
2911: ib = idx + ii[0];
2912: n = ii[1] - ii[0];
2913: ii++;
2914: for (j = 0; j < n; j++) {
2915: rval = ib[j] * 4;
2916: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2917: z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2918: z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2919: z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2920: v += 16;
2921: }
2922: if (!usecprow) xb += 4;
2923: }
2924: break;
2925: case 5:
2926: for (i = 0; i < mbs; i++) {
2927: if (usecprow) xb = x + 5 * ridx[i];
2928: x1 = xb[0];
2929: x2 = xb[1];
2930: x3 = xb[2];
2931: x4 = xb[3];
2932: x5 = xb[4];
2933: ib = idx + ii[0];
2934: n = ii[1] - ii[0];
2935: ii++;
2936: for (j = 0; j < n; j++) {
2937: rval = ib[j] * 5;
2938: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2939: z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2940: z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2941: z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2942: z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2943: v += 25;
2944: }
2945: if (!usecprow) xb += 5;
2946: }
2947: break;
2948: default: /* block sizes larger than 5 by 5 are handled by BLAS */
2949: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2950: #if 0
2951: {
2952: PetscInt ncols,k,bs2=a->bs2;
2953: PetscScalar *work,*workt,zb;
2954: const PetscScalar *xtmp;
2955: if (!a->mult_work) {
2956: k = PetscMax(A->rmap->n,A->cmap->n);
2957: PetscMalloc1(k+1,&a->mult_work);
2958: }
2959: work = a->mult_work;
2960: xtmp = x;
2961: for (i=0; i<mbs; i++) {
2962: n = ii[1] - ii[0]; ii++;
2963: ncols = n*bs;
2964: PetscArrayzero(work,ncols);
2965: if (usecprow) xtmp = x + bs*ridx[i];
2966: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2967: v += n*bs2;
2968: if (!usecprow) xtmp += bs;
2969: workt = work;
2970: for (j=0; j<n; j++) {
2971: zb = z + bs*(*idx++);
2972: for (k=0; k<bs; k++) zb[k] += workt[k] ;
2973: workt += bs;
2974: }
2975: }
2976: }
2977: #endif
2978: }
2979: VecRestoreArrayRead(xx, &x);
2980: VecRestoreArray(zz, &z);
2981: PetscLogFlops(2.0 * a->nz * a->bs2);
2982: return 0;
2983: }
2985: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2986: {
2987: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2988: PetscScalar *zb, *z, x1, x2, x3, x4, x5;
2989: const PetscScalar *x, *xb = NULL;
2990: const MatScalar *v;
2991: PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2992: const PetscInt *idx, *ii, *ib, *ridx = NULL;
2993: Mat_CompressedRow cprow = a->compressedrow;
2994: PetscBool usecprow = cprow.use;
2996: if (yy != zz) VecCopy(yy, zz);
2997: VecGetArrayRead(xx, &x);
2998: VecGetArray(zz, &z);
3000: idx = a->j;
3001: v = a->a;
3002: if (usecprow) {
3003: mbs = cprow.nrows;
3004: ii = cprow.i;
3005: ridx = cprow.rindex;
3006: } else {
3007: mbs = a->mbs;
3008: ii = a->i;
3009: xb = x;
3010: }
3012: switch (bs) {
3013: case 1:
3014: for (i = 0; i < mbs; i++) {
3015: if (usecprow) xb = x + ridx[i];
3016: x1 = xb[0];
3017: ib = idx + ii[0];
3018: n = ii[1] - ii[0];
3019: ii++;
3020: for (j = 0; j < n; j++) {
3021: rval = ib[j];
3022: z[rval] += *v * x1;
3023: v++;
3024: }
3025: if (!usecprow) xb++;
3026: }
3027: break;
3028: case 2:
3029: for (i = 0; i < mbs; i++) {
3030: if (usecprow) xb = x + 2 * ridx[i];
3031: x1 = xb[0];
3032: x2 = xb[1];
3033: ib = idx + ii[0];
3034: n = ii[1] - ii[0];
3035: ii++;
3036: for (j = 0; j < n; j++) {
3037: rval = ib[j] * 2;
3038: z[rval++] += v[0] * x1 + v[1] * x2;
3039: z[rval++] += v[2] * x1 + v[3] * x2;
3040: v += 4;
3041: }
3042: if (!usecprow) xb += 2;
3043: }
3044: break;
3045: case 3:
3046: for (i = 0; i < mbs; i++) {
3047: if (usecprow) xb = x + 3 * ridx[i];
3048: x1 = xb[0];
3049: x2 = xb[1];
3050: x3 = xb[2];
3051: ib = idx + ii[0];
3052: n = ii[1] - ii[0];
3053: ii++;
3054: for (j = 0; j < n; j++) {
3055: rval = ib[j] * 3;
3056: z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3057: z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3058: z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3059: v += 9;
3060: }
3061: if (!usecprow) xb += 3;
3062: }
3063: break;
3064: case 4:
3065: for (i = 0; i < mbs; i++) {
3066: if (usecprow) xb = x + 4 * ridx[i];
3067: x1 = xb[0];
3068: x2 = xb[1];
3069: x3 = xb[2];
3070: x4 = xb[3];
3071: ib = idx + ii[0];
3072: n = ii[1] - ii[0];
3073: ii++;
3074: for (j = 0; j < n; j++) {
3075: rval = ib[j] * 4;
3076: z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3077: z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3078: z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3079: z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3080: v += 16;
3081: }
3082: if (!usecprow) xb += 4;
3083: }
3084: break;
3085: case 5:
3086: for (i = 0; i < mbs; i++) {
3087: if (usecprow) xb = x + 5 * ridx[i];
3088: x1 = xb[0];
3089: x2 = xb[1];
3090: x3 = xb[2];
3091: x4 = xb[3];
3092: x5 = xb[4];
3093: ib = idx + ii[0];
3094: n = ii[1] - ii[0];
3095: ii++;
3096: for (j = 0; j < n; j++) {
3097: rval = ib[j] * 5;
3098: z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3099: z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3100: z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3101: z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3102: z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3103: v += 25;
3104: }
3105: if (!usecprow) xb += 5;
3106: }
3107: break;
3108: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3109: PetscInt ncols, k;
3110: PetscScalar *work, *workt;
3111: const PetscScalar *xtmp;
3112: if (!a->mult_work) {
3113: k = PetscMax(A->rmap->n, A->cmap->n);
3114: PetscMalloc1(k + 1, &a->mult_work);
3115: }
3116: work = a->mult_work;
3117: xtmp = x;
3118: for (i = 0; i < mbs; i++) {
3119: n = ii[1] - ii[0];
3120: ii++;
3121: ncols = n * bs;
3122: PetscArrayzero(work, ncols);
3123: if (usecprow) xtmp = x + bs * ridx[i];
3124: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3125: v += n * bs2;
3126: if (!usecprow) xtmp += bs;
3127: workt = work;
3128: for (j = 0; j < n; j++) {
3129: zb = z + bs * (*idx++);
3130: for (k = 0; k < bs; k++) zb[k] += workt[k];
3131: workt += bs;
3132: }
3133: }
3134: }
3135: }
3136: VecRestoreArrayRead(xx, &x);
3137: VecRestoreArray(zz, &z);
3138: PetscLogFlops(2.0 * a->nz * a->bs2);
3139: return 0;
3140: }
3142: PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3143: {
3144: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
3145: PetscInt totalnz = a->bs2 * a->nz;
3146: PetscScalar oalpha = alpha;
3147: PetscBLASInt one = 1, tnz;
3149: PetscBLASIntCast(totalnz, &tnz);
3150: PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3151: PetscLogFlops(totalnz);
3152: return 0;
3153: }
3155: PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3156: {
3157: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3158: MatScalar *v = a->a;
3159: PetscReal sum = 0.0;
3160: PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
3162: if (type == NORM_FROBENIUS) {
3163: #if defined(PETSC_USE_REAL___FP16)
3164: PetscBLASInt one = 1, cnt = bs2 * nz;
3165: PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3166: #else
3167: for (i = 0; i < bs2 * nz; i++) {
3168: sum += PetscRealPart(PetscConj(*v) * (*v));
3169: v++;
3170: }
3171: #endif
3172: *norm = PetscSqrtReal(sum);
3173: PetscLogFlops(2.0 * bs2 * nz);
3174: } else if (type == NORM_1) { /* maximum column sum */
3175: PetscReal *tmp;
3176: PetscInt *bcol = a->j;
3177: PetscCalloc1(A->cmap->n + 1, &tmp);
3178: for (i = 0; i < nz; i++) {
3179: for (j = 0; j < bs; j++) {
3180: k1 = bs * (*bcol) + j; /* column index */
3181: for (k = 0; k < bs; k++) {
3182: tmp[k1] += PetscAbsScalar(*v);
3183: v++;
3184: }
3185: }
3186: bcol++;
3187: }
3188: *norm = 0.0;
3189: for (j = 0; j < A->cmap->n; j++) {
3190: if (tmp[j] > *norm) *norm = tmp[j];
3191: }
3192: PetscFree(tmp);
3193: PetscLogFlops(PetscMax(bs2 * nz - 1, 0));
3194: } else if (type == NORM_INFINITY) { /* maximum row sum */
3195: *norm = 0.0;
3196: for (k = 0; k < bs; k++) {
3197: for (j = 0; j < a->mbs; j++) {
3198: v = a->a + bs2 * a->i[j] + k;
3199: sum = 0.0;
3200: for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3201: for (k1 = 0; k1 < bs; k1++) {
3202: sum += PetscAbsScalar(*v);
3203: v += bs;
3204: }
3205: }
3206: if (sum > *norm) *norm = sum;
3207: }
3208: }
3209: PetscLogFlops(PetscMax(bs2 * nz - 1, 0));
3210: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3211: return 0;
3212: }
3214: PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3215: {
3216: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
3218: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
3219: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3220: *flg = PETSC_FALSE;
3221: return 0;
3222: }
3224: /* if the a->i are the same */
3225: PetscArraycmp(a->i, b->i, a->mbs + 1, flg);
3226: if (!*flg) return 0;
3228: /* if a->j are the same */
3229: PetscArraycmp(a->j, b->j, a->nz, flg);
3230: if (!*flg) return 0;
3232: /* if a->a are the same */
3233: PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg);
3234: return 0;
3235: }
3237: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3238: {
3239: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3240: PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3241: PetscScalar *x, zero = 0.0;
3242: MatScalar *aa, *aa_j;
3245: bs = A->rmap->bs;
3246: aa = a->a;
3247: ai = a->i;
3248: aj = a->j;
3249: ambs = a->mbs;
3250: bs2 = a->bs2;
3252: VecSet(v, zero);
3253: VecGetArray(v, &x);
3254: VecGetLocalSize(v, &n);
3256: for (i = 0; i < ambs; i++) {
3257: for (j = ai[i]; j < ai[i + 1]; j++) {
3258: if (aj[j] == i) {
3259: row = i * bs;
3260: aa_j = aa + j * bs2;
3261: for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3262: break;
3263: }
3264: }
3265: }
3266: VecRestoreArray(v, &x);
3267: return 0;
3268: }
3270: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3271: {
3272: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3273: const PetscScalar *l, *r, *li, *ri;
3274: PetscScalar x;
3275: MatScalar *aa, *v;
3276: PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3277: const PetscInt *ai, *aj;
3279: ai = a->i;
3280: aj = a->j;
3281: aa = a->a;
3282: m = A->rmap->n;
3283: n = A->cmap->n;
3284: bs = A->rmap->bs;
3285: mbs = a->mbs;
3286: bs2 = a->bs2;
3287: if (ll) {
3288: VecGetArrayRead(ll, &l);
3289: VecGetLocalSize(ll, &lm);
3291: for (i = 0; i < mbs; i++) { /* for each block row */
3292: M = ai[i + 1] - ai[i];
3293: li = l + i * bs;
3294: v = aa + bs2 * ai[i];
3295: for (j = 0; j < M; j++) { /* for each block */
3296: for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3297: }
3298: }
3299: VecRestoreArrayRead(ll, &l);
3300: PetscLogFlops(a->nz);
3301: }
3303: if (rr) {
3304: VecGetArrayRead(rr, &r);
3305: VecGetLocalSize(rr, &rn);
3307: for (i = 0; i < mbs; i++) { /* for each block row */
3308: iai = ai[i];
3309: M = ai[i + 1] - iai;
3310: v = aa + bs2 * iai;
3311: for (j = 0; j < M; j++) { /* for each block */
3312: ri = r + bs * aj[iai + j];
3313: for (k = 0; k < bs; k++) {
3314: x = ri[k];
3315: for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3316: v += bs;
3317: }
3318: }
3319: }
3320: VecRestoreArrayRead(rr, &r);
3321: PetscLogFlops(a->nz);
3322: }
3323: return 0;
3324: }
3326: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3327: {
3328: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3330: info->block_size = a->bs2;
3331: info->nz_allocated = a->bs2 * a->maxnz;
3332: info->nz_used = a->bs2 * a->nz;
3333: info->nz_unneeded = info->nz_allocated - info->nz_used;
3334: info->assemblies = A->num_ass;
3335: info->mallocs = A->info.mallocs;
3336: info->memory = 0; /* REVIEW ME */
3337: if (A->factortype) {
3338: info->fill_ratio_given = A->info.fill_ratio_given;
3339: info->fill_ratio_needed = A->info.fill_ratio_needed;
3340: info->factor_mallocs = A->info.factor_mallocs;
3341: } else {
3342: info->fill_ratio_given = 0;
3343: info->fill_ratio_needed = 0;
3344: info->factor_mallocs = 0;
3345: }
3346: return 0;
3347: }
3349: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3350: {
3351: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3353: PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]);
3354: return 0;
3355: }
3357: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3358: {
3359: MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C);
3360: C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3361: return 0;
3362: }
3364: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3365: {
3366: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3367: PetscScalar *z = NULL, sum1;
3368: const PetscScalar *xb;
3369: PetscScalar x1;
3370: const MatScalar *v, *vv;
3371: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3372: PetscBool usecprow = a->compressedrow.use;
3374: idx = a->j;
3375: v = a->a;
3376: if (usecprow) {
3377: mbs = a->compressedrow.nrows;
3378: ii = a->compressedrow.i;
3379: ridx = a->compressedrow.rindex;
3380: } else {
3381: mbs = a->mbs;
3382: ii = a->i;
3383: z = c;
3384: }
3386: for (i = 0; i < mbs; i++) {
3387: n = ii[1] - ii[0];
3388: ii++;
3389: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3390: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3391: if (usecprow) z = c + ridx[i];
3392: jj = idx;
3393: vv = v;
3394: for (k = 0; k < cn; k++) {
3395: idx = jj;
3396: v = vv;
3397: sum1 = 0.0;
3398: for (j = 0; j < n; j++) {
3399: xb = b + (*idx++);
3400: x1 = xb[0 + k * bm];
3401: sum1 += v[0] * x1;
3402: v += 1;
3403: }
3404: z[0 + k * cm] = sum1;
3405: }
3406: if (!usecprow) z += 1;
3407: }
3408: return 0;
3409: }
3411: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3412: {
3413: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3414: PetscScalar *z = NULL, sum1, sum2;
3415: const PetscScalar *xb;
3416: PetscScalar x1, x2;
3417: const MatScalar *v, *vv;
3418: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3419: PetscBool usecprow = a->compressedrow.use;
3421: idx = a->j;
3422: v = a->a;
3423: if (usecprow) {
3424: mbs = a->compressedrow.nrows;
3425: ii = a->compressedrow.i;
3426: ridx = a->compressedrow.rindex;
3427: } else {
3428: mbs = a->mbs;
3429: ii = a->i;
3430: z = c;
3431: }
3433: for (i = 0; i < mbs; i++) {
3434: n = ii[1] - ii[0];
3435: ii++;
3436: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3437: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3438: if (usecprow) z = c + 2 * ridx[i];
3439: jj = idx;
3440: vv = v;
3441: for (k = 0; k < cn; k++) {
3442: idx = jj;
3443: v = vv;
3444: sum1 = 0.0;
3445: sum2 = 0.0;
3446: for (j = 0; j < n; j++) {
3447: xb = b + 2 * (*idx++);
3448: x1 = xb[0 + k * bm];
3449: x2 = xb[1 + k * bm];
3450: sum1 += v[0] * x1 + v[2] * x2;
3451: sum2 += v[1] * x1 + v[3] * x2;
3452: v += 4;
3453: }
3454: z[0 + k * cm] = sum1;
3455: z[1 + k * cm] = sum2;
3456: }
3457: if (!usecprow) z += 2;
3458: }
3459: return 0;
3460: }
3462: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3463: {
3464: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3465: PetscScalar *z = NULL, sum1, sum2, sum3;
3466: const PetscScalar *xb;
3467: PetscScalar x1, x2, x3;
3468: const MatScalar *v, *vv;
3469: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3470: PetscBool usecprow = a->compressedrow.use;
3472: idx = a->j;
3473: v = a->a;
3474: if (usecprow) {
3475: mbs = a->compressedrow.nrows;
3476: ii = a->compressedrow.i;
3477: ridx = a->compressedrow.rindex;
3478: } else {
3479: mbs = a->mbs;
3480: ii = a->i;
3481: z = c;
3482: }
3484: for (i = 0; i < mbs; i++) {
3485: n = ii[1] - ii[0];
3486: ii++;
3487: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3488: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3489: if (usecprow) z = c + 3 * ridx[i];
3490: jj = idx;
3491: vv = v;
3492: for (k = 0; k < cn; k++) {
3493: idx = jj;
3494: v = vv;
3495: sum1 = 0.0;
3496: sum2 = 0.0;
3497: sum3 = 0.0;
3498: for (j = 0; j < n; j++) {
3499: xb = b + 3 * (*idx++);
3500: x1 = xb[0 + k * bm];
3501: x2 = xb[1 + k * bm];
3502: x3 = xb[2 + k * bm];
3503: sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3504: sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3505: sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3506: v += 9;
3507: }
3508: z[0 + k * cm] = sum1;
3509: z[1 + k * cm] = sum2;
3510: z[2 + k * cm] = sum3;
3511: }
3512: if (!usecprow) z += 3;
3513: }
3514: return 0;
3515: }
3517: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3518: {
3519: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3520: PetscScalar *z = NULL, sum1, sum2, sum3, sum4;
3521: const PetscScalar *xb;
3522: PetscScalar x1, x2, x3, x4;
3523: const MatScalar *v, *vv;
3524: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3525: PetscBool usecprow = a->compressedrow.use;
3527: idx = a->j;
3528: v = a->a;
3529: if (usecprow) {
3530: mbs = a->compressedrow.nrows;
3531: ii = a->compressedrow.i;
3532: ridx = a->compressedrow.rindex;
3533: } else {
3534: mbs = a->mbs;
3535: ii = a->i;
3536: z = c;
3537: }
3539: for (i = 0; i < mbs; i++) {
3540: n = ii[1] - ii[0];
3541: ii++;
3542: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3543: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3544: if (usecprow) z = c + 4 * ridx[i];
3545: jj = idx;
3546: vv = v;
3547: for (k = 0; k < cn; k++) {
3548: idx = jj;
3549: v = vv;
3550: sum1 = 0.0;
3551: sum2 = 0.0;
3552: sum3 = 0.0;
3553: sum4 = 0.0;
3554: for (j = 0; j < n; j++) {
3555: xb = b + 4 * (*idx++);
3556: x1 = xb[0 + k * bm];
3557: x2 = xb[1 + k * bm];
3558: x3 = xb[2 + k * bm];
3559: x4 = xb[3 + k * bm];
3560: sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3561: sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3562: sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3563: sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3564: v += 16;
3565: }
3566: z[0 + k * cm] = sum1;
3567: z[1 + k * cm] = sum2;
3568: z[2 + k * cm] = sum3;
3569: z[3 + k * cm] = sum4;
3570: }
3571: if (!usecprow) z += 4;
3572: }
3573: return 0;
3574: }
3576: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3577: {
3578: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3579: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5;
3580: const PetscScalar *xb;
3581: PetscScalar x1, x2, x3, x4, x5;
3582: const MatScalar *v, *vv;
3583: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3584: PetscBool usecprow = a->compressedrow.use;
3586: idx = a->j;
3587: v = a->a;
3588: if (usecprow) {
3589: mbs = a->compressedrow.nrows;
3590: ii = a->compressedrow.i;
3591: ridx = a->compressedrow.rindex;
3592: } else {
3593: mbs = a->mbs;
3594: ii = a->i;
3595: z = c;
3596: }
3598: for (i = 0; i < mbs; i++) {
3599: n = ii[1] - ii[0];
3600: ii++;
3601: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3602: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3603: if (usecprow) z = c + 5 * ridx[i];
3604: jj = idx;
3605: vv = v;
3606: for (k = 0; k < cn; k++) {
3607: idx = jj;
3608: v = vv;
3609: sum1 = 0.0;
3610: sum2 = 0.0;
3611: sum3 = 0.0;
3612: sum4 = 0.0;
3613: sum5 = 0.0;
3614: for (j = 0; j < n; j++) {
3615: xb = b + 5 * (*idx++);
3616: x1 = xb[0 + k * bm];
3617: x2 = xb[1 + k * bm];
3618: x3 = xb[2 + k * bm];
3619: x4 = xb[3 + k * bm];
3620: x5 = xb[4 + k * bm];
3621: sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3622: sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3623: sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3624: sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3625: sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3626: v += 25;
3627: }
3628: z[0 + k * cm] = sum1;
3629: z[1 + k * cm] = sum2;
3630: z[2 + k * cm] = sum3;
3631: z[3 + k * cm] = sum4;
3632: z[4 + k * cm] = sum5;
3633: }
3634: if (!usecprow) z += 5;
3635: }
3636: return 0;
3637: }
3639: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3640: {
3641: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3642: Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
3643: Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
3644: PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3645: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3646: PetscBLASInt bbs, bcn, bbm, bcm;
3647: PetscScalar *z = NULL;
3648: PetscScalar *c, *b;
3649: const MatScalar *v;
3650: const PetscInt *idx, *ii, *ridx = NULL;
3651: PetscScalar _DZero = 0.0, _DOne = 1.0;
3652: PetscBool usecprow = a->compressedrow.use;
3654: if (!cm || !cn) return 0;
3658: b = bd->v;
3659: if (a->nonzerorowcnt != A->rmap->n) MatZeroEntries(C);
3660: MatDenseGetArray(C, &c);
3661: switch (bs) {
3662: case 1:
3663: MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
3664: break;
3665: case 2:
3666: MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
3667: break;
3668: case 3:
3669: MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
3670: break;
3671: case 4:
3672: MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
3673: break;
3674: case 5:
3675: MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
3676: break;
3677: default: /* block sizes larger than 5 by 5 are handled by BLAS */
3678: PetscBLASIntCast(bs, &bbs);
3679: PetscBLASIntCast(cn, &bcn);
3680: PetscBLASIntCast(bm, &bbm);
3681: PetscBLASIntCast(cm, &bcm);
3682: idx = a->j;
3683: v = a->a;
3684: if (usecprow) {
3685: mbs = a->compressedrow.nrows;
3686: ii = a->compressedrow.i;
3687: ridx = a->compressedrow.rindex;
3688: } else {
3689: mbs = a->mbs;
3690: ii = a->i;
3691: z = c;
3692: }
3693: for (i = 0; i < mbs; i++) {
3694: n = ii[1] - ii[0];
3695: ii++;
3696: if (usecprow) z = c + bs * ridx[i];
3697: if (n) {
3698: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3699: v += bs2;
3700: }
3701: for (j = 1; j < n; j++) {
3702: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3703: v += bs2;
3704: }
3705: if (!usecprow) z += bs;
3706: }
3707: }
3708: MatDenseRestoreArray(C, &c);
3709: PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn);
3710: return 0;
3711: }