Actual source code: mpiov.c
1: /*
2: Routines to compute overlapping regions of a parallel MPI matrix
3:    and to find submatrices that are shared across processors.
4: */
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
7: #include <petscbt.h>
8: #include <petscsf.h>
10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscTable *);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);
21: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
22: {
23: PetscInt i;
26: for (i = 0; i < ov; ++i) MatIncreaseOverlap_MPIAIJ_Once(C, imax, is);
27: return 0;
28: }
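/* Annotation (not part of the library source): a minimal sketch of how a caller
   typically drives the routine above, assuming C is an assembled MATMPIAIJ and
   nloc/rows (placeholder names) describe this rank's rows of interest:

     IS  is;
     Mat *submat;
     ISCreateGeneral(PETSC_COMM_SELF, nloc, rows, PETSC_COPY_VALUES, &is);
     MatIncreaseOverlap(C, 1, &is, 2);                                   -- grow the index set by 2 levels
     MatCreateSubMatrices(C, 1, &is, &is, MAT_INITIAL_MATRIX, &submat);  -- extract the overlapped block
     MatDestroySubMatrices(1, &submat);
     ISDestroy(&is);
*/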
30: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
31: {
32: PetscInt i;
35: for (i = 0; i < ov; ++i) MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is);
36: return 0;
37: }
39: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
40: {
41: MPI_Comm comm;
42: PetscInt *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
43: PetscInt *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
44: PetscInt nrecvrows, *sbsizes = NULL, *sbdata = NULL;
45: const PetscInt *indices_i, **indices;
46: PetscLayout rmap;
47: PetscMPIInt rank, size, *toranks, *fromranks, nto, nfrom, owner;
48: PetscSF sf;
49: PetscSFNode *remote;
51: PetscObjectGetComm((PetscObject)mat, &comm);
52: MPI_Comm_rank(comm, &rank);
53: MPI_Comm_size(comm, &size);
54: /* get row map to determine where rows should be going */
55: MatGetLayouts(mat, &rmap, NULL);
56:   /* retrieve IS data and put it all together so that we
57: * can optimize communication
58: * */
59: PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length);
60: for (i = 0, tlength = 0; i < nidx; i++) {
61: ISGetLocalSize(is[i], &length[i]);
62: tlength += length[i];
63: ISGetIndices(is[i], &indices[i]);
64: }
65: /* find these rows on remote processors */
66: PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids);
67: PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp);
68: nrrows = 0;
69: for (i = 0; i < nidx; i++) {
70: length_i = length[i];
71: indices_i = indices[i];
72: for (j = 0; j < length_i; j++) {
73: owner = -1;
74: PetscLayoutFindOwner(rmap, indices_i[j], &owner);
75: /* remote processors */
76: if (owner != rank) {
77: tosizes_temp[owner]++; /* number of rows to owner */
78: rrow_ranks[nrrows] = owner; /* processor */
79: rrow_isids[nrrows] = i; /* is id */
80: remoterows[nrrows++] = indices_i[j]; /* row */
81: }
82: }
83: ISRestoreIndices(is[i], &indices[i]);
84: }
85: PetscFree2(*(PetscInt ***)&indices, length);
86: /* test if we need to exchange messages
87: * generally speaking, we do not need to exchange
88: * data when overlap is 1
89: * */
90: MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm);
91: /* we do not have any messages
92: * It usually corresponds to overlap 1
93: * */
94: if (!reducednrrows) {
95: PetscFree3(toranks, tosizes, tosizes_temp);
96: PetscFree3(remoterows, rrow_ranks, rrow_isids);
97: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is);
98: return 0;
99: }
100: nto = 0;
101:   /* send sizes and ranks for building a two-sided communication */
102: for (i = 0; i < size; i++) {
103: if (tosizes_temp[i]) {
104: tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
105: tosizes_temp[i] = nto; /* a map from processor to index */
106: toranks[nto++] = i; /* processor */
107: }
108: }
109: PetscMalloc1(nto + 1, &toffsets);
110: toffsets[0] = 0;
111: for (i = 0; i < nto; i++) {
112: toffsets[i + 1] = toffsets[i] + tosizes[2 * i]; /* offsets */
113: tosizes[2 * i + 1] = toffsets[i]; /* offsets to send */
114: }
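/* Annotation: example of the layout built above. If this rank needs 3 remote rows
   owned by rank 1 and 2 remote rows owned by rank 4, then nto = 2 and, since each
   row occupies two entries (<is id, row>),
     toranks  = {1, 4}
     tosizes  = {6, 0, 4, 6}   -- <length, offset into todata> per destination
     toffsets = {0, 6, 10}     -- running offsets used while packing todata
*/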
115: /* send information to other processors */
116: PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes);
117: nrecvrows = 0;
118: for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
119: PetscMalloc1(nrecvrows, &remote);
120: nrecvrows = 0;
121: for (i = 0; i < nfrom; i++) {
122: for (j = 0; j < fromsizes[2 * i]; j++) {
123: remote[nrecvrows].rank = fromranks[i];
124: remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
125: }
126: }
127: PetscSFCreate(comm, &sf);
128: PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER);
129:   /* use two-sided communication by default since Open MPI has had bugs in its one-sided implementation */
130: PetscSFSetType(sf, PETSCSFBASIC);
131: PetscSFSetFromOptions(sf);
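/* Annotation: each leaf of this star forest is one slot of fromdata on the
   receiving rank, and its root is the <sending rank, offset> location inside the
   sender's packed todata buffer (the offsets were exchanged above as
   tosizes[2*i+1]), so the broadcast below simply moves todata into fromdata */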
132: /* message pair <no of is, row> */
133: PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata);
134: for (i = 0; i < nrrows; i++) {
135: owner = rrow_ranks[i]; /* processor */
136: j = tosizes_temp[owner]; /* index */
137: todata[toffsets[j]++] = rrow_isids[i];
138: todata[toffsets[j]++] = remoterows[i];
139: }
140: PetscFree3(toranks, tosizes, tosizes_temp);
141: PetscFree3(remoterows, rrow_ranks, rrow_isids);
142: PetscFree(toffsets);
143: PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE);
144: PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE);
145: PetscSFDestroy(&sf);
146:   /* send rows belonging to the remote processes so that we can get the overlapping data back */
147: MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata);
148: PetscFree2(todata, fromdata);
149: PetscFree(fromsizes);
150: PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes);
151: PetscFree(fromranks);
152: nrecvrows = 0;
153: for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
154: PetscCalloc1(nrecvrows, &todata);
155: PetscMalloc1(nrecvrows, &remote);
156: nrecvrows = 0;
157: for (i = 0; i < nto; i++) {
158: for (j = 0; j < tosizes[2 * i]; j++) {
159: remote[nrecvrows].rank = toranks[i];
160: remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
161: }
162: }
163: PetscSFCreate(comm, &sf);
164: PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER);
165:   /* use two-sided communication by default since Open MPI has had bugs in its one-sided implementation */
166: PetscSFSetType(sf, PETSCSFBASIC);
167: PetscSFSetFromOptions(sf);
168: /* overlap communication and computation */
169: PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE);
170: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is);
171: PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE);
172: PetscSFDestroy(&sf);
173: PetscFree2(sbdata, sbsizes);
174: MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata);
175: PetscFree(toranks);
176: PetscFree(tosizes);
177: PetscFree(todata);
178: return 0;
179: }
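/* Annotation: summary of the exchange implemented above: (1) each rank packs the
   rows it needs but does not own as <is id, row> pairs; (2) PetscCommBuildTwoSided
   discovers the communication pattern; (3) a PetscSF broadcast delivers the
   requests to the owning ranks; (4) the owners expand every requested row into its
   column indices and reply with <is id, count, columns> triples; (5) the replies
   are merged into the local index sets by MatIncreaseOverlap_MPIAIJ_Receive_Scalable */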
181: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
182: {
183: PetscInt *isz, isz_i, i, j, is_id, data_size;
184: PetscInt col, lsize, max_lsize, *indices_temp, *indices_i;
185: const PetscInt *indices_i_temp;
186: MPI_Comm *iscomms;
188: max_lsize = 0;
189: PetscMalloc1(nidx, &isz);
190: for (i = 0; i < nidx; i++) {
191: ISGetLocalSize(is[i], &lsize);
192: max_lsize = lsize > max_lsize ? lsize : max_lsize;
193: isz[i] = lsize;
194: }
195: PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms);
196: for (i = 0; i < nidx; i++) {
197: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL);
198: ISGetIndices(is[i], &indices_i_temp);
199: PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]);
200: ISRestoreIndices(is[i], &indices_i_temp);
201: ISDestroy(&is[i]);
202: }
203: /* retrieve information to get row id and its overlap */
204: for (i = 0; i < nrecvs;) {
205: is_id = recvdata[i++];
206: data_size = recvdata[i++];
207: indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
208: isz_i = isz[is_id];
209: for (j = 0; j < data_size; j++) {
210: col = recvdata[i++];
211: indices_i[isz_i++] = col;
212: }
213: isz[is_id] = isz_i;
214: }
215:   /* remove duplicate entries */
216: for (i = 0; i < nidx; i++) {
217: indices_i = indices_temp + (max_lsize + nrecvs) * i;
218: isz_i = isz[i];
219: PetscSortRemoveDupsInt(&isz_i, indices_i);
220: ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]);
221: PetscCommDestroy(&iscomms[i]);
222: }
223: PetscFree(isz);
224: PetscFree2(indices_temp, iscomms);
225: return 0;
226: }
228: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
229: {
230: PetscLayout rmap, cmap;
231: PetscInt i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
232: PetscInt is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
233: PetscInt *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
234: const PetscInt *gcols, *ai, *aj, *bi, *bj;
235: Mat amat, bmat;
236: PetscMPIInt rank;
237: PetscBool done;
238: MPI_Comm comm;
240: PetscObjectGetComm((PetscObject)mat, &comm);
241: MPI_Comm_rank(comm, &rank);
242: MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols);
243:   /* Even if the matrix is symmetric, we still treat it as if it were not */
244: MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
246: MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
248: /* total number of nonzero values is used to estimate the memory usage in the next step */
249: tnz = ai[an] + bi[bn];
250: MatGetLayouts(mat, &rmap, &cmap);
251: PetscLayoutGetRange(rmap, &rstart, NULL);
252: PetscLayoutGetRange(cmap, &cstart, NULL);
253: /* to find the longest message */
254: max_fszs = 0;
255: for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
256:   /* is there a better way to estimate the number of nonzeros in the matrix? */
257: PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp);
258: for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
259: rows_pos = 0;
260: totalrows = 0;
261: for (i = 0; i < nfrom; i++) {
262: PetscArrayzero(rows_pos_i, nidx);
263: /* group data together */
264: for (j = 0; j < fromsizes[2 * i]; j += 2) {
265: is_id = fromrows[rows_pos++]; /* no of is */
266: rows_i = rows_data[is_id];
267: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
268: }
269: /* estimate a space to avoid multiple allocations */
270: for (j = 0; j < nidx; j++) {
271: indvc_ij = 0;
272: rows_i = rows_data[j];
273: for (l = 0; l < rows_pos_i[j]; l++) {
274: row = rows_i[l] - rstart;
275: start = ai[row];
276: end = ai[row + 1];
277: for (k = start; k < end; k++) { /* Amat */
278: col = aj[k] + cstart;
279: indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
280: }
281: start = bi[row];
282: end = bi[row + 1];
283: for (k = start; k < end; k++) { /* Bmat */
284: col = gcols[bj[k]];
285: indices_tmp[indvc_ij++] = col;
286: }
287: }
288: PetscSortRemoveDupsInt(&indvc_ij, indices_tmp);
289: indv_counts[i * nidx + j] = indvc_ij;
290: totalrows += indvc_ij;
291: }
292: }
293: /* message triple <no of is, number of rows, rows> */
294: PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes);
295: totalrows = 0;
296: rows_pos = 0;
297:   /* repeat the traversal above, this time filling in the message data */
298: for (i = 0; i < nfrom; i++) {
299: PetscArrayzero(rows_pos_i, nidx);
300: for (j = 0; j < fromsizes[2 * i]; j += 2) {
301: is_id = fromrows[rows_pos++];
302: rows_i = rows_data[is_id];
303: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
304: }
305: /* add data */
306: for (j = 0; j < nidx; j++) {
307: if (!indv_counts[i * nidx + j]) continue;
308: indvc_ij = 0;
309: sbdata[totalrows++] = j;
310: sbdata[totalrows++] = indv_counts[i * nidx + j];
311: sbsizes[2 * i] += 2;
312: rows_i = rows_data[j];
313: for (l = 0; l < rows_pos_i[j]; l++) {
314: row = rows_i[l] - rstart;
315: start = ai[row];
316: end = ai[row + 1];
317: for (k = start; k < end; k++) { /* Amat */
318: col = aj[k] + cstart;
319: indices_tmp[indvc_ij++] = col;
320: }
321: start = bi[row];
322: end = bi[row + 1];
323: for (k = start; k < end; k++) { /* Bmat */
324: col = gcols[bj[k]];
325: indices_tmp[indvc_ij++] = col;
326: }
327: }
328: PetscSortRemoveDupsInt(&indvc_ij, indices_tmp);
329: sbsizes[2 * i] += indvc_ij;
330: PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij);
331: totalrows += indvc_ij;
332: }
333: }
334: PetscMalloc1(nfrom + 1, &offsets);
335: offsets[0] = 0;
336: for (i = 0; i < nfrom; i++) {
337: offsets[i + 1] = offsets[i] + sbsizes[2 * i];
338: sbsizes[2 * i + 1] = offsets[i];
339: }
340: PetscFree(offsets);
341: if (sbrowsizes) *sbrowsizes = sbsizes;
342: if (sbrows) *sbrows = sbdata;
343: PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp);
344: MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
345: MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
346: return 0;
347: }
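/* Annotation: example of the reply layout produced above. If, for the message
   received from fromranks[0], is 0 expands to columns {3,7,9} and is 2 expands to
   columns {5,6}, then
     sbdata  = {0, 3, 3, 7, 9, 2, 2, 5, 6, ...}   -- triples <is id, count, columns>
     sbsizes = {9, 0, ...}                        -- <length, offset> per destination
*/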
349: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
350: {
351: const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
352: PetscInt tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
353: PetscInt lsize, lsize_tmp;
354: PetscMPIInt rank, owner;
355: Mat amat, bmat;
356: PetscBool done;
357: PetscLayout cmap, rmap;
358: MPI_Comm comm;
360: PetscObjectGetComm((PetscObject)mat, &comm);
361: MPI_Comm_rank(comm, &rank);
362: MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols);
363: MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
365: MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
367:   /* is this a safe way to compute the number of nonzero values? */
368: tnz = ai[an] + bi[bn];
369: MatGetLayouts(mat, &rmap, &cmap);
370: PetscLayoutGetRange(rmap, &rstart, NULL);
371: PetscLayoutGetRange(cmap, &cstart, NULL);
372:   /* this is a better way to estimate memory than the old implementation,
373:    * which used the global size of the matrix
374:    * */
375: PetscMalloc1(tnz, &indices_temp);
376: for (i = 0; i < nidx; i++) {
377: MPI_Comm iscomm;
379: ISGetLocalSize(is[i], &lsize);
380: ISGetIndices(is[i], &indices);
381: lsize_tmp = 0;
382: for (j = 0; j < lsize; j++) {
383: owner = -1;
384: row = indices[j];
385: PetscLayoutFindOwner(rmap, row, &owner);
386: if (owner != rank) continue;
387: /* local number */
388: row -= rstart;
389: start = ai[row];
390: end = ai[row + 1];
391: for (k = start; k < end; k++) { /* Amat */
392: col = aj[k] + cstart;
393: indices_temp[lsize_tmp++] = col;
394: }
395: start = bi[row];
396: end = bi[row + 1];
397: for (k = start; k < end; k++) { /* Bmat */
398: col = gcols[bj[k]];
399: indices_temp[lsize_tmp++] = col;
400: }
401: }
402: ISRestoreIndices(is[i], &indices);
403: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL);
404: ISDestroy(&is[i]);
405: PetscSortRemoveDupsInt(&lsize_tmp, indices_temp);
406: ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]);
407: PetscCommDestroy(&iscomm);
408: }
409: PetscFree(indices_temp);
410: MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done);
411: MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done);
412: return 0;
413: }
415: /*
416: Sample message format:
417: If a processor A wants processor B to process some elements corresponding
418: to index sets is[1],is[5]
419: mesg [0] = 2 (no of index sets in the mesg)
420: -----------
421: mesg [1] = 1 => is[1]
422: mesg [2] = sizeof(is[1]);
423: -----------
424: mesg [3] = 5 => is[5]
425: mesg [4] = sizeof(is[5]);
426: -----------
427:        mesg [5]
428:        mesg [n]    => data for is[1]
429:            -----------
430:        mesg [n+1]
431:        mesg [m]    => data for is[5]
432:            -----------
434: Notes:
435: nrqs - no of requests sent (or to be sent out)
436: nrqr - no of requests received (which have to be or which have been processed)
437: */
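/*
   Concrete example (annotation): if processor A asks B to process rows {10,11,12}
   belonging to is[1] and rows {40,41} belonging to is[5], the outgoing message is
     mesg = {2,  1, 3,  5, 2,  10, 11, 12, 40, 41}
   i.e. the number of index sets, one <is index, no of rows> pair per index set,
   followed by the row indices grouped by index set.
*/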
438: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
439: {
440: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
441: PetscMPIInt *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
442: const PetscInt **idx, *idx_i;
443: PetscInt *n, **data, len;
444: #if defined(PETSC_USE_CTABLE)
445: PetscTable *table_data, table_data_i;
446: PetscInt *tdata, tcount, tcount_max;
447: #else
448: PetscInt *data_i, *d_p;
449: #endif
450: PetscMPIInt size, rank, tag1, tag2, proc = 0;
451: PetscInt M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
452: PetscInt *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
453: PetscBT *table;
454: MPI_Comm comm;
455: MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
456: MPI_Status *recv_status;
457: MPI_Comm *iscomms;
458: char *t_p;
460: PetscObjectGetComm((PetscObject)C, &comm);
461: size = c->size;
462: rank = c->rank;
463: M = C->rmap->N;
465: PetscObjectGetNewTag((PetscObject)C, &tag1);
466: PetscObjectGetNewTag((PetscObject)C, &tag2);
468: PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n);
470: for (i = 0; i < imax; i++) {
471: ISGetIndices(is[i], &idx[i]);
472: ISGetLocalSize(is[i], &n[i]);
473: }
475: /* evaluate communication - mesg to who,length of mesg, and buffer space
476: required. Based on this, buffers are allocated, and data copied into them */
477: PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4);
478: for (i = 0; i < imax; i++) {
479: PetscArrayzero(w4, size); /* initialise work vector*/
480: idx_i = idx[i];
481: len = n[i];
482: for (j = 0; j < len; j++) {
483: row = idx_i[j];
485: PetscLayoutFindOwner(C->rmap, row, &proc);
486: w4[proc]++;
487: }
488: for (j = 0; j < size; j++) {
489: if (w4[j]) {
490: w1[j] += w4[j];
491: w3[j]++;
492: }
493: }
494: }
496: nrqs = 0; /* no of outgoing messages */
497:   msz     = 0; /* total mesg length (for all procs) */
498:   w1[rank] = 0; /* no mesg sent to itself */
499: w3[rank] = 0;
500: for (i = 0; i < size; i++) {
501: if (w1[i]) {
502: w2[i] = 1;
503: nrqs++;
504: } /* there exists a message to proc i */
505: }
506: /* pa - is list of processors to communicate with */
507: PetscMalloc1(nrqs, &pa);
508: for (i = 0, j = 0; i < size; i++) {
509: if (w1[i]) {
510: pa[j] = i;
511: j++;
512: }
513: }
515: /* Each message would have a header = 1 + 2*(no of IS) + data */
516: for (i = 0; i < nrqs; i++) {
517: j = pa[i];
518: w1[j] += w2[j] + 2 * w3[j];
519: msz += w1[j];
520: }
522:   /* Determine the number of messages to expect, their lengths, and their from-ids */
523: PetscGatherNumberOfMessages(comm, w2, w1, &nrqr);
524: PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1);
526: /* Now post the Irecvs corresponding to these messages */
527: PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1);
529: /* Allocate Memory for outgoing messages */
530: PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr);
531: PetscArrayzero(outdat, size);
532: PetscArrayzero(ptr, size);
534: {
535: PetscInt *iptr = tmp, ict = 0;
536: for (i = 0; i < nrqs; i++) {
537: j = pa[i];
538: iptr += ict;
539: outdat[j] = iptr;
540: ict = w1[j];
541: }
542: }
544: /* Form the outgoing messages */
545: /* plug in the headers */
546: for (i = 0; i < nrqs; i++) {
547: j = pa[i];
548: outdat[j][0] = 0;
549: PetscArrayzero(outdat[j] + 1, 2 * w3[j]);
550: ptr[j] = outdat[j] + 2 * w3[j] + 1;
551: }
553: /* Memory for doing local proc's work */
554: {
555: PetscInt M_BPB_imax = 0;
556: #if defined(PETSC_USE_CTABLE)
557: PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax);
558: PetscMalloc1(imax, &table_data);
559: for (i = 0; i < imax; i++) PetscTableCreate(n[i], M, &table_data[i]);
560: PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p);
561: for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
562: #else
563: PetscInt Mimax = 0;
564: PetscIntMultError(M, imax, &Mimax);
565: PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax);
566: PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p);
567: for (i = 0; i < imax; i++) {
568: table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
569: data[i] = d_p + M * i;
570: }
571: #endif
572: }
574: /* Parse the IS and update local tables and the outgoing buf with the data */
575: {
576: PetscInt n_i, isz_i, *outdat_j, ctr_j;
577: PetscBT table_i;
579: for (i = 0; i < imax; i++) {
580: PetscArrayzero(ctr, size);
581: n_i = n[i];
582: table_i = table[i];
583: idx_i = idx[i];
584: #if defined(PETSC_USE_CTABLE)
585: table_data_i = table_data[i];
586: #else
587: data_i = data[i];
588: #endif
589: isz_i = isz[i];
590: for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
591: row = idx_i[j];
592: PetscLayoutFindOwner(C->rmap, row, &proc);
593: if (proc != rank) { /* copy to the outgoing buffer */
594: ctr[proc]++;
595: *ptr[proc] = row;
596: ptr[proc]++;
597: } else if (!PetscBTLookupSet(table_i, row)) {
598: #if defined(PETSC_USE_CTABLE)
599: PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES);
600: #else
601: data_i[isz_i] = row; /* Update the local table */
602: #endif
603: isz_i++;
604: }
605: }
606: /* Update the headers for the current IS */
607: for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
608: if ((ctr_j = ctr[j])) {
609: outdat_j = outdat[j];
610: k = ++outdat_j[0];
611: outdat_j[2 * k] = ctr_j;
612: outdat_j[2 * k - 1] = i;
613: }
614: }
615: isz[i] = isz_i;
616: }
617: }
619: /* Now post the sends */
620: PetscMalloc1(nrqs, &s_waits1);
621: for (i = 0; i < nrqs; ++i) {
622: j = pa[i];
623: MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i);
624: }
626: /* No longer need the original indices */
627: PetscMalloc1(imax, &iscomms);
628: for (i = 0; i < imax; ++i) {
629: ISRestoreIndices(is[i], idx + i);
630: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL);
631: }
632: PetscFree2(*(PetscInt ***)&idx, n);
634: for (i = 0; i < imax; ++i) ISDestroy(&is[i]);
636: /* Do Local work */
637: #if defined(PETSC_USE_CTABLE)
638: MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data);
639: #else
640: MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL);
641: #endif
643: /* Receive messages */
644: PetscMalloc1(nrqr, &recv_status);
645: MPI_Waitall(nrqr, r_waits1, recv_status);
646: MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE);
648: /* Phase 1 sends are complete - deallocate buffers */
649: PetscFree4(outdat, ptr, tmp, ctr);
650: PetscFree4(w1, w2, w3, w4);
652: PetscMalloc1(nrqr, &xdata);
653: PetscMalloc1(nrqr, &isz1);
654: MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1);
655: PetscFree(rbuf[0]);
656: PetscFree(rbuf);
658: /* Send the data back */
659: /* Do a global reduction to know the buffer space req for incoming messages */
660: {
661: PetscMPIInt *rw1;
663: PetscCalloc1(size, &rw1);
665: for (i = 0; i < nrqr; ++i) {
666: proc = recv_status[i].MPI_SOURCE;
669: rw1[proc] = isz1[i];
670: }
671: PetscFree(onodes1);
672: PetscFree(olengths1);
674:     /* Determine the number of messages to expect, their lengths, and their from-ids */
675: PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2);
676: PetscFree(rw1);
677: }
678: /* Now post the Irecvs corresponding to these messages */
679: PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2);
681: /* Now post the sends */
682: PetscMalloc1(nrqr, &s_waits2);
683: for (i = 0; i < nrqr; ++i) {
684: j = recv_status[i].MPI_SOURCE;
685: MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i);
686: }
688: /* receive work done on other processors */
689: {
690: PetscInt is_no, ct1, max, *rbuf2_i, isz_i, jmax;
691: PetscMPIInt idex;
692: PetscBT table_i;
694: for (i = 0; i < nrqs; ++i) {
695: MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE);
696: /* Process the message */
697: rbuf2_i = rbuf2[idex];
698: ct1 = 2 * rbuf2_i[0] + 1;
699: jmax = rbuf2[idex][0];
700: for (j = 1; j <= jmax; j++) {
701: max = rbuf2_i[2 * j];
702: is_no = rbuf2_i[2 * j - 1];
703: isz_i = isz[is_no];
704: table_i = table[is_no];
705: #if defined(PETSC_USE_CTABLE)
706: table_data_i = table_data[is_no];
707: #else
708: data_i = data[is_no];
709: #endif
710: for (k = 0; k < max; k++, ct1++) {
711: row = rbuf2_i[ct1];
712: if (!PetscBTLookupSet(table_i, row)) {
713: #if defined(PETSC_USE_CTABLE)
714: PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES);
715: #else
716: data_i[isz_i] = row;
717: #endif
718: isz_i++;
719: }
720: }
721: isz[is_no] = isz_i;
722: }
723: }
725: MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE);
726: }
728: #if defined(PETSC_USE_CTABLE)
729: tcount_max = 0;
730: for (i = 0; i < imax; ++i) {
731: table_data_i = table_data[i];
732: PetscTableGetCount(table_data_i, &tcount);
733: if (tcount_max < tcount) tcount_max = tcount;
734: }
735: PetscMalloc1(tcount_max + 1, &tdata);
736: #endif
738: for (i = 0; i < imax; ++i) {
739: #if defined(PETSC_USE_CTABLE)
740: PetscTablePosition tpos;
741: table_data_i = table_data[i];
743: PetscTableGetHeadPosition(table_data_i, &tpos);
744: while (tpos) {
745: PetscTableGetNext(table_data_i, &tpos, &k, &j);
746: tdata[--j] = --k;
747: }
748: ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i);
749: #else
750: ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i);
751: #endif
752: PetscCommDestroy(&iscomms[i]);
753: }
755: PetscFree(iscomms);
756: PetscFree(onodes2);
757: PetscFree(olengths2);
759: PetscFree(pa);
760: PetscFree(rbuf2[0]);
761: PetscFree(rbuf2);
762: PetscFree(s_waits1);
763: PetscFree(r_waits1);
764: PetscFree(s_waits2);
765: PetscFree(r_waits2);
766: PetscFree(recv_status);
767: if (xdata) {
768: PetscFree(xdata[0]);
769: PetscFree(xdata);
770: }
771: PetscFree(isz1);
772: #if defined(PETSC_USE_CTABLE)
773: for (i = 0; i < imax; i++) PetscTableDestroy((PetscTable *)&table_data[i]);
774: PetscFree(table_data);
775: PetscFree(tdata);
776: PetscFree4(table, data, isz, t_p);
777: #else
778: PetscFree5(table, data, isz, d_p, t_p);
779: #endif
780: return 0;
781: }
783: /*
784:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
785: the work on the local processor.
787: Inputs:
788: C - MAT_MPIAIJ;
789: imax - total no of index sets processed at a time;
790: table - an array of char - size = m bits.
792: Output:
793: isz - array containing the count of the solution elements corresponding
794: to each index set;
795: data or table_data - pointer to the solutions
796: */
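/* Annotation: the bit tables used below follow the usual PetscBT idiom; a small
   self-contained sketch (illustration only, values are made up):

     PetscBT  seen;
     PetscInt vals[] = {4, 7, 4, 2}, uniq = 0, i;
     PetscBTCreate(10, &seen);                        -- 10-bit table, initially zero
     for (i = 0; i < 4; i++) {
       if (!PetscBTLookupSet(seen, vals[i])) uniq++;  -- set the bit, return the old value
     }
     PetscBTDestroy(&seen);                           -- here uniq == 3
*/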
797: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscTable *table_data)
798: {
799: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
800: Mat A = c->A, B = c->B;
801: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
802: PetscInt start, end, val, max, rstart, cstart, *ai, *aj;
803: PetscInt *bi, *bj, *garray, i, j, k, row, isz_i;
804: PetscBT table_i;
805: #if defined(PETSC_USE_CTABLE)
806: PetscTable table_data_i;
807: PetscTablePosition tpos;
808: PetscInt tcount, *tdata;
809: #else
810: PetscInt *data_i;
811: #endif
813: rstart = C->rmap->rstart;
814: cstart = C->cmap->rstart;
815: ai = a->i;
816: aj = a->j;
817: bi = b->i;
818: bj = b->j;
819: garray = c->garray;
821: for (i = 0; i < imax; i++) {
822: #if defined(PETSC_USE_CTABLE)
823: /* copy existing entries of table_data_i into tdata[] */
824: table_data_i = table_data[i];
825: PetscTableGetCount(table_data_i, &tcount);
828: PetscMalloc1(tcount, &tdata);
829: PetscTableGetHeadPosition(table_data_i, &tpos);
830: while (tpos) {
831: PetscTableGetNext(table_data_i, &tpos, &row, &j);
832: tdata[--j] = --row;
834: }
835: #else
836: data_i = data[i];
837: #endif
838: table_i = table[i];
839: isz_i = isz[i];
840: max = isz[i];
842: for (j = 0; j < max; j++) {
843: #if defined(PETSC_USE_CTABLE)
844: row = tdata[j] - rstart;
845: #else
846: row = data_i[j] - rstart;
847: #endif
848: start = ai[row];
849: end = ai[row + 1];
850: for (k = start; k < end; k++) { /* Amat */
851: val = aj[k] + cstart;
852: if (!PetscBTLookupSet(table_i, val)) {
853: #if defined(PETSC_USE_CTABLE)
854: PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES);
855: #else
856: data_i[isz_i] = val;
857: #endif
858: isz_i++;
859: }
860: }
861: start = bi[row];
862: end = bi[row + 1];
863: for (k = start; k < end; k++) { /* Bmat */
864: val = garray[bj[k]];
865: if (!PetscBTLookupSet(table_i, val)) {
866: #if defined(PETSC_USE_CTABLE)
867: PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES);
868: #else
869: data_i[isz_i] = val;
870: #endif
871: isz_i++;
872: }
873: }
874: }
875: isz[i] = isz_i;
877: #if defined(PETSC_USE_CTABLE)
878: PetscFree(tdata);
879: #endif
880: }
881: return 0;
882: }
884: /*
885: MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
886: and return the output
888: Input:
889: C - the matrix
890: nrqr - no of messages being processed.
891: rbuf - an array of pointers to the received requests
893: Output:
894: xdata - array of messages to be sent back
895: isz1 - size of each message
897:    For better efficiency perhaps we should malloc each xdata[i] separately;
898:    then if a realloc is required we need only copy the data for that one row
899:    rather than all previous rows, as is done now where a single large chunk of
900:    memory is used.
902: */
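/* Annotation: the buffer-growth idiom used several times below, shown in isolation
   (buf, needed and capacity are placeholder names):

     if (needed >= capacity) {
       PetscInt *bigger, newcap = (PetscInt)(1.5 * capacity) + 1;
       PetscMalloc1(newcap, &bigger);
       PetscArraycpy(bigger, buf, capacity);  -- keep what was already written
       PetscFree(buf);
       buf      = bigger;
       capacity = newcap;
     }
*/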
903: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
904: {
905: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
906: Mat A = c->A, B = c->B;
907: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
908: PetscInt rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
909: PetscInt row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
910: PetscInt val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
911: PetscInt *rbuf_i, kmax, rbuf_0;
912: PetscBT xtable;
914: m = C->rmap->N;
915: rstart = C->rmap->rstart;
916: cstart = C->cmap->rstart;
917: ai = a->i;
918: aj = a->j;
919: bi = b->i;
920: bj = b->j;
921: garray = c->garray;
923: for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
924: rbuf_i = rbuf[i];
925: rbuf_0 = rbuf_i[0];
926: ct += rbuf_0;
927: for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
928: }
930: if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
931: else max1 = 1;
932: mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
933: if (nrqr) {
934: PetscMalloc1(mem_estimate, &xdata[0]);
935: ++no_malloc;
936: }
937: PetscBTCreate(m, &xtable);
938: PetscArrayzero(isz1, nrqr);
940: ct3 = 0;
941:   for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
942: rbuf_i = rbuf[i];
943: rbuf_0 = rbuf_i[0];
944: ct1 = 2 * rbuf_0 + 1;
945: ct2 = ct1;
946: ct3 += ct1;
947: for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
948: PetscBTMemzero(m, xtable);
949: oct2 = ct2;
950: kmax = rbuf_i[2 * j];
951: for (k = 0; k < kmax; k++, ct1++) {
952: row = rbuf_i[ct1];
953: if (!PetscBTLookupSet(xtable, row)) {
954: if (!(ct3 < mem_estimate)) {
955: new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
956: PetscMalloc1(new_estimate, &tmp);
957: PetscArraycpy(tmp, xdata[0], mem_estimate);
958: PetscFree(xdata[0]);
959: xdata[0] = tmp;
960: mem_estimate = new_estimate;
961: ++no_malloc;
962: for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
963: }
964: xdata[i][ct2++] = row;
965: ct3++;
966: }
967: }
968: for (k = oct2, max2 = ct2; k < max2; k++) {
969: row = xdata[i][k] - rstart;
970: start = ai[row];
971: end = ai[row + 1];
972: for (l = start; l < end; l++) {
973: val = aj[l] + cstart;
974: if (!PetscBTLookupSet(xtable, val)) {
975: if (!(ct3 < mem_estimate)) {
976: new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
977: PetscMalloc1(new_estimate, &tmp);
978: PetscArraycpy(tmp, xdata[0], mem_estimate);
979: PetscFree(xdata[0]);
980: xdata[0] = tmp;
981: mem_estimate = new_estimate;
982: ++no_malloc;
983: for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
984: }
985: xdata[i][ct2++] = val;
986: ct3++;
987: }
988: }
989: start = bi[row];
990: end = bi[row + 1];
991: for (l = start; l < end; l++) {
992: val = garray[bj[l]];
993: if (!PetscBTLookupSet(xtable, val)) {
994: if (!(ct3 < mem_estimate)) {
995: new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
996: PetscMalloc1(new_estimate, &tmp);
997: PetscArraycpy(tmp, xdata[0], mem_estimate);
998: PetscFree(xdata[0]);
999: xdata[0] = tmp;
1000: mem_estimate = new_estimate;
1001: ++no_malloc;
1002: for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1003: }
1004: xdata[i][ct2++] = val;
1005: ct3++;
1006: }
1007: }
1008: }
1009: /* Update the header*/
1010: xdata[i][2 * j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1011: xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1012: }
1013: xdata[i][0] = rbuf_0;
1014: if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1015: isz1[i] = ct2; /* size of each message */
1016: }
1017: PetscBTDestroy(&xtable);
1018: PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc);
1019: return 0;
1020: }
1021: /* -------------------------------------------------------------------------*/
1022: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1023: /*
1024: Every processor gets the entire matrix
1025: */
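/* Annotation: a user-level sketch that produces the same kind of result (every
   process holding a sequential copy of the whole matrix) via the public API,
   assuming A is an assembled MATMPIAIJ:

     PetscInt M, N;
     IS       allrows, allcols;
     Mat     *Aseq;
     MatGetSize(A, &M, &N);
     ISCreateStride(PETSC_COMM_SELF, M, 0, 1, &allrows);
     ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &allcols);
     MatCreateSubMatrices(A, 1, &allrows, &allcols, MAT_INITIAL_MATRIX, &Aseq);
     ISDestroy(&allrows);
     ISDestroy(&allcols);
*/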
1026: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
1027: {
1028: Mat B;
1029: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1030: Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1031: PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL;
1032: PetscInt sendcount, i, *rstarts = A->rmap->range, n, cnt, j;
1033: PetscInt m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;
1035: MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1036: MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank);
1037: if (scall == MAT_INITIAL_MATRIX) {
1038: /* ----------------------------------------------------------------
1039: Tell every processor the number of nonzeros per row
1040: */
1041: PetscMalloc1(A->rmap->N, &lens);
1042: for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
1043: PetscMalloc2(size, &recvcounts, size, &displs);
1044: for (i = 0; i < size; i++) {
1045: recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i];
1046: displs[i] = A->rmap->range[i];
1047: }
1048: MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A));
1049: /* ---------------------------------------------------------------
1050: Create the sequential matrix of the same type as the local block diagonal
1051: */
1052: MatCreate(PETSC_COMM_SELF, &B);
1053: MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE);
1054: MatSetBlockSizesFromMats(B, A, A);
1055: MatSetType(B, ((PetscObject)a->A)->type_name);
1056: MatSeqAIJSetPreallocation(B, 0, lens);
1057: PetscCalloc1(2, Bin);
1058: **Bin = B;
1059: b = (Mat_SeqAIJ *)B->data;
1061: /*--------------------------------------------------------------------
1062: Copy my part of matrix column indices over
1063: */
1064: sendcount = ad->nz + bd->nz;
1065: jsendbuf = b->j + b->i[rstarts[rank]];
1066: a_jsendbuf = ad->j;
1067: b_jsendbuf = bd->j;
1068: n = A->rmap->rend - A->rmap->rstart;
1069: cnt = 0;
1070: for (i = 0; i < n; i++) {
1071: /* put in lower diagonal portion */
1072: m = bd->i[i + 1] - bd->i[i];
1073: while (m > 0) {
1074: /* is it above diagonal (in bd (compressed) numbering) */
1075: if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1076: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1077: m--;
1078: }
1080: /* put in diagonal portion */
1081: for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1083: /* put in upper diagonal portion */
1084: while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1085: }
1088: /*--------------------------------------------------------------------
1089: Gather all column indices to all processors
1090: */
1091: for (i = 0; i < size; i++) {
1092: recvcounts[i] = 0;
1093: for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j];
1094: }
1095: displs[0] = 0;
1096: for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1097: MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A));
1098: /*--------------------------------------------------------------------
1099:        Assemble the matrix into usable form (note numerical values not yet set)
1100: */
1101: /* set the b->ilen (length of each row) values */
1102: PetscArraycpy(b->ilen, lens, A->rmap->N);
1103: /* set the b->i indices */
1104: b->i[0] = 0;
1105: for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1106: PetscFree(lens);
1107: MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
1108: MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
1110: } else {
1111: B = **Bin;
1112: b = (Mat_SeqAIJ *)B->data;
1113: }
1115: /*--------------------------------------------------------------------
1116: Copy my part of matrix numerical values into the values location
1117: */
1118: if (flag == MAT_GET_VALUES) {
1119: const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1120: MatScalar *sendbuf, *recvbuf;
1122: MatSeqAIJGetArrayRead(a->A, &ada);
1123: MatSeqAIJGetArrayRead(a->B, &bda);
1124: sendcount = ad->nz + bd->nz;
1125: sendbuf = b->a + b->i[rstarts[rank]];
1126: a_sendbuf = ada;
1127: b_sendbuf = bda;
1128: b_sendj = bd->j;
1129: n = A->rmap->rend - A->rmap->rstart;
1130: cnt = 0;
1131: for (i = 0; i < n; i++) {
1132: /* put in lower diagonal portion */
1133: m = bd->i[i + 1] - bd->i[i];
1134: while (m > 0) {
1135: /* is it above diagonal (in bd (compressed) numbering) */
1136: if (garray[*b_sendj] > A->rmap->rstart + i) break;
1137: sendbuf[cnt++] = *b_sendbuf++;
1138: m--;
1139: b_sendj++;
1140: }
1142: /* put in diagonal portion */
1143: for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;
1145: /* put in upper diagonal portion */
1146: while (m-- > 0) {
1147: sendbuf[cnt++] = *b_sendbuf++;
1148: b_sendj++;
1149: }
1150: }
1153: /* -----------------------------------------------------------------
1154: Gather all numerical values to all processors
1155: */
1156: if (!recvcounts) PetscMalloc2(size, &recvcounts, size, &displs);
1157: for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]];
1158: displs[0] = 0;
1159: for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1160: recvbuf = b->a;
1161: MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A));
1162: MatSeqAIJRestoreArrayRead(a->A, &ada);
1163: MatSeqAIJRestoreArrayRead(a->B, &bda);
1164:   } /* end of (flag == MAT_GET_VALUES) */
1165: PetscFree2(recvcounts, displs);
1167: MatPropagateSymmetryOptions(A, B);
1168: return 0;
1169: }
1171: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1172: {
1173: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
1174: Mat submat, A = c->A, B = c->B;
1175: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1176: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1177: PetscInt cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1178: const PetscInt *icol, *irow;
1179: PetscInt nrow, ncol, start;
1180: PetscMPIInt rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
1181: PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
1182: PetscInt nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
1183: PetscInt **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
1184: PetscInt *lens, rmax, ncols, *cols, Crow;
1185: #if defined(PETSC_USE_CTABLE)
1186: PetscTable cmap, rmap;
1187: PetscInt *cmap_loc, *rmap_loc;
1188: #else
1189: PetscInt *cmap, *rmap;
1190: #endif
1191: PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
1192: PetscInt *cworkB, lwrite, *subcols, *row2proc;
1193: PetscScalar *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1194: MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1195: MPI_Request *r_waits4, *s_waits3 = NULL, *s_waits4;
1196: MPI_Status *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1197: MPI_Status *r_status3 = NULL, *r_status4, *s_status4;
1198: MPI_Comm comm;
1199: PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1200: PetscMPIInt *onodes1, *olengths1, idex, end;
1201: Mat_SubSppt *smatis1;
1202: PetscBool isrowsorted, iscolsorted;
1207: MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a);
1208: MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a);
1209: PetscObjectGetComm((PetscObject)C, &comm);
1210: size = c->size;
1211: rank = c->rank;
1213: ISSorted(iscol[0], &iscolsorted);
1214: ISSorted(isrow[0], &isrowsorted);
1215: ISGetIndices(isrow[0], &irow);
1216: ISGetLocalSize(isrow[0], &nrow);
1217: if (allcolumns) {
1218: icol = NULL;
1219: ncol = C->cmap->N;
1220: } else {
1221: ISGetIndices(iscol[0], &icol);
1222: ISGetLocalSize(iscol[0], &ncol);
1223: }
1225: if (scall == MAT_INITIAL_MATRIX) {
1226: PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;
1228: /* Get some new tags to keep the communication clean */
1229: tag1 = ((PetscObject)C)->tag;
1230: PetscObjectGetNewTag((PetscObject)C, &tag2);
1231: PetscObjectGetNewTag((PetscObject)C, &tag3);
1233: /* evaluate communication - mesg to who, length of mesg, and buffer space
1234: required. Based on this, buffers are allocated, and data copied into them */
1235: PetscCalloc2(size, &w1, size, &w2);
1236: PetscMalloc1(nrow, &row2proc);
1238: /* w1[proc] = num of rows owned by proc -- to be requested */
1239: proc = 0;
1240: nrqs = 0; /* num of outgoing messages */
1241: for (j = 0; j < nrow; j++) {
1242: row = irow[j];
1243: if (!isrowsorted) proc = 0;
1244: while (row >= C->rmap->range[proc + 1]) proc++;
1245: w1[proc]++;
1246: row2proc[j] = proc; /* map row index to proc */
1248: if (proc != rank && !w2[proc]) {
1249: w2[proc] = 1;
1250: nrqs++;
1251: }
1252: }
1253: w1[rank] = 0; /* rows owned by self will not be requested */
1255:     PetscMalloc1(nrqs, &pa); /* (proc-array) */
1256: for (proc = 0, j = 0; proc < size; proc++) {
1257: if (w1[proc]) pa[j++] = proc;
1258: }
1260:     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1261: msz = 0; /* total mesg length (for all procs) */
1262: for (i = 0; i < nrqs; i++) {
1263: proc = pa[i];
1264: w1[proc] += 3;
1265: msz += w1[proc];
1266: }
1267: PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz);
1269:     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1270: /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1271: PetscGatherNumberOfMessages(comm, w2, w1, &nrqr);
1273: /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1274: Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1275: PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1);
1277: /* Now post the Irecvs corresponding to these messages */
1278: PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1);
1280: PetscFree(onodes1);
1281: PetscFree(olengths1);
1283: /* Allocate Memory for outgoing messages */
1284: PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr);
1285: PetscArrayzero(sbuf1, size);
1286: PetscArrayzero(ptr, size);
1288: /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
1289: iptr = tmp;
1290: for (i = 0; i < nrqs; i++) {
1291: proc = pa[i];
1292: sbuf1[proc] = iptr;
1293: iptr += w1[proc];
1294: }
1296: /* Form the outgoing messages */
1297: /* Initialize the header space */
1298: for (i = 0; i < nrqs; i++) {
1299: proc = pa[i];
1300: PetscArrayzero(sbuf1[proc], 3);
1301: ptr[proc] = sbuf1[proc] + 3;
1302: }
1304: /* Parse the isrow and copy data into outbuf */
1305: PetscArrayzero(ctr, size);
1306: for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
1307: proc = row2proc[j];
1308: if (proc != rank) { /* copy to the outgoing buf*/
1309: *ptr[proc] = irow[j];
1310: ctr[proc]++;
1311: ptr[proc]++;
1312: }
1313: }
1315: /* Update the headers for the current IS */
1316: for (j = 0; j < size; j++) { /* Can Optimise this loop too */
1317: if ((ctr_j = ctr[j])) {
1318: sbuf1_j = sbuf1[j];
1319: k = ++sbuf1_j[0];
1320: sbuf1_j[2 * k] = ctr_j;
1321: sbuf1_j[2 * k - 1] = 0;
1322: }
1323: }
1325: /* Now post the sends */
1326: PetscMalloc1(nrqs, &s_waits1);
1327: for (i = 0; i < nrqs; ++i) {
1328: proc = pa[i];
1329: MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i);
1330: }
1332: /* Post Receives to capture the buffer size */
1333: PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2);
1334: PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3);
1336: if (nrqs) rbuf2[0] = tmp + msz;
1337: for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
1339: for (i = 0; i < nrqs; ++i) {
1340: proc = pa[i];
1341: MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i);
1342: }
1344: PetscFree2(w1, w2);
1346: /* Send to other procs the buf size they should allocate */
1347: /* Receive messages*/
1348: PetscMalloc1(nrqr, &r_status1);
1349: PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1);
1351: MPI_Waitall(nrqr, r_waits1, r_status1);
1352: for (i = 0; i < nrqr; ++i) {
1353: req_size[i] = 0;
1354: rbuf1_i = rbuf1[i];
1355: start = 2 * rbuf1_i[0] + 1;
1356: MPI_Get_count(r_status1 + i, MPIU_INT, &end);
1357: PetscMalloc1(end, &sbuf2[i]);
1358: sbuf2_i = sbuf2[i];
1359: for (j = start; j < end; j++) {
1360: k = rbuf1_i[j] - rstart;
1361: ncols = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1362: sbuf2_i[j] = ncols;
1363: req_size[i] += ncols;
1364: }
1365: req_source1[i] = r_status1[i].MPI_SOURCE;
1367: /* form the header */
1368: sbuf2_i[0] = req_size[i];
1369: for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
1371: MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i);
1372: }
1374: PetscFree(r_status1);
1375: PetscFree(r_waits1);
1377: /* rbuf2 is received, Post recv column indices a->j */
1378: MPI_Waitall(nrqs, r_waits2, r_status2);
1380: PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3);
1381: for (i = 0; i < nrqs; ++i) {
1382: PetscMalloc1(rbuf2[i][0], &rbuf3[i]);
1383: req_source2[i] = r_status2[i].MPI_SOURCE;
1384: MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i);
1385: }
1387: /* Wait on sends1 and sends2 */
1388: PetscMalloc1(nrqs, &s_status1);
1389: MPI_Waitall(nrqs, s_waits1, s_status1);
1390: PetscFree(s_waits1);
1391: PetscFree(s_status1);
1393: MPI_Waitall(nrqr, s_waits2, s_status2);
1394: PetscFree4(r_status2, s_waits2, r_waits2, s_status2);
1396: /* Now allocate sending buffers for a->j, and send them off */
1397: PetscMalloc1(nrqr, &sbuf_aj);
1398: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1399: if (nrqr) PetscMalloc1(j, &sbuf_aj[0]);
1400: for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
1402: for (i = 0; i < nrqr; i++) { /* for each requested message */
1403: rbuf1_i = rbuf1[i];
1404: sbuf_aj_i = sbuf_aj[i];
1405: ct1 = 2 * rbuf1_i[0] + 1;
1406: ct2 = 0;
1409: kmax = rbuf1[i][2];
1410: for (k = 0; k < kmax; k++, ct1++) { /* for each row */
1411: row = rbuf1_i[ct1] - rstart;
1412: nzA = ai[row + 1] - ai[row];
1413: nzB = bi[row + 1] - bi[row];
1414: ncols = nzA + nzB;
1415: cworkA = aj + ai[row];
1416: cworkB = bj + bi[row];
1418: /* load the column indices for this row into cols*/
1419: cols = sbuf_aj_i + ct2;
1421: lwrite = 0;
1422: for (l = 0; l < nzB; l++) {
1423: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1424: }
1425: for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1426: for (l = 0; l < nzB; l++) {
1427: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1428: }
1430: ct2 += ncols;
1431: }
1432: MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i);
1433: }
1435: /* create column map (cmap): global col of C -> local col of submat */
1436: #if defined(PETSC_USE_CTABLE)
1437: if (!allcolumns) {
1438: PetscTableCreate(ncol, C->cmap->N, &cmap);
1439: PetscCalloc1(C->cmap->n, &cmap_loc);
1440: for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1441: if (icol[j] >= cstart && icol[j] < cend) {
1442: cmap_loc[icol[j] - cstart] = j + 1;
1443: } else { /* use PetscTable for non-local col indices */
1444: PetscTableAdd(cmap, icol[j] + 1, j + 1, INSERT_VALUES);
1445: }
1446: }
1447: } else {
1448: cmap = NULL;
1449: cmap_loc = NULL;
1450: }
1451: PetscCalloc1(C->rmap->n, &rmap_loc);
1452: #else
1453: if (!allcolumns) {
1454: PetscCalloc1(C->cmap->N, &cmap);
1455: for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1456: } else {
1457: cmap = NULL;
1458: }
1459: #endif
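/* Annotation: the maps above store local column indices shifted by one (j + 1) so
   that a lookup result of 0 means "this global column is not in the submatrix";
   the consumers below undo the shift with --tcol before using the value */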
1461: /* Create lens for MatSeqAIJSetPreallocation() */
1462: PetscCalloc1(nrow, &lens);
1464: /* Compute lens from local part of C */
1465: for (j = 0; j < nrow; j++) {
1466: row = irow[j];
1467: proc = row2proc[j];
1468: if (proc == rank) {
1469: /* diagonal part A = c->A */
1470: ncols = ai[row - rstart + 1] - ai[row - rstart];
1471: cols = aj + ai[row - rstart];
1472: if (!allcolumns) {
1473: for (k = 0; k < ncols; k++) {
1474: #if defined(PETSC_USE_CTABLE)
1475: tcol = cmap_loc[cols[k]];
1476: #else
1477: tcol = cmap[cols[k] + cstart];
1478: #endif
1479: if (tcol) lens[j]++;
1480: }
1481: } else { /* allcolumns */
1482: lens[j] = ncols;
1483: }
1485: /* off-diagonal part B = c->B */
1486: ncols = bi[row - rstart + 1] - bi[row - rstart];
1487: cols = bj + bi[row - rstart];
1488: if (!allcolumns) {
1489: for (k = 0; k < ncols; k++) {
1490: #if defined(PETSC_USE_CTABLE)
1491: PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol);
1492: #else
1493: tcol = cmap[bmap[cols[k]]];
1494: #endif
1495: if (tcol) lens[j]++;
1496: }
1497: } else { /* allcolumns */
1498: lens[j] += ncols;
1499: }
1500: }
1501: }
1503: /* Create row map (rmap): global row of C -> local row of submat */
1504: #if defined(PETSC_USE_CTABLE)
1505: PetscTableCreate(nrow, C->rmap->N, &rmap);
1506: for (j = 0; j < nrow; j++) {
1507: row = irow[j];
1508: proc = row2proc[j];
1509: if (proc == rank) { /* a local row */
1510: rmap_loc[row - rstart] = j;
1511: } else {
1512: PetscTableAdd(rmap, irow[j] + 1, j + 1, INSERT_VALUES);
1513: }
1514: }
1515: #else
1516: PetscCalloc1(C->rmap->N, &rmap);
1517: for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
1518: #endif
1520: /* Update lens from offproc data */
1521: /* recv a->j is done */
1522: MPI_Waitall(nrqs, r_waits3, r_status3);
1523: for (i = 0; i < nrqs; i++) {
1524: proc = pa[i];
1525: sbuf1_i = sbuf1[proc];
1527: ct1 = 2 + 1;
1528: ct2 = 0;
1529: rbuf2_i = rbuf2[i]; /* received length of C->j */
1530: rbuf3_i = rbuf3[i]; /* received C->j */
1533: max1 = sbuf1_i[2];
1534: for (k = 0; k < max1; k++, ct1++) {
1535: #if defined(PETSC_USE_CTABLE)
1536: PetscTableFind(rmap, sbuf1_i[ct1] + 1, &row);
1537: row--;
1539: #else
1540: row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1541: #endif
1542: /* Now, store row index of submat in sbuf1_i[ct1] */
1543: sbuf1_i[ct1] = row;
1545: nnz = rbuf2_i[ct1];
1546: if (!allcolumns) {
1547: for (l = 0; l < nnz; l++, ct2++) {
1548: #if defined(PETSC_USE_CTABLE)
1549: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1550: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1551: } else {
1552: PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol);
1553: }
1554: #else
1555: tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1556: #endif
1557: if (tcol) lens[row]++;
1558: }
1559: } else { /* allcolumns */
1560: lens[row] += nnz;
1561: }
1562: }
1563: }
1564: MPI_Waitall(nrqr, s_waits3, s_status3);
1565: PetscFree4(r_waits3, s_waits3, r_status3, s_status3);
1567: /* Create the submatrices */
1568: MatCreate(PETSC_COMM_SELF, &submat);
1569: MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE);
1571: ISGetBlockSize(isrow[0], &i);
1572: ISGetBlockSize(iscol[0], &j);
1573: MatSetBlockSizes(submat, i, j);
1574: MatSetType(submat, ((PetscObject)A)->type_name);
1575: MatSeqAIJSetPreallocation(submat, 0, lens);
1577:     /* create struct Mat_SubSppt and attach it to submat */
1578: PetscNew(&smatis1);
1579: subc = (Mat_SeqAIJ *)submat->data;
1580: subc->submatis1 = smatis1;
1582: smatis1->id = 0;
1583: smatis1->nrqs = nrqs;
1584: smatis1->nrqr = nrqr;
1585: smatis1->rbuf1 = rbuf1;
1586: smatis1->rbuf2 = rbuf2;
1587: smatis1->rbuf3 = rbuf3;
1588: smatis1->sbuf2 = sbuf2;
1589: smatis1->req_source2 = req_source2;
1591: smatis1->sbuf1 = sbuf1;
1592: smatis1->ptr = ptr;
1593: smatis1->tmp = tmp;
1594: smatis1->ctr = ctr;
1596: smatis1->pa = pa;
1597: smatis1->req_size = req_size;
1598: smatis1->req_source1 = req_source1;
1600: smatis1->allcolumns = allcolumns;
1601: smatis1->singleis = PETSC_TRUE;
1602: smatis1->row2proc = row2proc;
1603: smatis1->rmap = rmap;
1604: smatis1->cmap = cmap;
1605: #if defined(PETSC_USE_CTABLE)
1606: smatis1->rmap_loc = rmap_loc;
1607: smatis1->cmap_loc = cmap_loc;
1608: #endif
1610: smatis1->destroy = submat->ops->destroy;
1611: submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1612: submat->factortype = C->factortype;
1614: /* compute rmax */
1615: rmax = 0;
1616: for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
1618: } else { /* scall == MAT_REUSE_MATRIX */
1619: submat = submats[0];
1622: subc = (Mat_SeqAIJ *)submat->data;
1623: rmax = subc->rmax;
1624: smatis1 = subc->submatis1;
1625: nrqs = smatis1->nrqs;
1626: nrqr = smatis1->nrqr;
1627: rbuf1 = smatis1->rbuf1;
1628: rbuf2 = smatis1->rbuf2;
1629: rbuf3 = smatis1->rbuf3;
1630: req_source2 = smatis1->req_source2;
1632: sbuf1 = smatis1->sbuf1;
1633: sbuf2 = smatis1->sbuf2;
1634: ptr = smatis1->ptr;
1635: tmp = smatis1->tmp;
1636: ctr = smatis1->ctr;
1638: pa = smatis1->pa;
1639: req_size = smatis1->req_size;
1640: req_source1 = smatis1->req_source1;
1642: allcolumns = smatis1->allcolumns;
1643: row2proc = smatis1->row2proc;
1644: rmap = smatis1->rmap;
1645: cmap = smatis1->cmap;
1646: #if defined(PETSC_USE_CTABLE)
1647: rmap_loc = smatis1->rmap_loc;
1648: cmap_loc = smatis1->cmap_loc;
1649: #endif
1650: }
1652: /* Post recv matrix values */
1653: PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals);
1654: PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4);
1655: PetscObjectGetNewTag((PetscObject)C, &tag4);
1656: for (i = 0; i < nrqs; ++i) {
1657: PetscMalloc1(rbuf2[i][0], &rbuf4[i]);
1658: MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i);
1659: }
1661: /* Allocate sending buffers for a->a, and send them off */
1662: PetscMalloc1(nrqr, &sbuf_aa);
1663: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1664: if (nrqr) PetscMalloc1(j, &sbuf_aa[0]);
1665: for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
1667: for (i = 0; i < nrqr; i++) {
1668: rbuf1_i = rbuf1[i];
1669: sbuf_aa_i = sbuf_aa[i];
1670: ct1 = 2 * rbuf1_i[0] + 1;
1671: ct2 = 0;
1674: kmax = rbuf1_i[2];
1675: for (k = 0; k < kmax; k++, ct1++) {
1676: row = rbuf1_i[ct1] - rstart;
1677: nzA = ai[row + 1] - ai[row];
1678: nzB = bi[row + 1] - bi[row];
1679: ncols = nzA + nzB;
1680: cworkB = bj + bi[row];
1681: vworkA = a_a + ai[row];
1682: vworkB = b_a + bi[row];
1684: /* load the column values for this row into vals*/
1685: vals = sbuf_aa_i + ct2;
1687: lwrite = 0;
1688: for (l = 0; l < nzB; l++) {
1689: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1690: }
1691: for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1692: for (l = 0; l < nzB; l++) {
1693: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1694: }
1696: ct2 += ncols;
1697: }
1698: MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i);
1699: }
1701: /* Assemble submat */
1702: /* First assemble the local rows */
1703: for (j = 0; j < nrow; j++) {
1704: row = irow[j];
1705: proc = row2proc[j];
1706: if (proc == rank) {
1707: Crow = row - rstart; /* local row index of C */
1708: #if defined(PETSC_USE_CTABLE)
1709: row = rmap_loc[Crow]; /* row index of submat */
1710: #else
1711: row = rmap[row];
1712: #endif
1714: if (allcolumns) {
1715: /* diagonal part A = c->A */
1716: ncols = ai[Crow + 1] - ai[Crow];
1717: cols = aj + ai[Crow];
1718: vals = a_a + ai[Crow];
1719: i = 0;
1720: for (k = 0; k < ncols; k++) {
1721: subcols[i] = cols[k] + cstart;
1722: subvals[i++] = vals[k];
1723: }
1725: /* off-diagonal part B = c->B */
1726: ncols = bi[Crow + 1] - bi[Crow];
1727: cols = bj + bi[Crow];
1728: vals = b_a + bi[Crow];
1729: for (k = 0; k < ncols; k++) {
1730: subcols[i] = bmap[cols[k]];
1731: subvals[i++] = vals[k];
1732: }
1734: MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES);
1736: } else { /* !allcolumns */
1737: #if defined(PETSC_USE_CTABLE)
1738: /* diagonal part A = c->A */
1739: ncols = ai[Crow + 1] - ai[Crow];
1740: cols = aj + ai[Crow];
1741: vals = a_a + ai[Crow];
1742: i = 0;
1743: for (k = 0; k < ncols; k++) {
1744: tcol = cmap_loc[cols[k]];
1745: if (tcol) {
1746: subcols[i] = --tcol;
1747: subvals[i++] = vals[k];
1748: }
1749: }
1751: /* off-diagonal part B = c->B */
1752: ncols = bi[Crow + 1] - bi[Crow];
1753: cols = bj + bi[Crow];
1754: vals = b_a + bi[Crow];
1755: for (k = 0; k < ncols; k++) {
1756: PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol);
1757: if (tcol) {
1758: subcols[i] = --tcol;
1759: subvals[i++] = vals[k];
1760: }
1761: }
1762: #else
1763: /* diagonal part A = c->A */
1764: ncols = ai[Crow + 1] - ai[Crow];
1765: cols = aj + ai[Crow];
1766: vals = a_a + ai[Crow];
1767: i = 0;
1768: for (k = 0; k < ncols; k++) {
1769: tcol = cmap[cols[k] + cstart];
1770: if (tcol) {
1771: subcols[i] = --tcol;
1772: subvals[i++] = vals[k];
1773: }
1774: }
1776: /* off-diagonal part B = c->B */
1777: ncols = bi[Crow + 1] - bi[Crow];
1778: cols = bj + bi[Crow];
1779: vals = b_a + bi[Crow];
1780: for (k = 0; k < ncols; k++) {
1781: tcol = cmap[bmap[cols[k]]];
1782: if (tcol) {
1783: subcols[i] = --tcol;
1784: subvals[i++] = vals[k];
1785: }
1786: }
1787: #endif
1788: MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES);
1789: }
1790: }
1791: }
1793: /* Now assemble the off-proc rows */
1794: for (i = 0; i < nrqs; i++) { /* for each requested message */
1795: /* recv values from other processes */
1796: MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i);
1797: proc = pa[idex];
1798: sbuf1_i = sbuf1[proc];
1800: ct1 = 2 + 1;
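    /* the request header for a single IS occupies sbuf1_i[0..2] (number of ISs, IS id, row count;
       sbuf1_i[2] is read as the row count below), so the requested row indices start at index 3 */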
1801: ct2 = 0; /* count of received C->j */
1802: ct3 = 0; /* count of received C->j that will be inserted into submat */
1803:     rbuf2_i = rbuf2[idex]; /* lengths of C->j received from process pa[idex] */
1804:     rbuf3_i = rbuf3[idex]; /* column indices C->j received from process pa[idex] */
1805:     rbuf4_i = rbuf4[idex]; /* values C->a received from process pa[idex] */
1808: max1 = sbuf1_i[2]; /* num of rows */
1809:     for (k = 0; k < max1; k++, ct1++) { /* for each received row */
1810: row = sbuf1_i[ct1]; /* row index of submat */
1811: if (!allcolumns) {
1812: idex = 0;
1813: if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1814: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1815:           for (l = 0; l < nnz; l++, ct2++) { /* for each received column */
1816: #if defined(PETSC_USE_CTABLE)
1817: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1818: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1819: } else {
1820: PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol);
1821: }
1822: #else
1823: tcol = cmap[rbuf3_i[ct2]];
1824: #endif
1825: if (tcol) {
1826: subcols[idex] = --tcol; /* may not be sorted */
1827: subvals[idex++] = rbuf4_i[ct2];
1829:               /* We receive the column indices of an entire row of C, but only a subset of them is inserted into submat.
1830:                For reuse, we overwrite the received C->j entries with the offsets of the values that will be inserted into submat */
1831: if (iscolsorted) rbuf3_i[ct3++] = ct2;
1832: }
1833: }
1834: MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES);
1835:         } else { /* scall == MAT_REUSE_MATRIX && iscolsorted */
1836: submat = submats[0];
1837: subc = (Mat_SeqAIJ *)submat->data;
1839: nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1840: for (l = 0; l < nnz; l++) {
1841: ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1842: subvals[idex++] = rbuf4_i[ct2];
1843: }
1845: bj = subc->j + subc->i[row]; /* sorted column indices */
1846: MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES);
1847: }
1848: } else { /* allcolumns */
1849: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1850: MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES);
1851: ct2 += nnz;
1852: }
1853: }
1854: }
1856:   /* wait until the sends of a->a are complete */
1857: MPI_Waitall(nrqr, s_waits4, s_status4);
1858: PetscFree4(r_waits4, s_waits4, r_status4, s_status4);
1860: MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY);
1861: MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY);
1862: submats[0] = submat;
1864: /* Restore the indices */
1865: ISRestoreIndices(isrow[0], &irow);
1866: if (!allcolumns) ISRestoreIndices(iscol[0], &icol);
1868: /* Destroy allocated memory */
1869: for (i = 0; i < nrqs; ++i) PetscFree(rbuf4[i]);
1870: PetscFree3(rbuf4, subcols, subvals);
1871: if (sbuf_aa) {
1872: PetscFree(sbuf_aa[0]);
1873: PetscFree(sbuf_aa);
1874: }
1876: if (scall == MAT_INITIAL_MATRIX) {
1877: PetscFree(lens);
1878: if (sbuf_aj) {
1879: PetscFree(sbuf_aj[0]);
1880: PetscFree(sbuf_aj);
1881: }
1882: }
1883: MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a);
1884: MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a);
1885: return 0;
1886: }
1888: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1889: {
1890: PetscInt ncol;
1891: PetscBool colflag, allcolumns = PETSC_FALSE;
1893: /* Allocate memory to hold all the submatrices */
1894: if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(2, submat);
1896: /* Check for special case: each processor gets entire matrix columns */
1897: ISIdentity(iscol[0], &colflag);
1898: ISGetLocalSize(iscol[0], &ncol);
1899: if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1901: MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat);
1902: return 0;
1903: }
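/*
   A minimal calling sketch (not part of this file) showing how the routine below is typically
   reached: the public MatCreateSubMatrices() dispatches here for MPIAIJ matrices. The arrays
   nrows/rows/ncols/cols are caller-chosen placeholders, not symbols defined in this file, and
   error checking is omitted.

     IS  rowis, colis;
     Mat *subs;

     ISCreateGeneral(PETSC_COMM_SELF, nrows, rows, PETSC_COPY_VALUES, &rowis);
     ISCreateGeneral(PETSC_COMM_SELF, ncols, cols, PETSC_COPY_VALUES, &colis);
     MatCreateSubMatrices(C, 1, &rowis, &colis, MAT_INITIAL_MATRIX, &subs);
     ... use subs[0], a sequential AIJ matrix holding the requested rows/columns ...
     MatDestroySubMatrices(1, &subs);
     ISDestroy(&rowis);
     ISDestroy(&colis);
*/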
1905: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1906: {
1907: PetscInt nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
1908: PetscBool rowflag, colflag, wantallmatrix = PETSC_FALSE;
1909: Mat_SeqAIJ *subc;
1910: Mat_SubSppt *smat;
1912: /* Check for special case: each processor has a single IS */
1913: if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1914: MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat);
1915:     C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with non-single IS */
1916: return 0;
1917: }
1919:   /* Determine nmax, the maximum number of submatrices extracted per stage */
1920: if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
1921: else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
1922: if (!nmax) nmax = 1;
1924: if (scall == MAT_INITIAL_MATRIX) {
1925: /* Collect global wantallmatrix and nstages */
1926: if (ismax == 1 && C->rmap->N == C->cmap->N) {
1927: ISIdentity(*isrow, &rowflag);
1928: ISIdentity(*iscol, &colflag);
1929: ISGetLocalSize(*isrow, &nrow);
1930: ISGetLocalSize(*iscol, &ncol);
1931: if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1932: wantallmatrix = PETSC_TRUE;
1934: PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL);
1935: }
1936: }
1938: /* Determine the number of stages through which submatrices are done
1939: Each stage will extract nmax submatrices.
1940: nmax is determined by the matrix column dimension.
1941: If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1942: */
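    /* For example, with 8-byte PetscInt and C->cmap->N = 1,000,000, nmax = 20e6/(1e6*8) = 2;
       a rank holding ismax = 5 index sets then gets a local nstages of 5/2 + 1 = 3, and the
       MPI_MAX reduction below makes every rank loop over the largest count. */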
1943: nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
1945: in[0] = -1 * (PetscInt)wantallmatrix;
1946: in[1] = nstages;
1947: MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C));
1948: wantallmatrix = (PetscBool)(-out[0]);
1949: nstages = out[1]; /* Make sure every processor loops through the global nstages */
1951: } else { /* MAT_REUSE_MATRIX */
1952: if (ismax) {
1953: subc = (Mat_SeqAIJ *)(*submat)[0]->data;
1954: smat = subc->submatis1;
1955: } else { /* (*submat)[0] is a dummy matrix */
1956: smat = (Mat_SubSppt *)(*submat)[0]->data;
1957: }
1958: if (!smat) {
1959: /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
1960: wantallmatrix = PETSC_TRUE;
1961: } else if (smat->singleis) {
1962: MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat);
1963: return 0;
1964: } else {
1965: nstages = smat->nstages;
1966: }
1967: }
1969: if (wantallmatrix) {
1970: MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat);
1971: return 0;
1972: }
1974: /* Allocate memory to hold all the submatrices and dummy submatrices */
1975: if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(ismax + nstages, submat);
1977: for (i = 0, pos = 0; i < nstages; i++) {
1978: if (pos + nmax <= ismax) max_no = nmax;
1979: else if (pos >= ismax) max_no = 0;
1980: else max_no = ismax - pos;
1982: MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos);
1983: if (!max_no) {
1984: if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
1985: smat = (Mat_SubSppt *)(*submat)[pos]->data;
1986: smat->nstages = nstages;
1987: }
1988: pos++; /* advance to next dummy matrix if any */
1989: } else pos += max_no;
1990: }
1992: if (ismax && scall == MAT_INITIAL_MATRIX) {
1993: /* save nstages for reuse */
1994: subc = (Mat_SeqAIJ *)(*submat)[0]->data;
1995: smat = subc->submatis1;
1996: smat->nstages = nstages;
1997: }
1998: return 0;
1999: }
2001: /* -------------------------------------------------------------------------*/
2002: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2003: {
2004: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
2005: Mat A = c->A;
2006: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2007: const PetscInt **icol, **irow;
2008: PetscInt *nrow, *ncol, start;
2009: PetscMPIInt rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
2010: PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
2011: PetscInt nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
2012: PetscInt **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2013: PetscInt **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2014: #if defined(PETSC_USE_CTABLE)
2015: PetscTable *cmap, cmap_i = NULL, *rmap, rmap_i;
2016: #else
2017: PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2018: #endif
2019: const PetscInt *irow_i;
2020: PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2021: MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2022: MPI_Request *r_waits4, *s_waits3, *s_waits4;
2023: MPI_Comm comm;
2024: PetscScalar **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2025: PetscMPIInt *onodes1, *olengths1, end;
2026: PetscInt **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2027: Mat_SubSppt *smat_i;
2028: PetscBool *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2029: PetscInt *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;
2031: PetscObjectGetComm((PetscObject)C, &comm);
2032: size = c->size;
2033: rank = c->rank;
2035: PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns);
2036: PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted);
2038: for (i = 0; i < ismax; i++) {
2039: ISSorted(iscol[i], &issorted[i]);
2040: if (!issorted[i]) iscsorted = issorted[i];
2042: ISSorted(isrow[i], &issorted[i]);
2044: ISGetIndices(isrow[i], &irow[i]);
2045: ISGetLocalSize(isrow[i], &nrow[i]);
2047: /* Check for special case: allcolumn */
2048: ISIdentity(iscol[i], &colflag);
2049: ISGetLocalSize(iscol[i], &ncol[i]);
2050: if (colflag && ncol[i] == C->cmap->N) {
2051: allcolumns[i] = PETSC_TRUE;
2052: icol[i] = NULL;
2053: } else {
2054: allcolumns[i] = PETSC_FALSE;
2055: ISGetIndices(iscol[i], &icol[i]);
2056: }
2057: }
2059: if (scall == MAT_REUSE_MATRIX) {
2060:     /* Assumes the new rows have the same lengths as the old rows */
2061: for (i = 0; i < ismax; i++) {
2063: subc = (Mat_SeqAIJ *)submats[i]->data;
2066:       /* Reinitialize the matrix as if it were empty */
2067: PetscArrayzero(subc->ilen, submats[i]->rmap->n);
2069: smat_i = subc->submatis1;
2071: nrqs = smat_i->nrqs;
2072: nrqr = smat_i->nrqr;
2073: rbuf1 = smat_i->rbuf1;
2074: rbuf2 = smat_i->rbuf2;
2075: rbuf3 = smat_i->rbuf3;
2076: req_source2 = smat_i->req_source2;
2078: sbuf1 = smat_i->sbuf1;
2079: sbuf2 = smat_i->sbuf2;
2080: ptr = smat_i->ptr;
2081: tmp = smat_i->tmp;
2082: ctr = smat_i->ctr;
2084: pa = smat_i->pa;
2085: req_size = smat_i->req_size;
2086: req_source1 = smat_i->req_source1;
2088: allcolumns[i] = smat_i->allcolumns;
2089: row2proc[i] = smat_i->row2proc;
2090: rmap[i] = smat_i->rmap;
2091: cmap[i] = smat_i->cmap;
2092: }
2094: if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2096: smat_i = (Mat_SubSppt *)submats[0]->data;
2098: nrqs = smat_i->nrqs;
2099: nrqr = smat_i->nrqr;
2100: rbuf1 = smat_i->rbuf1;
2101: rbuf2 = smat_i->rbuf2;
2102: rbuf3 = smat_i->rbuf3;
2103: req_source2 = smat_i->req_source2;
2105: sbuf1 = smat_i->sbuf1;
2106: sbuf2 = smat_i->sbuf2;
2107: ptr = smat_i->ptr;
2108: tmp = smat_i->tmp;
2109: ctr = smat_i->ctr;
2111: pa = smat_i->pa;
2112: req_size = smat_i->req_size;
2113: req_source1 = smat_i->req_source1;
2115: allcolumns[0] = PETSC_FALSE;
2116: }
2117: } else { /* scall == MAT_INITIAL_MATRIX */
2118: /* Get some new tags to keep the communication clean */
2119: PetscObjectGetNewTag((PetscObject)C, &tag2);
2120: PetscObjectGetNewTag((PetscObject)C, &tag3);
2122:     /* evaluate communication - messages to whom, message lengths, and buffer space
2123:      required. Based on this, buffers are allocated, and data copied into them */
2124:     PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4); /* message sizes; initialize work vectors */
2126: for (i = 0; i < ismax; i++) {
2127: jmax = nrow[i];
2128: irow_i = irow[i];
2130: PetscMalloc1(jmax, &row2proc_i);
2131: row2proc[i] = row2proc_i;
2133: if (issorted[i]) proc = 0;
2134: for (j = 0; j < jmax; j++) {
2135: if (!issorted[i]) proc = 0;
2136: row = irow_i[j];
2137: while (row >= C->rmap->range[proc + 1]) proc++;
2138: w4[proc]++;
2139: row2proc_i[j] = proc; /* map row index to proc */
2140: }
2141: for (j = 0; j < size; j++) {
2142: if (w4[j]) {
2143: w1[j] += w4[j];
2144: w3[j]++;
2145: w4[j] = 0;
2146: }
2147: }
2148: }
2150:     nrqs     = 0; /* number of outgoing messages */
2151:     msz      = 0; /* total message length (over all procs) */
2152:     w1[rank] = 0; /* no message sent to self */
2153: w3[rank] = 0;
2154: for (i = 0; i < size; i++) {
2155: if (w1[i]) {
2156: w2[i] = 1;
2157: nrqs++;
2158: } /* there exists a message to proc i */
2159: }
2160:     PetscMalloc1(nrqs, &pa); /* pa[] lists the ranks we send requests to */
2161: for (i = 0, j = 0; i < size; i++) {
2162: if (w1[i]) {
2163: pa[j] = i;
2164: j++;
2165: }
2166: }
2168: /* Each message would have a header = 1 + 2*(no of IS) + data */
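    /* Layout of each outgoing request buffer sbuf1[j] (total length w1[j]):
         sbuf1[j][0]                      number of index sets that request rows owned by process j
         sbuf1[j][2*k-1], sbuf1[j][2*k]   IS id and number of requested rows for the k-th such set
         sbuf1[j][2*w3[j]+1 ..]           the requested global row indices, grouped by IS */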
2169: for (i = 0; i < nrqs; i++) {
2170: j = pa[i];
2171: w1[j] += w2[j] + 2 * w3[j];
2172: msz += w1[j];
2173: }
2174: PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz);
2176:     /* Determine the number of messages to expect, their lengths, and the sending ranks (from-ids) */
2177: PetscGatherNumberOfMessages(comm, w2, w1, &nrqr);
2178: PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1);
2180: /* Now post the Irecvs corresponding to these messages */
2181: PetscObjectGetNewTag((PetscObject)C, &tag0);
2182: PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1);
2184: /* Allocate Memory for outgoing messages */
2185: PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr);
2186: PetscArrayzero(sbuf1, size);
2187: PetscArrayzero(ptr, size);
2189: {
2190: PetscInt *iptr = tmp;
2191: k = 0;
2192: for (i = 0; i < nrqs; i++) {
2193: j = pa[i];
2194: iptr += k;
2195: sbuf1[j] = iptr;
2196: k = w1[j];
2197: }
2198: }
2200: /* Form the outgoing messages. Initialize the header space */
2201: for (i = 0; i < nrqs; i++) {
2202: j = pa[i];
2203: sbuf1[j][0] = 0;
2204: PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]);
2205: ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2206: }
2208: /* Parse the isrow and copy data into outbuf */
2209: for (i = 0; i < ismax; i++) {
2210: row2proc_i = row2proc[i];
2211: PetscArrayzero(ctr, size);
2212: irow_i = irow[i];
2213: jmax = nrow[i];
2214: for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
2215: proc = row2proc_i[j];
2216:         if (proc != rank) { /* copy to the outgoing buf */
2217: ctr[proc]++;
2218: *ptr[proc] = irow_i[j];
2219: ptr[proc]++;
2220: }
2221: }
2222: /* Update the headers for the current IS */
2223: for (j = 0; j < size; j++) { /* Can Optimise this loop too */
2224: if ((ctr_j = ctr[j])) {
2225: sbuf1_j = sbuf1[j];
2226: k = ++sbuf1_j[0];
2227: sbuf1_j[2 * k] = ctr_j;
2228: sbuf1_j[2 * k - 1] = i;
2229: }
2230: }
2231: }
2233: /* Now post the sends */
2234: PetscMalloc1(nrqs, &s_waits1);
2235: for (i = 0; i < nrqs; ++i) {
2236: j = pa[i];
2237: MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i);
2238: }
2240: /* Post Receives to capture the buffer size */
2241: PetscMalloc1(nrqs, &r_waits2);
2242: PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3);
2243: if (nrqs) rbuf2[0] = tmp + msz;
2244: for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2245: for (i = 0; i < nrqs; ++i) {
2246: j = pa[i];
2247: MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i);
2248: }
2250: /* Send to other procs the buf size they should allocate */
2251:     /* Receive messages */
2252: PetscMalloc1(nrqr, &s_waits2);
2253: PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1);
2254: {
2255: PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2256: PetscInt *sbuf2_i;
2258: MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE);
2259: for (i = 0; i < nrqr; ++i) {
2260: req_size[i] = 0;
2261: rbuf1_i = rbuf1[i];
2262: start = 2 * rbuf1_i[0] + 1;
2263: end = olengths1[i];
2264: PetscMalloc1(end, &sbuf2[i]);
2265: sbuf2_i = sbuf2[i];
2266: for (j = start; j < end; j++) {
2267: id = rbuf1_i[j] - rstart;
2268: ncols = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2269: sbuf2_i[j] = ncols;
2270: req_size[i] += ncols;
2271: }
2272: req_source1[i] = onodes1[i];
2273: /* form the header */
2274: sbuf2_i[0] = req_size[i];
2275: for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
2277: MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i);
2278: }
2279: }
2281: PetscFree(onodes1);
2282: PetscFree(olengths1);
2283: PetscFree(r_waits1);
2284: PetscFree4(w1, w2, w3, w4);
2286:     /* Receive messages */
2287: PetscMalloc1(nrqs, &r_waits3);
2288: MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE);
2289: for (i = 0; i < nrqs; ++i) {
2290: PetscMalloc1(rbuf2[i][0], &rbuf3[i]);
2291: req_source2[i] = pa[i];
2292: MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i);
2293: }
2294: PetscFree(r_waits2);
2296: /* Wait on sends1 and sends2 */
2297: MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE);
2298: MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE);
2299: PetscFree(s_waits1);
2300: PetscFree(s_waits2);
2302: /* Now allocate sending buffers for a->j, and send them off */
2303: PetscMalloc1(nrqr, &sbuf_aj);
2304: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2305: if (nrqr) PetscMalloc1(j, &sbuf_aj[0]);
2306: for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
2308: PetscMalloc1(nrqr, &s_waits3);
2309: {
2310: PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2311: PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2312: PetscInt cend = C->cmap->rend;
2313: PetscInt *a_j = a->j, *b_j = b->j, ctmp;
2315: for (i = 0; i < nrqr; i++) {
2316: rbuf1_i = rbuf1[i];
2317: sbuf_aj_i = sbuf_aj[i];
2318: ct1 = 2 * rbuf1_i[0] + 1;
2319: ct2 = 0;
2320: for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2321: kmax = rbuf1[i][2 * j];
2322: for (k = 0; k < kmax; k++, ct1++) {
2323: row = rbuf1_i[ct1] - rstart;
2324: nzA = a_i[row + 1] - a_i[row];
2325: nzB = b_i[row + 1] - b_i[row];
2326: ncols = nzA + nzB;
2327: cworkA = a_j + a_i[row];
2328: cworkB = b_j + b_i[row];
2330: /* load the column indices for this row into cols */
2331: cols = sbuf_aj_i + ct2;
2333: lwrite = 0;
2334: for (l = 0; l < nzB; l++) {
2335: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2336: }
2337: for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2338: for (l = 0; l < nzB; l++) {
2339: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2340: }
2342: ct2 += ncols;
2343: }
2344: }
2345: MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i);
2346: }
2347: }
2349: /* create col map: global col of C -> local col of submatrices */
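    /* Both variants below store (local column + 1) keyed by the global column; a lookup
       result of 0 therefore means the global column does not belong to iscol[i]. */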
2350: {
2351: const PetscInt *icol_i;
2352: #if defined(PETSC_USE_CTABLE)
2353: for (i = 0; i < ismax; i++) {
2354: if (!allcolumns[i]) {
2355: PetscTableCreate(ncol[i], C->cmap->N, &cmap[i]);
2357: jmax = ncol[i];
2358: icol_i = icol[i];
2359: cmap_i = cmap[i];
2360: for (j = 0; j < jmax; j++) PetscTableAdd(cmap[i], icol_i[j] + 1, j + 1, INSERT_VALUES);
2361: } else cmap[i] = NULL;
2362: }
2363: #else
2364: for (i = 0; i < ismax; i++) {
2365: if (!allcolumns[i]) {
2366: PetscCalloc1(C->cmap->N, &cmap[i]);
2367: jmax = ncol[i];
2368: icol_i = icol[i];
2369: cmap_i = cmap[i];
2370: for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2371: } else cmap[i] = NULL;
2372: }
2373: #endif
2374: }
2376: /* Create lens which is required for MatCreate... */
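    /* lens[i] points into one contiguous block of length sum_i nrow[i]; lens[i][j] accumulates
       the nonzero count of local row j of submatrix i and is later handed to
       MatSeqAIJSetPreallocation() when the submatrices are created. */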
2377: for (i = 0, j = 0; i < ismax; i++) j += nrow[i];
2378: PetscMalloc1(ismax, &lens);
2380: if (ismax) PetscCalloc1(j, &lens[0]);
2381: for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1];
2383: /* Update lens from local data */
2384: for (i = 0; i < ismax; i++) {
2385: row2proc_i = row2proc[i];
2386: jmax = nrow[i];
2387: if (!allcolumns[i]) cmap_i = cmap[i];
2388: irow_i = irow[i];
2389: lens_i = lens[i];
2390: for (j = 0; j < jmax; j++) {
2391: row = irow_i[j];
2392: proc = row2proc_i[j];
2393: if (proc == rank) {
2394: MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL);
2395: if (!allcolumns[i]) {
2396: for (k = 0; k < ncols; k++) {
2397: #if defined(PETSC_USE_CTABLE)
2398: PetscTableFind(cmap_i, cols[k] + 1, &tcol);
2399: #else
2400: tcol = cmap_i[cols[k]];
2401: #endif
2402: if (tcol) lens_i[j]++;
2403: }
2404: } else { /* allcolumns */
2405: lens_i[j] = ncols;
2406: }
2407: MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL);
2408: }
2409: }
2410: }
2412: /* Create row map: global row of C -> local row of submatrices */
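    /* Note the two variants differ slightly: the PetscTable version stores (local row + 1) and
       callers decrement after PetscTableFind(), while the plain-array version stores the local
       row index directly. */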
2413: #if defined(PETSC_USE_CTABLE)
2414: for (i = 0; i < ismax; i++) {
2415: PetscTableCreate(nrow[i], C->rmap->N, &rmap[i]);
2416: irow_i = irow[i];
2417: jmax = nrow[i];
2418: for (j = 0; j < jmax; j++) PetscTableAdd(rmap[i], irow_i[j] + 1, j + 1, INSERT_VALUES);
2419: }
2420: #else
2421: for (i = 0; i < ismax; i++) {
2422: PetscCalloc1(C->rmap->N, &rmap[i]);
2423: rmap_i = rmap[i];
2424: irow_i = irow[i];
2425: jmax = nrow[i];
2426: for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2427: }
2428: #endif
2430:     /* Update lens from off-proc data */
2431: {
2432: PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;
2434: MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE);
2435: for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2436: sbuf1_i = sbuf1[pa[tmp2]];
2437: jmax = sbuf1_i[0];
2438: ct1 = 2 * jmax + 1;
2439: ct2 = 0;
2440: rbuf2_i = rbuf2[tmp2];
2441: rbuf3_i = rbuf3[tmp2];
2442: for (j = 1; j <= jmax; j++) {
2443: is_no = sbuf1_i[2 * j - 1];
2444: max1 = sbuf1_i[2 * j];
2445: lens_i = lens[is_no];
2446: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2447: rmap_i = rmap[is_no];
2448: for (k = 0; k < max1; k++, ct1++) {
2449: #if defined(PETSC_USE_CTABLE)
2450: PetscTableFind(rmap_i, sbuf1_i[ct1] + 1, &row);
2451: row--;
2453: #else
2454:           row = rmap_i[sbuf1_i[ct1]]; /* local row index in the submatrix */
2455: #endif
2456: max2 = rbuf2_i[ct1];
2457: for (l = 0; l < max2; l++, ct2++) {
2458: if (!allcolumns[is_no]) {
2459: #if defined(PETSC_USE_CTABLE)
2460: PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol);
2461: #else
2462: tcol = cmap_i[rbuf3_i[ct2]];
2463: #endif
2464: if (tcol) lens_i[row]++;
2465: } else { /* allcolumns */
2466: lens_i[row]++; /* lens_i[row] += max2 ? */
2467: }
2468: }
2469: }
2470: }
2471: }
2472: }
2473: PetscFree(r_waits3);
2474: MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE);
2475: PetscFree(s_waits3);
2477: /* Create the submatrices */
2478: for (i = 0; i < ismax; i++) {
2479: PetscInt rbs, cbs;
2481: ISGetBlockSize(isrow[i], &rbs);
2482: ISGetBlockSize(iscol[i], &cbs);
2484: MatCreate(PETSC_COMM_SELF, submats + i);
2485: MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE);
2487: MatSetBlockSizes(submats[i], rbs, cbs);
2488: MatSetType(submats[i], ((PetscObject)A)->type_name);
2489: MatSeqAIJSetPreallocation(submats[i], 0, lens[i]);
2491:       /* create struct Mat_SubSppt and attach it to submat */
2492: PetscNew(&smat_i);
2493: subc = (Mat_SeqAIJ *)submats[i]->data;
2494: subc->submatis1 = smat_i;
2496: smat_i->destroy = submats[i]->ops->destroy;
2497: submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2498: submats[i]->factortype = C->factortype;
2500: smat_i->id = i;
2501: smat_i->nrqs = nrqs;
2502: smat_i->nrqr = nrqr;
2503: smat_i->rbuf1 = rbuf1;
2504: smat_i->rbuf2 = rbuf2;
2505: smat_i->rbuf3 = rbuf3;
2506: smat_i->sbuf2 = sbuf2;
2507: smat_i->req_source2 = req_source2;
2509: smat_i->sbuf1 = sbuf1;
2510: smat_i->ptr = ptr;
2511: smat_i->tmp = tmp;
2512: smat_i->ctr = ctr;
2514: smat_i->pa = pa;
2515: smat_i->req_size = req_size;
2516: smat_i->req_source1 = req_source1;
2518: smat_i->allcolumns = allcolumns[i];
2519: smat_i->singleis = PETSC_FALSE;
2520: smat_i->row2proc = row2proc[i];
2521: smat_i->rmap = rmap[i];
2522: smat_i->cmap = cmap[i];
2523: }
2525:     if (!ismax) { /* Create a dummy submats[0] to carry the reuse struct Mat_SubSppt */
2526: MatCreate(PETSC_COMM_SELF, &submats[0]);
2527: MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE);
2528: MatSetType(submats[0], MATDUMMY);
2530:       /* create struct Mat_SubSppt and attach it to submat */
2531: PetscNew(&smat_i);
2532: submats[0]->data = (void *)smat_i;
2534: smat_i->destroy = submats[0]->ops->destroy;
2535: submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2536: submats[0]->factortype = C->factortype;
2538: smat_i->id = 0;
2539: smat_i->nrqs = nrqs;
2540: smat_i->nrqr = nrqr;
2541: smat_i->rbuf1 = rbuf1;
2542: smat_i->rbuf2 = rbuf2;
2543: smat_i->rbuf3 = rbuf3;
2544: smat_i->sbuf2 = sbuf2;
2545: smat_i->req_source2 = req_source2;
2547: smat_i->sbuf1 = sbuf1;
2548: smat_i->ptr = ptr;
2549: smat_i->tmp = tmp;
2550: smat_i->ctr = ctr;
2552: smat_i->pa = pa;
2553: smat_i->req_size = req_size;
2554: smat_i->req_source1 = req_source1;
2556: smat_i->allcolumns = PETSC_FALSE;
2557: smat_i->singleis = PETSC_FALSE;
2558: smat_i->row2proc = NULL;
2559: smat_i->rmap = NULL;
2560: smat_i->cmap = NULL;
2561: }
2563: if (ismax) PetscFree(lens[0]);
2564: PetscFree(lens);
2565: if (sbuf_aj) {
2566: PetscFree(sbuf_aj[0]);
2567: PetscFree(sbuf_aj);
2568: }
2570:   } /* end of scall == MAT_INITIAL_MATRIX */
2572: /* Post recv matrix values */
2573: PetscObjectGetNewTag((PetscObject)C, &tag4);
2574: PetscMalloc1(nrqs, &rbuf4);
2575: PetscMalloc1(nrqs, &r_waits4);
2576: for (i = 0; i < nrqs; ++i) {
2577: PetscMalloc1(rbuf2[i][0], &rbuf4[i]);
2578: MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i);
2579: }
2581: /* Allocate sending buffers for a->a, and send them off */
2582: PetscMalloc1(nrqr, &sbuf_aa);
2583: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2584: if (nrqr) PetscMalloc1(j, &sbuf_aa[0]);
2585: for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
2587: PetscMalloc1(nrqr, &s_waits4);
2588: {
2589: PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2590: PetscInt cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2591: PetscInt cend = C->cmap->rend;
2592: PetscInt *b_j = b->j;
2593: PetscScalar *vworkA, *vworkB, *a_a, *b_a;
2595: MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a);
2596: MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a);
2597: for (i = 0; i < nrqr; i++) {
2598: rbuf1_i = rbuf1[i];
2599: sbuf_aa_i = sbuf_aa[i];
2600: ct1 = 2 * rbuf1_i[0] + 1;
2601: ct2 = 0;
2602: for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2603: kmax = rbuf1_i[2 * j];
2604: for (k = 0; k < kmax; k++, ct1++) {
2605: row = rbuf1_i[ct1] - rstart;
2606: nzA = a_i[row + 1] - a_i[row];
2607: nzB = b_i[row + 1] - b_i[row];
2608: ncols = nzA + nzB;
2609: cworkB = b_j + b_i[row];
2610: vworkA = a_a + a_i[row];
2611: vworkB = b_a + b_i[row];
2613:         /* load the column values for this row into vals */
2614: vals = sbuf_aa_i + ct2;
2616: lwrite = 0;
2617: for (l = 0; l < nzB; l++) {
2618: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2619: }
2620: for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2621: for (l = 0; l < nzB; l++) {
2622: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2623: }
2625: ct2 += ncols;
2626: }
2627: }
2628: MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i);
2629: }
2630: MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a);
2631: MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a);
2632: }
2634: /* Assemble the matrices */
2635: /* First assemble the local rows */
2636: for (i = 0; i < ismax; i++) {
2637: row2proc_i = row2proc[i];
2638: subc = (Mat_SeqAIJ *)submats[i]->data;
2639: imat_ilen = subc->ilen;
2640: imat_j = subc->j;
2641: imat_i = subc->i;
2642: imat_a = subc->a;
2644: if (!allcolumns[i]) cmap_i = cmap[i];
2645: rmap_i = rmap[i];
2646: irow_i = irow[i];
2647: jmax = nrow[i];
2648: for (j = 0; j < jmax; j++) {
2649: row = irow_i[j];
2650: proc = row2proc_i[j];
2651: if (proc == rank) {
2652: old_row = row;
2653: #if defined(PETSC_USE_CTABLE)
2654: PetscTableFind(rmap_i, row + 1, &row);
2655: row--;
2656: #else
2657: row = rmap_i[row];
2658: #endif
2659: ilen_row = imat_ilen[row];
2660: MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals);
2661: mat_i = imat_i[row];
2662: mat_a = imat_a + mat_i;
2663: mat_j = imat_j + mat_i;
2664: if (!allcolumns[i]) {
2665: for (k = 0; k < ncols; k++) {
2666: #if defined(PETSC_USE_CTABLE)
2667: PetscTableFind(cmap_i, cols[k] + 1, &tcol);
2668: #else
2669: tcol = cmap_i[cols[k]];
2670: #endif
2671: if (tcol) {
2672: *mat_j++ = tcol - 1;
2673: *mat_a++ = vals[k];
2674: ilen_row++;
2675: }
2676: }
2677: } else { /* allcolumns */
2678: for (k = 0; k < ncols; k++) {
2679: *mat_j++ = cols[k]; /* global col index! */
2680: *mat_a++ = vals[k];
2681: ilen_row++;
2682: }
2683: }
2684: MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals);
2686: imat_ilen[row] = ilen_row;
2687: }
2688: }
2689: }
2691:   /* Now assemble the off-proc rows */
2692: MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE);
2693: for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2694: sbuf1_i = sbuf1[pa[tmp2]];
2695: jmax = sbuf1_i[0];
2696: ct1 = 2 * jmax + 1;
2697: ct2 = 0;
2698: rbuf2_i = rbuf2[tmp2];
2699: rbuf3_i = rbuf3[tmp2];
2700: rbuf4_i = rbuf4[tmp2];
2701: for (j = 1; j <= jmax; j++) {
2702: is_no = sbuf1_i[2 * j - 1];
2703: rmap_i = rmap[is_no];
2704: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2705: subc = (Mat_SeqAIJ *)submats[is_no]->data;
2706: imat_ilen = subc->ilen;
2707: imat_j = subc->j;
2708: imat_i = subc->i;
2709: imat_a = subc->a;
2710: max1 = sbuf1_i[2 * j];
2711: for (k = 0; k < max1; k++, ct1++) {
2712: row = sbuf1_i[ct1];
2713: #if defined(PETSC_USE_CTABLE)
2714: PetscTableFind(rmap_i, row + 1, &row);
2715: row--;
2716: #else
2717: row = rmap_i[row];
2718: #endif
2719: ilen = imat_ilen[row];
2720: mat_i = imat_i[row];
2721: mat_a = imat_a + mat_i;
2722: mat_j = imat_j + mat_i;
2723: max2 = rbuf2_i[ct1];
2724: if (!allcolumns[is_no]) {
2725: for (l = 0; l < max2; l++, ct2++) {
2726: #if defined(PETSC_USE_CTABLE)
2727: PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol);
2728: #else
2729: tcol = cmap_i[rbuf3_i[ct2]];
2730: #endif
2731: if (tcol) {
2732: *mat_j++ = tcol - 1;
2733: *mat_a++ = rbuf4_i[ct2];
2734: ilen++;
2735: }
2736: }
2737: } else { /* allcolumns */
2738: for (l = 0; l < max2; l++, ct2++) {
2739: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2740: *mat_a++ = rbuf4_i[ct2];
2741: ilen++;
2742: }
2743: }
2744: imat_ilen[row] = ilen;
2745: }
2746: }
2747: }
2749: if (!iscsorted) { /* sort column indices of the rows */
2750: for (i = 0; i < ismax; i++) {
2751: subc = (Mat_SeqAIJ *)submats[i]->data;
2752: imat_j = subc->j;
2753: imat_i = subc->i;
2754: imat_a = subc->a;
2755: imat_ilen = subc->ilen;
2757: if (allcolumns[i]) continue;
2758: jmax = nrow[i];
2759: for (j = 0; j < jmax; j++) {
2760: mat_i = imat_i[j];
2761: mat_a = imat_a + mat_i;
2762: mat_j = imat_j + mat_i;
2763: PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a);
2764: }
2765: }
2766: }
2768: PetscFree(r_waits4);
2769: MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE);
2770: PetscFree(s_waits4);
2772: /* Restore the indices */
2773: for (i = 0; i < ismax; i++) {
2774: ISRestoreIndices(isrow[i], irow + i);
2775: if (!allcolumns[i]) ISRestoreIndices(iscol[i], icol + i);
2776: }
2778: for (i = 0; i < ismax; i++) {
2779: MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY);
2780: MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY);
2781: }
2783: /* Destroy allocated memory */
2784: if (sbuf_aa) {
2785: PetscFree(sbuf_aa[0]);
2786: PetscFree(sbuf_aa);
2787: }
2788: PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted);
2790: for (i = 0; i < nrqs; ++i) PetscFree(rbuf4[i]);
2791: PetscFree(rbuf4);
2793: PetscFree4(row2proc, cmap, rmap, allcolumns);
2794: return 0;
2795: }
2797: /*
2798:  Permute A & B into C's *local* index space using rowemb, dcolemb for A and rowemb, ocolemb for B.
2799:  The embeddings are supposed to be injections, which implies that the range of rowemb is a subset
2800:  of [0,m), that of dcolemb is a subset of [0,n), and that of ocolemb is a subset of [0,N-n).
2801: If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2802: After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2803: state, and needs to be "assembled" later by compressing B's column space.
2805: This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2806: Following this call, C->A & C->B have been created, even if empty.
2807: */
2808: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2809: {
2810: /* If making this function public, change the error returned in this function away from _PLIB. */
2811: Mat_MPIAIJ *aij;
2812: Mat_SeqAIJ *Baij;
2813: PetscBool seqaij, Bdisassembled;
2814: PetscInt m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count;
2815: PetscScalar v;
2816: const PetscInt *rowindices, *colindices;
2818: /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2819: if (A) {
2820: PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);
2822: if (rowemb) {
2823: ISGetLocalSize(rowemb, &m);
2825: } else {
2827: }
2828: if (dcolemb) {
2829: ISGetLocalSize(dcolemb, &n);
2831: } else {
2833: }
2834: }
2835: if (B) {
2836: PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);
2838: if (rowemb) {
2839: ISGetLocalSize(rowemb, &m);
2841: } else {
2843: }
2844: if (ocolemb) {
2845: ISGetLocalSize(ocolemb, &n);
2847: } else {
2849: }
2850: }
2852: aij = (Mat_MPIAIJ *)C->data;
2853: if (!aij->A) {
2854: /* Mimic parts of MatMPIAIJSetPreallocation() */
2855: MatCreate(PETSC_COMM_SELF, &aij->A);
2856: MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n);
2857: MatSetBlockSizesFromMats(aij->A, C, C);
2858: MatSetType(aij->A, MATSEQAIJ);
2859: }
2860: if (A) {
2861: MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A);
2862: } else {
2863: MatSetUp(aij->A);
2864: }
2865: if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2866: /*
2867: If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2868: need to "disassemble" B -- convert it to using C's global indices.
2869: To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2871: If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2872: we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2874: TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2875: At least avoid calling MatSetValues() and the implied searches?
2876: */
2878: if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2879: #if defined(PETSC_USE_CTABLE)
2880: PetscTableDestroy(&aij->colmap);
2881: #else
2882: PetscFree(aij->colmap);
2883:     /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2884: #endif
2885: ngcol = 0;
2886: if (aij->lvec) VecGetSize(aij->lvec, &ngcol);
2887: if (aij->garray) { PetscFree(aij->garray); }
2888: VecDestroy(&aij->lvec);
2889: VecScatterDestroy(&aij->Mvctx);
2890: }
2891: if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) MatDestroy(&aij->B);
2892: if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) MatZeroEntries(aij->B);
2893: }
2894: Bdisassembled = PETSC_FALSE;
2895: if (!aij->B) {
2896: MatCreate(PETSC_COMM_SELF, &aij->B);
2897: MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N);
2898: MatSetBlockSizesFromMats(aij->B, B, B);
2899: MatSetType(aij->B, MATSEQAIJ);
2900: Bdisassembled = PETSC_TRUE;
2901: }
2902: if (B) {
2903: Baij = (Mat_SeqAIJ *)B->data;
2904: if (pattern == DIFFERENT_NONZERO_PATTERN) {
2905: PetscMalloc1(B->rmap->n, &nz);
2906: for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2907: MatSeqAIJSetPreallocation(aij->B, 0, nz);
2908: PetscFree(nz);
2909: }
2911: PetscLayoutGetRange(C->rmap, &rstart, &rend);
2912: shift = rend - rstart;
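    /* When B has just been created above (Bdisassembled), its column indices live in the compressed
       off-diagonal space; indices at or beyond rstart are shifted by rend - rstart to skip over the
       locally owned diagonal block, producing indices in C's global column space (the "disassembled"
       state described in the header comment). */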
2913: count = 0;
2914: rowindices = NULL;
2915: colindices = NULL;
2916: if (rowemb) ISGetIndices(rowemb, &rowindices);
2917: if (ocolemb) ISGetIndices(ocolemb, &colindices);
2918: for (i = 0; i < B->rmap->n; i++) {
2919: PetscInt row;
2920: row = i;
2921: if (rowindices) row = rowindices[i];
2922: for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2923: col = Baij->j[count];
2924: if (colindices) col = colindices[col];
2925: if (Bdisassembled && col >= rstart) col += shift;
2926: v = Baij->a[count];
2927: MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES);
2928: ++count;
2929: }
2930: }
2931: /* No assembly for aij->B is necessary. */
2932: /* FIXME: set aij->B's nonzerostate correctly. */
2933: } else {
2934: MatSetUp(aij->B);
2935: }
2936: C->preallocated = PETSC_TRUE;
2937: C->was_assembled = PETSC_FALSE;
2938: C->assembled = PETSC_FALSE;
2939: /*
2940: C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2941: Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2942: */
2943: return 0;
2944: }
2946: /*
2947: B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
2948: */
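/* Concretely, an entry stored with local column index j in B corresponds to global column garray[j]
   of C (the array used as bmap = c->garray elsewhere in this file). */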
2949: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
2950: {
2951: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
2955: /* FIXME: make sure C is assembled */
2956: *A = aij->A;
2957: *B = aij->B;
2958: /* Note that we don't incref *A and *B, so be careful! */
2959: return 0;
2960: }
2962: /*
2963: Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
2964: NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
2965: */
2966: PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
2967: {
2968: PetscMPIInt size, flag;
2969: PetscInt i, ii, cismax, ispar;
2970: Mat *A, *B;
2971: IS *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;
2973: if (!ismax) return 0;
2975: for (i = 0, cismax = 0; i < ismax; ++i) {
2976: MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag);
2978: MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
2979: if (size > 1) ++cismax;
2980: }
2982: /*
2983: If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
2984: ispar counts the number of parallel ISs across C's comm.
2985: */
2986: MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C));
2987: if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
2988: (*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat);
2989: return 0;
2990: }
2992: /* if (ispar) */
2993: /*
2994: Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
2995: These are used to extract the off-diag portion of the resulting parallel matrix.
2996: The row IS for the off-diag portion is the same as for the diag portion,
2997: so we merely alias (without increfing) the row IS, while skipping those that are sequential.
2998: */
2999: PetscMalloc2(cismax, &cisrow, cismax, &ciscol);
3000: PetscMalloc1(cismax, &ciscol_p);
3001: for (i = 0, ii = 0; i < ismax; ++i) {
3002: MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3003: if (size > 1) {
3004: /*
3005: TODO: This is the part that's ***NOT SCALABLE***.
3006: To fix this we need to extract just the indices of C's nonzero columns
3007: that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3008: part of iscol[i] -- without actually computing ciscol[ii]. This also has
3009: to be done without serializing on the IS list, so, most likely, it is best
3010: done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3011: */
3012: ISGetNonlocalIS(iscol[i], &(ciscol[ii]));
3013: /* Now we have to
3014: (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3015: were sorted on each rank, concatenated they might no longer be sorted;
3016: (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3017: indices in the nondecreasing order to the original index positions.
3018: If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3019: */
3020: ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii);
3021: ISSort(ciscol[ii]);
3022: ++ii;
3023: }
3024: }
3025: PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p);
3026: for (i = 0, ii = 0; i < ismax; ++i) {
3027: PetscInt j, issize;
3028: const PetscInt *indices;
3030: /*
3031: Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3032: */
3033: ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i);
3034: ISSort(isrow[i]);
3035: ISGetLocalSize(isrow[i], &issize);
3036: ISGetIndices(isrow[i], &indices);
3037: for (j = 1; j < issize; ++j) {
3039: }
3040: ISRestoreIndices(isrow[i], &indices);
3041: ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i);
3042: ISSort(iscol[i]);
3043: ISGetLocalSize(iscol[i], &issize);
3044: ISGetIndices(iscol[i], &indices);
3045: for (j = 1; j < issize; ++j) {
3047: }
3048: ISRestoreIndices(iscol[i], &indices);
3049: MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3050: if (size > 1) {
3051: cisrow[ii] = isrow[i];
3052: ++ii;
3053: }
3054: }
3055: /*
3056: Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3057: array of sequential matrices underlying the resulting parallel matrices.
3058: Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3059: contain duplicates.
3061: There are as many diag matrices as there are original index sets. There are only as many parallel
3062: and off-diag matrices, as there are parallel (comm size > 1) index sets.
3064: ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3065: - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3066: and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3067:    will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3068: with A[i] and B[ii] extracted from the corresponding MPI submat.
3069: - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3070: will have a different order from what getsubmats_seq expects. To handle this case -- indicated
3071: by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3072:    (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3073: values into A[i] and B[ii] sitting inside the corresponding submat.
3074: - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3075: A[i], B[ii], AA[i] or BB[ii] matrices.
3076: */
3077: /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3078: if (scall == MAT_INITIAL_MATRIX) PetscMalloc1(ismax, submat);
3080: /* Now obtain the sequential A and B submatrices separately. */
3081: /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3082: (*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A);
3083: (*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B);
3085: /*
3086: If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3087: matrices A & B have been extracted directly into the parallel matrices containing them, or
3088: simply into the sequential matrix identical with the corresponding A (if size == 1).
3089: Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3090: to have the same sparsity pattern.
3091: Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3092: must be constructed for C. This is done by setseqmat(s).
3093: */
3094: for (i = 0, ii = 0; i < ismax; ++i) {
3095: /*
3096: TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3097: That way we can avoid sorting and computing permutations when reusing.
3098: To this end:
3099: - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3100: - if caching arrays to hold the ISs, make and compose a container for them so that it can
3101: be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3102: */
3103: MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3105: MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3106: /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3107: if (size > 1) {
3108: if (scall == MAT_INITIAL_MATRIX) {
3109: MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i);
3110: MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE);
3111: MatSetType((*submat)[i], MATMPIAIJ);
3112: PetscLayoutSetUp((*submat)[i]->rmap);
3113: PetscLayoutSetUp((*submat)[i]->cmap);
3114: }
3115: /*
3116: For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3117: */
3118: {
3119: Mat AA = A[i], BB = B[ii];
3121: if (AA || BB) {
3122: setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB);
3123: MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY);
3124: MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY);
3125: }
3126: MatDestroy(&AA);
3127: }
3128: ISDestroy(ciscol + ii);
3129: ISDestroy(ciscol_p + ii);
3130: ++ii;
3131: } else { /* if (size == 1) */
3132: if (scall == MAT_REUSE_MATRIX) MatDestroy(&(*submat)[i]);
3133: if (isrow_p[i] || iscol_p[i]) {
3134: MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i);
3135: setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]);
3136:         /* Otherwise A is extracted straight into (*submat)[i]. */
3137:         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3138: MatDestroy(A + i);
3139: } else (*submat)[i] = A[i];
3140: }
3141: ISDestroy(&isrow_p[i]);
3142: ISDestroy(&iscol_p[i]);
3143: }
3144: PetscFree2(cisrow, ciscol);
3145: PetscFree2(isrow_p, iscol_p);
3146: PetscFree(ciscol_p);
3147: PetscFree(A);
3148: MatDestroySubMatrices(cismax, &B);
3149: return 0;
3150: }
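/* The wrapper below binds the generic driver above to the MPIAIJ-specific callbacks: sequential
   submatrix extraction, access to the local diagonal/off-diagonal blocks, and the routines that
   embed sequential pieces back into an MPI matrix. */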
3152: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
3153: {
3154: MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ);
3155: return 0;
3156: }