Actual source code: baijsolvtrannat3.c
1: #include <../src/mat/impls/baij/seq/baij.h>
3: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
4: {
5: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
6: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *diag = a->diag;
7: PetscInt i, nz, idx, idt, oidx;
8: const MatScalar *aa = a->a, *v;
9: PetscScalar s1, s2, s3, x1, x2, x3, *x;
11: PetscFunctionBegin;
12: PetscCall(VecCopy(bb, xx));
13: PetscCall(VecGetArray(xx, &x));
15: /* forward solve the U^T */
16: idx = 0;
17: for (i = 0; i < n; i++) {
18: v = aa + 9 * diag[i];
19: /* multiply by the inverse of the block diagonal */
20: x1 = x[idx];
21: x2 = x[1 + idx];
22: x3 = x[2 + idx];
23: s1 = v[0] * x1 + v[1] * x2 + v[2] * x3;
24: s2 = v[3] * x1 + v[4] * x2 + v[5] * x3;
25: s3 = v[6] * x1 + v[7] * x2 + v[8] * x3;
26: v += 9;
28: vi = aj + diag[i] + 1;
29: nz = ai[i + 1] - diag[i] - 1;
30: while (nz--) {
31: oidx = 3 * (*vi++);
32: x[oidx] -= v[0] * s1 + v[1] * s2 + v[2] * s3;
33: x[oidx + 1] -= v[3] * s1 + v[4] * s2 + v[5] * s3;
34: x[oidx + 2] -= v[6] * s1 + v[7] * s2 + v[8] * s3;
35: v += 9;
36: }
37: x[idx] = s1;
38: x[1 + idx] = s2;
39: x[2 + idx] = s3;
40: idx += 3;
41: }
42: /* backward solve the L^T */
43: for (i = n - 1; i >= 0; i--) {
44: v = aa + 9 * diag[i] - 9;
45: vi = aj + diag[i] - 1;
46: nz = diag[i] - ai[i];
47: idt = 3 * i;
48: s1 = x[idt];
49: s2 = x[1 + idt];
50: s3 = x[2 + idt];
51: while (nz--) {
52: idx = 3 * (*vi--);
53: x[idx] -= v[0] * s1 + v[1] * s2 + v[2] * s3;
54: x[idx + 1] -= v[3] * s1 + v[4] * s2 + v[5] * s3;
55: x[idx + 2] -= v[6] * s1 + v[7] * s2 + v[8] * s3;
56: v -= 9;
57: }
58: }
59: PetscCall(VecRestoreArray(xx, &x));
60: PetscCall(PetscLogFlops(2.0 * 9.0 * (a->nz) - 3.0 * A->cmap->n));
61: PetscFunctionReturn(PETSC_SUCCESS);
62: }
64: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A, Vec bb, Vec xx)
65: {
66: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
67: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *diag = a->diag;
68: PetscInt nz, idx, idt, j, i, oidx;
69: const PetscInt bs = A->rmap->bs, bs2 = a->bs2;
70: const MatScalar *aa = a->a, *v;
71: PetscScalar s1, s2, s3, x1, x2, x3, *x;
73: PetscFunctionBegin;
74: PetscCall(VecCopy(bb, xx));
75: PetscCall(VecGetArray(xx, &x));
77: /* forward solve the U^T */
78: idx = 0;
79: for (i = 0; i < n; i++) {
80: v = aa + bs2 * diag[i];
81: /* multiply by the inverse of the block diagonal */
82: x1 = x[idx];
83: x2 = x[1 + idx];
84: x3 = x[2 + idx];
85: s1 = v[0] * x1 + v[1] * x2 + v[2] * x3;
86: s2 = v[3] * x1 + v[4] * x2 + v[5] * x3;
87: s3 = v[6] * x1 + v[7] * x2 + v[8] * x3;
88: v -= bs2;
90: vi = aj + diag[i] - 1;
91: nz = diag[i] - diag[i + 1] - 1;
92: for (j = 0; j > -nz; j--) {
93: oidx = bs * vi[j];
94: x[oidx] -= v[0] * s1 + v[1] * s2 + v[2] * s3;
95: x[oidx + 1] -= v[3] * s1 + v[4] * s2 + v[5] * s3;
96: x[oidx + 2] -= v[6] * s1 + v[7] * s2 + v[8] * s3;
97: v -= bs2;
98: }
99: x[idx] = s1;
100: x[1 + idx] = s2;
101: x[2 + idx] = s3;
102: idx += bs;
103: }
104: /* backward solve the L^T */
105: for (i = n - 1; i >= 0; i--) {
106: v = aa + bs2 * ai[i];
107: vi = aj + ai[i];
108: nz = ai[i + 1] - ai[i];
109: idt = bs * i;
110: s1 = x[idt];
111: s2 = x[1 + idt];
112: s3 = x[2 + idt];
113: for (j = 0; j < nz; j++) {
114: idx = bs * vi[j];
115: x[idx] -= v[0] * s1 + v[1] * s2 + v[2] * s3;
116: x[idx + 1] -= v[3] * s1 + v[4] * s2 + v[5] * s3;
117: x[idx + 2] -= v[6] * s1 + v[7] * s2 + v[8] * s3;
118: v += bs2;
119: }
120: }
121: PetscCall(VecRestoreArray(xx, &x));
122: PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
123: PetscFunctionReturn(PETSC_SUCCESS);
124: }