Actual source code: baijsolvnat4.c

#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/*
   Special case where the matrix was ILU(0) factored in the natural
   ordering, which eliminates the need for row and column permutations
   during the solve.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  PetscInt          n  = a->mbs;        /* number of block rows */
  const PetscInt    *ai=a->i,*aj=a->j;  /* block CSR row pointers and block column indices */
  const PetscInt    *diag = a->diag;    /* offsets of the diagonal blocks */
  const MatScalar   *aa   = a->a;
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb,&b));
  PetscCall(VecGetArray(xx,&x));

#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
  {
    static PetscScalar w[2000]; /* FIXME: fixed-size static work array; fails for large n and is not thread-safe */
    fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
  }
#elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
  fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
#else
  {
    PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
    const MatScalar *v;
    PetscInt        jdx,idt,idx,nz,i,ai16;
    const PetscInt  *vi;

    /* forward solve with the lower triangular factor L */
    idx  = 0;
    x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; /* first block row of L is the identity */
    for (i=1; i<n; i++) {
      v    =  aa      + 16*ai[i];
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];  /* number of blocks strictly below the diagonal in block row i */
      idx +=  4;
      s1   = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx];
      while (nz--) {
        jdx = 4*(*vi++);
        x1  = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; x4 = x[3+jdx];
        /* s -= V*x for a column-major 4x4 block V */
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      x[idx]   = s1;
      x[1+idx] = s2;
      x[2+idx] = s3;
      x[3+idx] = s4;
    }
    /* backward solve with the upper triangular factor U */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      ai16 = 16*diag[i];
      v    = aa + ai16 + 16;         /* first block past the diagonal in block row i */
      vi   = aj + diag[i] + 1;
      nz   = ai[i+1] - diag[i] - 1;  /* number of blocks strictly above the diagonal */
      s1   = x[idt];  s2 = x[1+idt];
      s3   = x[2+idt];s4 = x[3+idt];
      while (nz--) {
        idx = 4*(*vi++);
        x1  = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
        v  += 16;
      }
      /* multiply by the inverted diagonal block stored at aa + 16*diag[i] */
      v        = aa + ai16;
      x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
      x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
      x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
      x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
      idt     -= 4;
    }
  }
#endif

  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  PetscFunctionReturn(0);
}
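
/*
   Illustrative usage sketch (an assumption for illustration, not part of this
   file): kernels like the one above are typically reached when a bs=4 SeqBAIJ
   matrix is factored by ILU(0) with the ordering left as natural.  Using the
   public KSP/PC API, that setup looks roughly like the following; the matrix A
   and vectors b, x are assumed to exist, and the helper name is hypothetical.
*/
#include <petscksp.h>
static PetscErrorCode ExampleNaturalOrderingILUSolve(Mat A,Vec b,Vec x)
{
  KSP ksp;
  PC  pc;

  PetscFunctionBegin;
  PetscCall(KSPCreate(PETSC_COMM_SELF,&ksp));
  PetscCall(KSPSetOperators(ksp,A,A));
  PetscCall(KSPGetPC(ksp,&pc));
  PetscCall(PCSetType(pc,PCILU));                               /* ILU with 0 levels of fill by default */
  PetscCall(PCFactorSetMatOrderingType(pc,MATORDERINGNATURAL)); /* keep the natural ordering */
  PetscCall(KSPSolve(ksp,b,x));
  PetscCall(KSPDestroy(&ksp));
  PetscFunctionReturn(0);
}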

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
  PetscInt          i,k,nz,idx,jdx,idt;
  const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;  /* block size (4) and block size squared (16) */
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb,&b));
  PetscCall(VecGetArray(xx,&x));
  /* forward solve with the lower triangular factor L */
  idx  = 0;
  x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];
  for (i=1; i<n; i++) {
    v   = aa + bs2*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];  /* in this factored storage, ai/aj hold only the blocks of L below the diagonal */
    idx = bs*i;
    s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx];
    for (k=0; k<nz; k++) {
      jdx = bs*vi[k];
      x1  = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; x4 = x[3+jdx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

      v += bs2;
    }

    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
  }

  /* backward solve with the upper triangular factor U, whose block rows are stored in reverse order, indexed by adiag */
  for (i=n-1; i>=0; i--) {
    v   = aa + bs2*(adiag[i+1]+1);
    vi  = aj + adiag[i+1]+1;
    nz  = adiag[i] - adiag[i+1] - 1;  /* number of blocks strictly above the diagonal in block row i */
    idt = bs*i;
    s1  = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt];

    for (k=0; k<nz; k++) {
      idx = bs*vi[k];
      x1  = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

      v += bs2;
    }
    /* x = inv(D_i)*s: after the loop v points at the inverted diagonal block */
    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
  }

  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n));
  PetscFunctionReturn(0);
}
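
/*
   Reference sketch (hypothetical helper, not called above) of the hand-unrolled
   update both triangular loops perform: each 4x4 block is stored column-major,
   v[0..3] being the first column, and contributes s -= V*x.
*/
static inline void BlockMultSub4(const MatScalar v[16],const PetscScalar xv[4],PetscScalar s[4])
{
  PetscInt r,c;

  for (c=0; c<4; c++) {                          /* columns of the 4x4 block */
    for (r=0; r<4; r++) s[r] -= v[4*c+r]*xv[c];  /* s -= V*x with column-major V */
  }
}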

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
  const MatScalar   *aa=a->a;
  const PetscScalar *b;
  PetscScalar       *x;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb,&b));
  PetscCall(VecGetArray(xx,&x));

  {
    MatScalar       s1,s2,s3,s4,x1,x2,x3,x4;
    const MatScalar *v;
    MatScalar       *t=(MatScalar*)x; /* reuse the storage of x as a single-precision workspace */
    PetscInt        jdx,idt,idx,nz,i,ai16;
    const PetscInt  *vi;

    /* forward solve with the lower triangular factor L, in single precision */
    idx  = 0;
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (i=1; i<n; i++) {
      v    =  aa      + 16*ai[i];
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];
      idx +=  4;
      s1   = (MatScalar)b[idx];
      s2   = (MatScalar)b[1+idx];
      s3   = (MatScalar)b[2+idx];
      s4   = (MatScalar)b[3+idx];
      while (nz--) {
        jdx = 4*(*vi++);
        x1  = t[jdx];
        x2  = t[1+jdx];
        x3  = t[2+jdx];
        x4  = t[3+jdx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      t[idx]   = s1;
      t[1+idx] = s2;
      t[2+idx] = s3;
      t[3+idx] = s4;
    }
    /* backward solve with the upper triangular factor U; each block row is
       promoted back to double as it is finalized, so the x[idx] reads below
       pick up entries that later block rows have already stored in full precision */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      ai16 = 16*diag[i];
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i+1] - diag[i] - 1;
      s1   = t[idt];
      s2   = t[1+idt];
      s3   = t[2+idt];
      s4   = t[3+idt];
      while (nz--) {
        idx = 4*(*vi++);
        x1  = (MatScalar)x[idx];
        x2  = (MatScalar)x[1+idx];
        x3  = (MatScalar)x[2+idx];
        x4  = (MatScalar)x[3+idx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      /* multiply by the inverted diagonal block and store in double precision */
      v        = aa + ai16;
      x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
      x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
      x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
      x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
      idt     -= 4;
    }
  }

  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  PetscFunctionReturn(0);
}
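
/*
   Sketch of the in-place precision reuse above, assuming MatScalar is float and
   PetscScalar is double: float slot i and double slot i of the shared buffer
   occupy different bytes, so demotion may run in ascending order while promotion
   must run in descending order to avoid overwriting unread values.  The helper
   name is hypothetical, for illustration only.
*/
static void DemotePromoteSketch(PetscScalar *x,PetscInt len)
{
  MatScalar *t = (MatScalar*)x;
  PetscInt  i;

  for (i=0; i<len; i++) t[i] = (MatScalar)x[i];      /* demote: each read stays ahead of the narrower write */
  for (i=len-1; i>=0; i--) x[i] = (PetscScalar)t[i]; /* promote: wider writes land only on already-read slots */
}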

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
  unsigned short *aj=(unsigned short*)a->j; /* the "usj" variant stores column indices as unsigned short */
  int            *ai=a->i,n=a->mbs,*diag = a->diag;
  MatScalar      *aa=a->a;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code demotes double precision to single precision for the
     mixed-precision computation, which may not be numerically acceptable
     for all applications.
  */
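  /* In IEEE single precision the factor blocks carry only about seven
     significant decimal digits, so each preconditioner application is
     correspondingly less accurate than a full double-precision solve. */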
  PREFETCH_NTA(aa+16*ai[1]);

  PetscCall(VecGetArray(bb,&b));
  PetscCall(VecGetArray(xx,&x));
  {
    /* x is first computed in single precision, then promoted in place to double */
    MatScalar      *v,*t=(MatScalar*)x;
    int            nz,i,idt,ai16;
    unsigned int   jdx,idx;
    unsigned short *vi;
    /* Forward solve the lower triangular factor. */

    /* First block row of L is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t,b);
    v   = aa + 16*((unsigned int)ai[1]);

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];
      idx +=  4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      LOAD_PS(&t[idx],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4*((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx],XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      LOAD_PS(&t[idt],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      ai16 = 16*diag[--i];
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization.
      */
      SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
      /* First Column */
      SSE_COPY_PS(XMM0,XMM7)
      SSE_SHUFFLE(XMM0,XMM0,0x00)
      SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1,XMM7)
      SSE_SHUFFLE(XMM1,XMM1,0x55)
      SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
      SSE_ADD_PS(XMM0,XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2,XMM7)
      SSE_SHUFFLE(XMM2,XMM2,0xAA)
      SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
      SSE_ADD_PS(XMM0,XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3,XMM7)
      SSE_SHUFFLE(XMM3,XMM3,0xFF)
      SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
      SSE_ADD_PS(XMM0,XMM3)

      SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      v    = aa + ai16 + 16;
      idt -= 4;
    }

    /* Promote t from single precision back to double precision, in place */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]) cannot be used here: it converts
         in ascending order (0 to 3), and the wider double stores would overwrite
         float entries before they are read.  Convert in descending order instead. */
      PetscScalar *xtemp=&x[idt];
      MatScalar   *ttemp=&t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt     -= 4;
    }
  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
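
/*
   The macro sequences above wrap ordinary SSE instructions.  As a hypothetical
   sketch in plain xmmintrin intrinsics (an assumption for illustration, not
   used by the solver), one column-broadcast multiply-subtract pass over a 4x4
   float block v against the vector xv, accumulated into sum, looks like this:
*/
#include <xmmintrin.h>
static inline __m128 BlockMultSub4SSE(__m128 sum,const float *v,const float *xv)
{
  __m128 x = _mm_loadu_ps(xv);                                                   /* load x0..x3 */
  sum = _mm_sub_ps(sum,_mm_mul_ps(_mm_shuffle_ps(x,x,0x00),_mm_loadu_ps(v)));    /* broadcast x0, column 0 */
  sum = _mm_sub_ps(sum,_mm_mul_ps(_mm_shuffle_ps(x,x,0x55),_mm_loadu_ps(v+4)));  /* broadcast x1, column 1 */
  sum = _mm_sub_ps(sum,_mm_mul_ps(_mm_shuffle_ps(x,x,0xAA),_mm_loadu_ps(v+8)));  /* broadcast x2, column 2 */
  sum = _mm_sub_ps(sum,_mm_mul_ps(_mm_shuffle_ps(x,x,0xFF),_mm_loadu_ps(v+12))); /* broadcast x3, column 3 */
  return sum;
}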

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
  int            *aj=a->j;
  int            *ai=a->i,n=a->mbs,*diag = a->diag;
  MatScalar      *aa=a->a;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code demotes double precision to single precision for the
     mixed-precision computation, which may not be numerically acceptable
     for all applications.
  */
  PREFETCH_NTA(aa+16*ai[1]);

  PetscCall(VecGetArray(bb,&b));
  PetscCall(VecGetArray(xx,&x));
  {
    /* x is first computed in single precision, then promoted in place to double */
    MatScalar *v,*t=(MatScalar*)x;
    int       nz,i,idt,ai16;
    int       jdx,idx;
    int       *vi;
    /* Forward solve the lower triangular factor. */

    /* First block row of L is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t,b);
    v   = aa + 16*ai[1];

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];
      idx +=  4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      LOAD_PS(&t[idx],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4*(*vi++);

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx],XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      LOAD_PS(&t[idt],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      ai16 = 16*diag[--i];
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization.
      */
      SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
      /* First Column */
      SSE_COPY_PS(XMM0,XMM7)
      SSE_SHUFFLE(XMM0,XMM0,0x00)
      SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1,XMM7)
      SSE_SHUFFLE(XMM1,XMM1,0x55)
      SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
      SSE_ADD_PS(XMM0,XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2,XMM7)
      SSE_SHUFFLE(XMM2,XMM2,0xAA)
      SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
      SSE_ADD_PS(XMM0,XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3,XMM7)
      SSE_SHUFFLE(XMM3,XMM3,0xFF)
      SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
      SSE_ADD_PS(XMM0,XMM3)

      SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      v    = aa + ai16 + 16;
      idt -= 4;
    }

    /* Promote t from single precision back to double precision, in place */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]) cannot be used here: it converts
         in ascending order (0 to 3), and the wider double stores would overwrite
         float entries before they are read.  Convert in descending order instead. */
      PetscScalar *xtemp=&x[idt];
      MatScalar   *ttemp=&t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt     -= 4;
    }
  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif