Matrix Computation Example (cont.)
*** begin pipelining inner loop
; Start of the inner-loop load sequence: four back-to-back execute packets,
; each issuing one word load through aptr and one through bptr/btmp.
;   - A line beginning with the parallel bars issues together with the
;     instruction before it; a line without them starts a new packet.
;   - "[lcntr]" makes an instruction conditional on lcntr being non-zero.
;   - The leading digit in each trailing comment (1-4) numbers the load.
;   - Every load targets the same destination registers (aa0 / bb0).
; NOTE(review): the first packet is tied by its leading parallel bars to an
;   instruction that precedes this excerpt; the code that consumes aa0, bb0,
;   sum0 and icntr is also outside this excerpt.
; NOTE(review): load 1 reads through bptr with no post-increment, while
;   loads 2-4 post-increment btmp. Presumably btmp is initialized (e.g. to
;   bptr+4) outside this excerpt -- confirm.
|| LDW .D1T1 *aptr++(4),aa0 ;1 load a[i] from memory, then aptr += 4
|| LDW .D2T2 *bptr,bb0 ;1 load b[i] from memory (bptr itself is not advanced)
|| SUB .S2X colms,1,lcntr ; lcntr = colms - 1 (load counter = columns - 1)
[lcntr] LDW .D1T1 *aptr++(4),aa0 ;2 if(lcntr) load a[i] from memory, then aptr += 4
|| [lcntr] LDW .D2T2 *btmp++(4),bb0 ;2 if(lcntr) load b[i] from memory, then btmp += 4
|| [lcntr] SUB .L2 lcntr,1,lcntr ;2 if(lcntr) lcntr -= 1
|| SUB .S1 colms,2,icntr ; icntr = colms - 2 (presumably the inner-loop iteration counter -- confirm)
|| ZERO .L1 sum0 ; zero the running sum
[lcntr] LDW .D1T1 *aptr++(4),aa0 ;3 if(lcntr) load a[i] from memory, then aptr += 4
|| [lcntr] LDW .D2T2 *btmp++(4),bb0 ;3 if(lcntr) load b[i] from memory, then btmp += 4
|| [lcntr] SUB .L2 lcntr,1,lcntr ;3 if(lcntr) lcntr -= 1
[lcntr] LDW .D1T1 *aptr++(4),aa0 ;4 if(lcntr) load a[i] from memory, then aptr += 4
|| [lcntr] LDW .D2T2 *btmp++(4),bb0 ;4 if(lcntr) load b[i] from memory, then btmp += 4
|| [lcntr] SUB .L2 lcntr,1,lcntr ;4 if(lcntr) lcntr -= 1