Optimized Vector Dot Product
; Vector dot product Y = sum of a(n) * x(n), two 16-bit terms per pass.
; Preconditions (set up before this listing):
;   A4 = 0 and B4 = 0   even / odd partial sums (Yeven, Yodd)
;   A5 -> a(), B6 -> x() word-aligned input vectors
; NOTE(review): the original note named pointers A6 and A7, but the loop
; reads x() through B6 and never touches A7 -- confirm the caller's setup.
        MVK   .S1   40,A2       ; A2 = 40 (loop counter)
                                ; NOTE(review): each pass consumes two
                                ; elements per vector -- confirm that 40
                                ; passes (not 40 elements) is intended
loop    LDW   .D1   *A5++,A0    ; load a(n) and a(n+1)
        LDW   .D2   *B6++,B1    ; load x(n) and x(n+1)
        NOP   4                 ; LDW has 4 delay slots: wait until both
                                ; A0 and B1 hold the loaded words
        MPY   .M1X  A0,B1,A3    ; A3 = a(n) * x(n); B1 arrives on cross path
        MPYH  .M2X  B1,A0,B3    ; B3 = a(n+1) * x(n+1); A0 is the cross-path
                                ; operand, so it must sit in the src2 slot
        ADD   .L1   A3,A4,A4    ; Yeven = Yeven + A3 (MPY delay slot elapsed)
        ADD   .L2   B3,B4,B4    ; Yodd = Yodd + B3 (MPYH delay slot elapsed)
        SUB   .S1   A2,1,A2     ; decrement loop counter
 [A2]   B     .S2   loop        ; if A2 != 0, then branch
        NOP   5                 ; fill the 5 branch delay slots so the final
                                ; sum below is not executed on every pass
        ADD   .L1X  A4,B4,A4    ; Y = Yodd + Yeven; B4 read via cross path
Retime the summation: compute the odd- and even-indexed terms at the same time, utilize all eight functional units in the loop, and put the sequential instructions in parallel.