Reorder the accesses - Raspberry Pi Assembler - Free Computer, Programming, Mathematics, Techni

mvn r5, r5, LSL #16 @ r5 <- ~(r5 << 16) fmrx r4, fpscr @ r4 <- fpscr and r4, r4, r5 @ r4 <- r4 & r5 fmxr fpscr, r4 @ fpscr <- r4

vpop {s16-s31} /* Restore preserved floating registers */ pop {r4, r5, r6, r7, r8, lr} /* Restore integer registers */

bx lr /* Leave function */

Note that because we now process j and j + 1 together, r5 (= j) is now increased by 2 at the end of the loop. This is usually known as loop unrolling and it is always legal to do. We do more than one iteration of the original loop in the unrolled loop. The number of iterations of the original loop we do in the unrolled loop is the unroll factor. In this case, since the number of iterations (4) is perfectly divisible by the unrolling factor (2), we do not need an extra loop for any remainder iterations (a remainder loop runs at most one iteration fewer than the unrolling factor).

As you can see, the accesses to b[k][j] and b[k][j+1] are starting to become tedious. Maybe we should make more changes to the matrix multiply algorithm.

C.6 Reorder the accesses

Is there a way we can mitigate the strided accesses to the matrix B? Yes, there is one, we only have to permute the ordering of the loop nesting variables i, j, k into the order k, i, j. Now you may be wondering if this is legal. Well, checking for the legality of these things is beyond the scope of this book so you will have to trust me here. Such permutation is fine. What does this mean? Well, it means that our algorithm will now look like this in C:

float A[N][N];
float B[N][N];  // indexed as B[k][j] with k, j < N, so it is N x N like A and C
// Result
float C[N][N];

// C must be cleared up front: with k outermost, every C[i][j] is
// accumulated into across N separate passes instead of being finished
// in one go.
for (int i = 0; i < N; i++)
  for (int j = 0; j < N; j++)
    C[i][j] = 0;

// Loop order k, i, j: the innermost loop walks row k of B and row i of C
// with consecutive j, so neither matrix is accessed with a stride.
for (int k = 0; k < N; k++)
  for (int i = 0; i < N; i++)
    for (int j = 0; j < N; j++)
      C[i][j] += A[i][k] * B[k][j];

C. Matrix Multiplication (R.F.I.)

This may not seem very useful, but note that, since now k is in the outermost loop, it is easier to use vectorial instructions.

// Same k, i, j algorithm with the j loop written out for N = 4.
// Every statement of the body multiplies the same scalar A[i][k] by one
// element of row k of B and accumulates into the matching element of
// row i of C.
for (int k = 0; k < N; k++)
  for (int i = 0; i < N; i++)
  {
    const float a_ik = A[i][k];  // the one scalar shared by the four updates

    C[i][0] += a_ik * B[k][0];
    C[i][1] += a_ik * B[k][1];
    C[i][2] += a_ik * B[k][2];
    C[i][3] += a_ik * B[k][3];
  }

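If you remember Chapter 14, VFPv2 instructions have a mixed mode when the Rsource2 register is in bank 0: that operand is then used as a scalar while the other operands are vectors. This matches our case perfectly: we can load C[i][0..3] and B[k][0..3] with a load multiple and then load A[i][k] into a register in bank 0. Then we can multiply A[i][k]*B[k][0..3] and add the result to C[i][0..3]. As a bonus, the number of instructions is much smaller.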

/*
 * better_vectorial_matmul_4x4
 *
 * Computes C <- A * B for 4x4 matrices of single-precision floats stored
 * row-major. The loops run in k, i order; each step handles a whole row
 * of B and a whole row of C as a VFPv2 short vector of length 4:
 *
 *     C[i][0..3] += A[i][k] * B[k][0..3]
 *
 * In:    r0 = address of A
 *        r1 = address of B
 *        r2 = address of C
 * Out:   the 16 floats at C are overwritten with the product
 * Saved: r4-r8, lr, s16-s19, s24-s27 (pushed on entry, popped on exit)
 * Uses:  s0, s8-s11, condition flags, FPSCR.LEN (set to 3 inside the
 *        function, the same two bits are cleared again before returning)
 *
 * NOTE(review): the LEN handling ORs in and later clears only bits 16-17
 * of FPSCR, so it assumes LEN is 0 on entry - confirm against callers.
 *
 * Labels carry a Better_ prefix so that this listing does not clash with
 * another listing that uses the same loop label names.
 */
better_vectorial_matmul_4x4:
    push {r4, r5, r6, r7, r8, lr}     /* Keep integer registers */

    /* Floating point registers starting from s16 must be preserved */
    vpush {s16-s19}
    vpush {s24-s27}

    /* First zero 16 single floating point */
    /* In IEEE 754, all bits cleared means 0 */
    mov r4, r2                        @ r4 <- address of C
    mov r5, #16                       @ r5 <- number of elements to clear
    mov r6, #0                        @ r6 <- bit pattern of 0.0
    b Better_loop_init_test
Better_loop_init:
    str r6, [r4], #4                  @ *r4 <- r6 then r4 <- r4 + 4
Better_loop_init_test:
    subs r5, r5, #1                   @ r5 <- r5 - 1 and update the flags
    bge Better_loop_init              @ keep storing while r5 >= 0

    /* Set the LEN field of FPSCR to be 4 (value 3) */
    mov r5, #0b011                    @ r5 <- 3
    mov r5, r5, LSL #16               @ r5 <- r5 << 16
    fmrx r4, fpscr                    @ r4 <- fpscr
    orr r4, r4, r5                    @ r4 <- r4 | r5
    fmxr fpscr, r4                    @ fpscr <- r4

    /* We will use
         r4 as k
         r5 as i
    */
    mov r4, #0                        @ r4 <- 0
Better_loop_k:                        /* loop header of k */
    cmp r4, #4                        /* if r4 == 4 goto end of the loop k */
    beq Better_end_loop_k
    mov r5, #0                        @ r5 <- 0
Better_loop_i:                        /* loop header of i */
    cmp r5, #4                        /* if r5 == 4 goto end of the loop i */
    beq Better_end_loop_i

    /* Compute the address of C[i][0] */
    /* Address of C[i][0] is C + 4*(4 * i) */
    add r7, r2, r5, LSL #4            @ r7 <- r2 + (r5 << 4) = c + 4*4*i
    /* Load {s8,s9,s10,s11} <- {c[i][0],c[i][1],c[i][2],c[i][3]} */
    vldmia r7, {s8-s11}

    /* Compute the address of A[i][k] = A + 4*(4*i + k) */
    add r8, r4, r5, LSL #2            @ r8 <- r4 + (r5 << 2) = k + 4*i
    add r8, r0, r8, LSL #2            @ r8 <- r0 + (r8 << 2) = a + 4*(k + 4*i)
    vldr s0, [r8]                     @ Load s0 <- a[i][k]

    /* Compute the address of B[k][0] */
    /* Address of B[k][0] is B + 4*(4*k) */
    add r8, r1, r4, LSL #4            @ r8 <- r1 + (r4 << 4) = b + 4*(4*k)
    /* Load {s16,s17,s18,s19} <- {b[k][0],b[k][1],b[k][2],b[k][3]} */
    vldmia r8, {s16-s19}

    /* {s24,s25,s26,s27} <- {s16,s17,s18,s19} * {s0,s0,s0,s0} */
    vmul.f32 s24, s16, s0
    /* {s8,s9,s10,s11} <- {s8,s9,s10,s11} + {s24,s25,s26,s27} */
    vadd.f32 s8, s8, s24

    /* Store {c[i][0],c[i][1],c[i][2],c[i][3]} <- {s8,s9,s10,s11} */
    vstmia r7, {s8-s11}

    add r5, r5, #1                    @ r5 <- r5 + 1; i.e., i = i + 1
    b Better_loop_i                   /* next iteration of loop i */
Better_end_loop_i:                    /* Here ends loop i */
    add r4, r4, #1                    @ r4 <- r4 + 1; i.e., k = k + 1
    b Better_loop_k                   /* next iteration of loop k */
Better_end_loop_k:                    /* Here ends loop k */

    /* Set the LEN field of FPSCR back to 1 (value 0) */
    mov r5, #0b011                    @ r5 <- 3
    mvn r5, r5, LSL #16               @ r5 <- ~(r5 << 16)
    fmrx r4, fpscr                    @ r4 <- fpscr
    and r4, r4, r5                    @ r4 <- r4 & r5
    fmxr fpscr, r4                    @ fpscr <- r4

    /* Restore preserved floating registers, in reverse push order */
    vpop {s24-s27}
    vpop {s16-s19}
    pop {r4, r5, r6, r7, r8, lr}      /* Restore integer registers */
    bx lr                             /* Leave function */

Since adding after a multiplication is a frequent sequence of operations, we can replace the sequence

/* {s24,s25,s26,s27} <- {s16,s17,s18,s19} * {s0,s0,s0,s0} */
vmul.f32 s24, s16, s0

/* {s8,s9,s10,s11} <- {s8,s9,s10,s11} + {s24,s25,s26,s27} */
vadd.f32 s8, s8, s24

with the single instruction vmla (multiply and add).

/* {s8,s9,s10,s11}<-{s8,s9,s10,s11}+({s16,s17,s18,s19}*{s0,s0,s0,s0}) */
vmla.f32 s8, s16, s0

Now we can also unroll the i loop, again with an unrolling factor of 2. This would give us our best version.

/*
 * best_vectorial_matmul_4x4
 *
 * Computes C <- A * B for 4x4 matrices of single-precision floats stored
 * row-major. The loops run in k, i order with the i loop unrolled by 2,
 * and each row update is a single multiply-and-add on a VFPv2 short
 * vector of length 4:
 *
 *     C[i][0..3]   += A[i][k]   * B[k][0..3]
 *     C[i+1][0..3] += A[i+1][k] * B[k][0..3]
 *
 * In:    r0 = address of A
 *        r1 = address of B
 *        r2 = address of C
 * Out:   the 16 floats at C are overwritten with the product
 * Saved: r4-r8, lr, s16-s19 (pushed on entry, popped on exit)
 * Uses:  s0, s1, s8-s15, condition flags, FPSCR.LEN (set to 3 inside the
 *        function, the same two bits are cleared again before returning)
 *
 * NOTE(review): the LEN handling ORs in and later clears only bits 16-17
 * of FPSCR, so it assumes LEN is 0 on entry - confirm against callers.
 *
 * Labels carry a Best_ prefix so that this listing does not clash with
 * another listing that uses the same loop label names.
 */
best_vectorial_matmul_4x4:
    push {r4, r5, r6, r7, r8, lr}     /* Keep integer registers */
    /* Floating point registers starting from s16 must be preserved */
    vpush {s16-s19}

    /* First zero 16 single floating point */
    /* In IEEE 754, all bits cleared means 0 */
    mov r4, r2                        @ r4 <- address of C
    mov r5, #16                       @ r5 <- number of elements to clear
    mov r6, #0                        @ r6 <- bit pattern of 0.0
    b Best_loop_init_test
Best_loop_init:
    str r6, [r4], #4                  @ *r4 <- r6 then r4 <- r4 + 4
Best_loop_init_test:
    subs r5, r5, #1                   @ r5 <- r5 - 1 and update the flags
    bge Best_loop_init                @ keep storing while r5 >= 0

    /* Set the LEN field of FPSCR to be 4 (value 3) */
    mov r5, #0b011                    @ r5 <- 3
    mov r5, r5, LSL #16               @ r5 <- r5 << 16
    fmrx r4, fpscr                    @ r4 <- fpscr
    orr r4, r4, r5                    @ r4 <- r4 | r5
    fmxr fpscr, r4                    @ fpscr <- r4

    /* We will use
         r4 as k
         r5 as i
    */
    mov r4, #0                        @ r4 <- 0
Best_loop_k:                          /* loop header of k */
    cmp r4, #4                        /* if r4 == 4 goto end of k loop */
    beq Best_end_loop_k
    mov r5, #0                        @ r5 <- 0
Best_loop_i:                          /* loop header of i */
    cmp r5, #4                        /* if r5 == 4 goto end of i loop */
    beq Best_end_loop_i

    /* Compute the address of C[i][0] */
    /* Address of C[i][0] is C + 4*(4 * i) */
    add r7, r2, r5, LSL #4            @ r7 <- r2 + (r5 << 4) = c + 4*4*i
    /* Load {s8,s9,s10,s11,s12,s13,s14,s15}
         <- {c[i][0],   c[i][1],   c[i][2],   c[i][3],
             c[i+1][0], c[i+1][1], c[i+1][2], c[i+1][3]} */
    vldmia r7, {s8-s15}

    /* Compute the address of A[i][k] = A + 4*(4*i + k) */
    add r8, r4, r5, LSL #2            /* r8 <- r4 + (r5 << 2) = k + 4*i */
    add r8, r0, r8, LSL #2            /* r8 <- r0 + (r8 << 2) = a + 4*(k + 4*i) */
    vldr s0, [r8]                     /* Load s0 <- a[i][k] */
    vldr s1, [r8, #16]                /* Load s1 <- a[i+1][k] (one row = 16 bytes further) */

    /* Compute the address of B[k][0] */
    /* Address of B[k][0] is B + 4*(4*k) */
    add r8, r1, r4, LSL #4            /* r8 <- r1 + (r4 << 4) = b + 4*(4*k) */
    /* Load {s16,s17,s18,s19} <- {b[k][0],b[k][1],b[k][2],b[k][3]} */
    vldmia r8, {s16-s19}

    /* {s8,s9,s10,s11} <- {s8,s9,s10,s11}
                          + ({s16,s17,s18,s19} * {s0,s0,s0,s0}) */
    vmla.f32 s8, s16, s0
    /* {s12,s13,s14,s15} <- {s12,s13,s14,s15}
                            + ({s16,s17,s18,s19} * {s1,s1,s1,s1}) */
    vmla.f32 s12, s16, s1

    /* Store {c[i][0],   c[i][1],   c[i][2],   c[i][3],
              c[i+1][0], c[i+1][1], c[i+1][2], c[i+1][3]}
         <- {s8,s9,s10,s11,s12,s13,s14,s15} */
    vstmia r7, {s8-s15}

    add r5, r5, #2                    /* r5 <- r5 + 2; i.e., i = i + 2 */
    b Best_loop_i                     /* next iteration of loop i */
Best_end_loop_i:                      /* Here ends loop i */
    add r4, r4, #1                    /* r4 <- r4 + 1; i.e., k = k + 1 */
    b Best_loop_k                     /* next iteration of loop k */
Best_end_loop_k:                      /* Here ends loop k */

    /* Set the LEN field of FPSCR back to 1 (value 0) */
    mov r5, #0b011                    @ r5 <- 3
    mvn r5, r5, LSL #16               @ r5 <- ~(r5 << 16)
    fmrx r4, fpscr                    @ r4 <- fpscr
    and r4, r4, r5                    @ r4 <- r4 & r5
    fmxr fpscr, r4                    @ fpscr <- r4

    vpop {s16-s19}                    /* Restore preserved floating registers */
    pop {r4, r5, r6, r7, r8, lr}      /* Restore integer registers */
    bx lr                             /* Leave function */

In document Raspberry Pi Assembler - Free Computer, Programming, Mathematics, Technical Books, Lecture Notes and Tutorials (Page 185-190)