/*
     * Signed 64-bit integer multiply.
     *
     * Consider WXxYZ (r1r0 x r3r2) with a long multiply:
     *        WX
     *      x YZ
     *  --------
     *     ZW ZX
     *  YW YX
     *
     * The low word of the result holds ZX, the high word holds
     * (ZW+YX) + (the high overflow from ZX).  YW doesn't matter because
     * it doesn't fit in the low 64 bits.
     *
     * Unlike most ARM math operations, multiply instructions have
     * restrictions on using the same register more than once (Rd and Rm
     * cannot be the same).
     */
    /* mul-long vAA, vBB, vCC */
    FETCH r0, 1                         @ r0<- CCBB
    and     r2, r0, #255                @ r2<- BB
    mov     r3, r0, lsr #8              @ r3<- CC
    VREG_INDEX_TO_ADDR r2, r2           @ r2<- &fp[BB]
    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[CC]
    ldmia   r2, {r0-r1}                 @ r0/r1<- vBB/vBB+1
    ldmia   r3, {r2-r3}                 @ r2/r3<- vCC/vCC+1
    mul     ip, r2, r1                  @ ip<- ZxW
    umull   r1, lr, r2, r0              @ r1/lr <- ZxX
    mla     r2, r0, r3, ip              @ r2<- YxX + (ZxW)
    mov     r0, rINST, lsr #8           @ r0<- AA
    add     r2, r2, lr                  @ r2<- lr + low(ZxW + (YxX))
    VREG_INDEX_TO_ADDR r0, r0           @ r0<- &fp[AA]
    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
    GET_INST_OPCODE ip                  @ extract opcode from rINST
    stmia   r0, {r1-r2 }                @ vAA/vAA+1<- r1/r2
    GOTO_OPCODE ip                      @ jump to next instruction