/* 4x4 float matrix multiplication implemented with inline COP2 vector assembly. */
void LMP3D_MatrixMultiply(float* dest,float* src1,float* src2)
{
asm __volatile__(
"lqc2 vf16,0x00(%1)\n"
"lqc2 vf20,0x00(%2)\n"
"lqc2 vf21,0x10(%2)\n"
"lqc2 vf22,0x20(%2)\n"
"lqc2 vf23,0x30(%2)\n"
"lqc2 vf17,0x10(%1)\n"
"lqc2 vf18,0x20(%1)\n"
"lqc2 vf19,0x30(%1)\n"
"vmulax ACC,vf20,vf16\n" //ACC = VF20 * VF16.x
"vmadday ACC,vf21,vf16\n" //ACC = ACC + VF21 * VF16.y
"vmaddaz ACC,vf22,vf16\n" //ACC = ACC + VF22 * VF16.z
"vmaddw vf16,vf23,vf16\n" //VF16 = ACC + VF21 * VF16.w
"vmulax ACC,vf20,vf17\n"
"vmadday ACC,vf21,vf17\n"
"vmaddaz ACC,vf22,vf17\n"
"vmaddw vf17,vf23,vf17\n"
"vmulax ACC,vf20,vf18\n"
"vmadday ACC,vf21,vf18\n"
"vmaddaz ACC,vf22,vf18\n"
"vmaddw vf18,vf23,vf18\n"
"vmulax ACC,vf20,vf19\n"
"vmadday ACC,vf21,vf19\n"
"vmaddaz ACC,vf22,vf19\n"
"vmaddw vf19,vf23,vf19\n"
"sqc2 vf16,0x00(%0)\n"
"sqc2 vf17,0x10(%0)\n"
"sqc2 vf18,0x20(%0)\n"
"sqc2 vf19,0x30(%0)\n"
: : "r"(dest), "r"(src1), "r"(src2) : "memory");
} |