Extremely fast implementation of matrix multiplication.

Format
Plain text
Post date
2023-06-01 14:48
Publication Period
Unlimited
  1. ; YASM/NASM-compatible matrix implementation for amd64, using SSE2
  2. segment .text
  global MMath_GFX_CoreInit

  ; CPUID leaf 0 vendor string "AuthenticAMD", as returned in ebx, edx, ecx.
  MMATH_VENDOR_AMD_EBX    equ 0x68747541          ; "Auth"
  MMATH_VENDOR_AMD_EDX    equ 0x69746e65          ; "enti"
  MMATH_VENDOR_AMD_ECX    equ 0x444d4163          ; "cAMD"

  ; CPUID leaf 1 feature bits reported in ecx.
  MMATH_CPUID_BIT_SSE4_1  equ 19
  MMATH_CPUID_BIT_SSE4_2  equ 20
  MMATH_CPUID_BIT_AVX     equ 28
  MMATH_CPUID_BIT_F16C    equ 29

  ;-----------------------------------------------------------------------
  ; void MMath_GFX_CoreInit(void)
  ;
  ; Clears the mmath_gfx_* flag bytes, then fills them in from CPUID:
  ;   - is_amd      : vendor string is "AuthenticAMD"
  ;   - have_sse4_1 : leaf 1 ecx bit 19
  ;   - have_sse4_2 : leaf 1 ecx bit 20
  ;   - have_avx    : leaf 1 ecx bit 28
  ;   - have_f16c   : leaf 1 ecx bit 29, only recorded when AVX is reported
  ;   - use_sse4_1  : set only when is_amd and have_sse4_1 are both set
  ;
  ; In:    nothing
  ; Out:   nothing (flag bytes in .data are updated)
  ; Clobb: rax, rcx, rdx, flags. rbx is saved and restored around cpuid.
  ;
  ; NOTE(review): have_avx / have_f16c reflect only the CPUID feature bits;
  ; OS support for the AVX state (OSXSAVE + XGETBV) is not checked here.
  ;-----------------------------------------------------------------------
  MMath_GFX_CoreInit:
          ; Zero exactly the six flag bytes. The flag block is only six bytes
          ; long, so it is cleared with a dword + word store rather than one
          ; qword store, which would write two bytes past its end.
          xor     eax, eax
          mov     DWORD [rel mmath_gfx_flags], eax    ; have_sse4_1 .. have_f16c
          mov     WORD [rel mmath_gfx_is_amd], ax     ; is_amd, use_sse4_1

          push    rbx                     ; cpuid overwrites rbx (callee-saved)
          cpuid                           ; eax is still 0 -> vendor string

          ; Test for AMD.
          cmp     ebx, MMATH_VENDOR_AMD_EBX
          jne     .not_amd
          cmp     edx, MMATH_VENDOR_AMD_EDX
          jne     .not_amd
          cmp     ecx, MMATH_VENDOR_AMD_ECX
          sete    BYTE [rel mmath_gfx_is_amd]
  .not_amd:

          mov     eax, 1
          cpuid                           ; leaf 1 -> feature bits in ecx
          bt      ecx, MMATH_CPUID_BIT_SSE4_1
          setc    BYTE [rel mmath_gfx_have_sse4_1]
          bt      ecx, MMATH_CPUID_BIT_SSE4_2
          setc    BYTE [rel mmath_gfx_have_sse4_2]
          bt      ecx, MMATH_CPUID_BIT_AVX
          jnc     .no_avx
          mov     BYTE [rel mmath_gfx_have_avx], 1
          ; F16C is only recorded when AVX is reported.
          bt      ecx, MMATH_CPUID_BIT_F16C
          setc    BYTE [rel mmath_gfx_have_f16c]
  .no_avx:

          ; The SSE4.1 transform path is enabled only on AMD parts that
          ; report SSE4.1; everything else keeps use_sse4_1 == 0.
          test    BYTE [rel mmath_gfx_is_amd], 1
          jz      .no_use_sse4_1
          test    BYTE [rel mmath_gfx_have_sse4_1], 1
          setnz   BYTE [rel mmath_gfx_use_sse4_1]
  .no_use_sse4_1:

          pop     rbx
          ret
  global MMath_GFX_CoreInitSoft
  ;-----------------------------------------------------------------------
  ; void MMath_GFX_CoreInitSoft(void)
  ;
  ; Clears every mmath_gfx_* flag byte without probing the CPU. With
  ; mmath_gfx_use_sse4_1 left at 0 the matrix transform entry points take
  ; their SSE2 path.
  ;
  ; In:    nothing
  ; Out:   nothing (flag bytes in .data are zeroed)
  ; Clobb: rax, flags
  ;-----------------------------------------------------------------------
  MMath_GFX_CoreInitSoft:
          ; The flag block is six bytes long, so clear it with a dword + word
          ; store; a single qword store would write two bytes past its end.
          xor     eax, eax
          mov     DWORD [rel mmath_gfx_flags], eax    ; have_sse4_1 .. have_f16c
          mov     WORD [rel mmath_gfx_is_amd], ax     ; is_amd, use_sse4_1
          ret
  ;-----------------------------------------------------------------------
  ; void MMath_GFX_MatrixTransform(
  ;     const float *in,
  ;     const float *matrix,
  ;     float *out)
  ;
  ; Computes, for i = 0..3:
  ;     out[i] = in[0]*matrix[4i] + in[1]*matrix[4i+1]
  ;            + in[2]*matrix[4i+2] + in[3]*matrix[4i+3]
  ;
  ; Two entry points share the same bodies:
  ;   MMath_GFX_MatrixTransform_Win64  Microsoft x64: rcx=in, rdx=matrix, r8=out
  ;   MMath_GFX_MatrixTransform        SysV AMD64:    rdi=in, rsi=matrix, rdx=out
  ;
  ; Each entry starts the x87 dot product for out[0], loads the whole input
  ; vector into xmm0, and then runs one of two internal bodies selected by
  ; mmath_gfx_use_sse4_1. out[0] is always computed on the x87 stack,
  ; interleaved with the SSE work for out[1..3] (kept from the original as
  ; a scheduling experiment).
  ;
  ; Register contract of the internal bodies
  ; (mmath_gfx_matrix_transform_sse4_1 / mmath_gfx_matrix_transform_sse2):
  ;   rax  = in
  ;   rdx  = matrix
  ;   rcx  = out
  ;   xmm0 = in[0..3]
  ;   st0  = in[0] * matrix[0]
  ; rax is used for the vector pointer because it is volatile in both
  ; ABIs; rdi must not be used for it, since rdi is callee-saved under the
  ; Microsoft x64 convention.
  ;
  ; Alignment: `in` and `out` may be unaligned (movups / movss / x87 only).
  ; `matrix` must be 16-byte aligned: both bodies use matrix rows directly
  ; as 128-bit memory operands of legacy SSE instructions (dpps / mulps).
  ;
  ; Clobb: rax, rcx, rdx, xmm0, xmm1, xmm2, x87 stack (left empty), flags.
  ; Every in[i] is read before out[i] is written.
  ; TODO: Vector call?
  ;-----------------------------------------------------------------------
  global MMath_GFX_MatrixTransform_Win64
  MMath_GFX_MatrixTransform_Win64:
          emms                            ; I don't trust older compilers :/
          fld     DWORD [rcx]             ; st0 = in[0]
          mov     rax, rcx                ; rax = in (volatile; rdi is callee-saved here)
          fmul    DWORD [rdx]             ; st0 = in[0] * matrix[0]
          movups  xmm0, [rcx]             ; xmm0 = in[0..3]
          mov     rcx, r8                 ; rcx = out
          test    BYTE [rel mmath_gfx_use_sse4_1], 1
          jz      mmath_gfx_matrix_transform_sse2
          ; FALLTHROUGH

  ; SSE4.1 body. dpps doesn't assemble with older YASM/NASM, so each one is
  ; emitted as raw bytes: 66 0F 3A 40 /r disp8 imm8.
  ; Equivalent of one row, for reference:
  ;     movaps xmm1, xmm0
  ;     dpps   xmm1, [rdx + 16*row], 0xff
  ;     movss  [rcx + 4*row], xmm1
  ; TODO: We should be able to load some floats into st0 with movdq2q
  mmath_gfx_matrix_transform_sse4_1:
          fld     DWORD [rax + 4]         ; st0 = in[1]
          movaps  xmm1, xmm0
          fmul    DWORD [rdx + 4]         ; st0 = in[1] * matrix[1]
          ; dpps xmm1, [rdx + 16], 0xFF
          db      0x66,0x0F,0x3A,0x40,0x4A,0x10,0xFF
          faddp                           ; st0 = partial sum for out[0]
          movss   [rcx + 4], xmm1         ; out[1]

          fld     DWORD [rax + 8]         ; st0 = in[2]
          movaps  xmm1, xmm0
          fmul    DWORD [rdx + 8]         ; st0 = in[2] * matrix[2]
          ; dpps xmm1, [rdx + 32], 0xFF
          db      0x66,0x0F,0x3A,0x40,0x4A,0x20,0xFF
          faddp
          movss   [rcx + 8], xmm1         ; out[2]

          ; Two ops here, this pipelines best before a dpps
          ; (Don't tell me it doesn't, go ahead and change it before or after
          ; and see how it gets 3% slower when you run the perf tests)
          fld     DWORD [rax + 12]        ; st0 = in[3]
          fmul    DWORD [rdx + 12]        ; st0 = in[3] * matrix[3]
          ; dpps xmm0, [rdx + 48], 0xFF   (xmm0 is consumed; last row)
          db      0x66,0x0F,0x3A,0x40,0x42,0x30,0xFF
          faddp
          movss   [rcx + 12], xmm0        ; out[3]
          fstp    DWORD [rcx]             ; out[0]; x87 stack is empty again
          ret

  global MMath_GFX_MatrixTransform
  MMath_GFX_MatrixTransform:
          fld     DWORD [rdi]             ; st0 = in[0]
          movups  xmm0, [rdi]             ; xmm0 = in[0..3]
          mov     rcx, rdx                ; rcx = out
          fmul    DWORD [rsi]             ; st0 = in[0] * matrix[0]
          mov     rdx, rsi                ; rdx = matrix
          mov     rax, rdi                ; rax = in (shared-body contract)
          test    BYTE [rel mmath_gfx_use_sse4_1], 1
          jnz     mmath_gfx_matrix_transform_sse4_1
          ; FALLTHROUGH

  ; SSE2 body. Each row is a multiply followed by a horizontal add:
  ;     movups  xmm1, [row]
  ;     mulps   xmm1, xmm0
  ;     movhlps xmm2, xmm1            ; bring lanes 2,3 down to lanes 0,1
  ;     addps   xmm1, xmm2
  ;     pshufd  xmm2, xmm1, 0x1       ; _MM_SHUFFLE(0, 0, 0, 1)
  ;     addps   xmm1, xmm2            ; lane 0 = full dot product
  ; Only lane 0 of the result is stored; the other lanes are don't-care.
  mmath_gfx_matrix_transform_sse2:
          ; Dot product of vec and matrix[4:7]
          movups  xmm1, [rdx + 16]
          mulps   xmm1, xmm0
          fld     DWORD [rax + 4]         ; st0 = in[1]
          movhlps xmm2, xmm1
          fmul    DWORD [rdx + 4]         ; st0 = in[1] * matrix[1]
          addps   xmm1, xmm2
          faddp                           ; st0 = partial sum for out[0]
          pshufd  xmm2, xmm1, 0x1         ; _MM_SHUFFLE(0, 0, 0, 1)
          fld     DWORD [rax + 8]         ; st0 = in[2]
          addps   xmm1, xmm2
          fmul    DWORD [rdx + 8]         ; st0 = in[2] * matrix[2]
          movss   [rcx + 4], xmm1         ; out[1]
          faddp

          ; Dot product of vec and matrix[8:11]
          movups  xmm1, [rdx + 32]
          fld     DWORD [rax + 12]        ; st0 = in[3]
          mulps   xmm1, xmm0
          fmul    DWORD [rdx + 12]        ; st0 = in[3] * matrix[3]
          movhlps xmm2, xmm1
          faddp
          addps   xmm1, xmm2
          fstp    DWORD [rcx]             ; out[0]; x87 stack is empty again
          pshufd  xmm2, xmm1, 0x1         ; _MM_SHUFFLE(0, 0, 0, 1)
          addps   xmm1, xmm2
          movss   [rcx + 8], xmm1         ; out[2]

          ; Dot product of vec and matrix[12:15]
          ; Destructively update xmm0 to avoid the need for more loading.
          mulps   xmm0, [rdx + 48]        ; memory operand: needs 16-byte alignment
          movhlps xmm2, xmm0
          addps   xmm0, xmm2
          pshufd  xmm2, xmm0, 0x1         ; _MM_SHUFFLE(0, 0, 0, 1)
          addps   xmm0, xmm2
          movss   [rcx + 12], xmm0        ; out[3]
          ret
  segment .data
  ; Two zeroed dwords; not referenced by any code visible in this file.
  mmath_matrix_transform: dd 0,0

  ; CPU feature flag block. One byte per flag, 0 = clear, 1 = set.
  ; mmath_gfx_flags and _mmath_gfx_flags are exported aliases for the start
  ; of the block (presumably the underscore form is for toolchains that
  ; prefix C symbols with '_' - confirm against the callers).
  global mmath_gfx_flags
  global _mmath_gfx_flags
  mmath_gfx_flags:                        ; FALLTHROUGH
  _mmath_gfx_flags:                       ; FALLTHROUGH
  mmath_gfx_have_sse4_1:  db 0
  mmath_gfx_have_sse4_2:  db 0
  mmath_gfx_have_avx:     db 0
  mmath_gfx_have_f16c:    db 0
  mmath_gfx_is_amd:       db 0
  mmath_gfx_use_sse4_1:   db 0
  ; Padding: keeps the flag block a full 8 bytes so that a 64-bit access
  ; through mmath_gfx_flags stays inside this block instead of touching
  ; whatever the linker places after it.
                          db 0, 0
다운로드 Printable view

URL of this paste

Embed with JavaScript

Embed with iframe

Raw text