304 lines
5.2 KiB
ArmAsm
304 lines
5.2 KiB
ArmAsm
|
# ----------------------------------------------------------------------
# Symbol exports.
#
# Each routine defined in this file is published under two spellings:
# one with a leading underscore and one without (presumably so the same
# object links on both Mach-O and ELF toolchains -- confirm with the
# build setup).
# ----------------------------------------------------------------------
        .globl  _rsqrt_inaccurate
        .globl  rsqrt_inaccurate

        .globl  _rsqrt_full
        .globl  rsqrt_full

        .globl  _rsqrt_full_gpr
        .globl  rsqrt_full_gpr

        .globl  _rsqrt_full_nb
        .globl  rsqrt_full_nb

        .globl  _rsqrt_full_nb2
        .globl  rsqrt_full_nb2

        .globl  _rsqrt_full_nb_gpr
        .globl  rsqrt_full_nb_gpr

        .globl  _rsqrt_newton
        .globl  rsqrt_newton

        .globl  _rsqrt_hack
        .globl  rsqrt_hack

# Target of the slow-path branches; no definition is visible in this file.
        .globl  _rsqrt_fallback

# Everything below is emitted into the text section and written in
# Intel operand order without register prefixes.
        .text
        .intel_syntax noprefix
|
||
|
|
||
|
# ----------------------------------------------------------------------
# Constant pool.
#
# Every entry is four 32-bit words (16 bytes) with the payload in the low
# word and zeros above it, so each one can be used directly as a 128-bit
# memory operand of legacy-SSE instructions (pand/por/paddd/movaps),
# which fault on operands that are not 16-byte aligned.
#
# .p2align 4 requests 2^4 = 16-byte alignment on every target.  The
# previous ".align 16" is interpreted as 16 bytes on x86 ELF but as a
# power-of-two exponent (2^16) by Mach-O assemblers.
# ----------------------------------------------------------------------
        .p2align 4

# Threshold for the "needs special handling" compare.  NOTE: despite the
# label name, 0x00800000 is the bit pattern of the smallest positive
# *normal* single-precision value.
min_pos_denorm:
        .long   0x00800000, 0, 0, 0

# Bit 15: OR-ed into the input after it has been masked with top_mask.
penultimate_bit:
        .long   0x00008000, 0, 0, 0

# Bit 14: added to a result just before masking with top_mask, i.e. half
# of the lowest retained bit (round-to-nearest on the bit pattern).
ultimate_bit:
        .long   0x00004000, 0, 0, 0

# Keeps bits 31..15 and clears bits 14..0.
top_mask:
        .long   0xFFFF8000, 0, 0, 0

# 1.0f
one:
        .long   0x3f800000, 0, 0, 0

# 0.5f
half:
        .long   0x3f000000, 0, 0, 0

# 1.5f
one_point_five:
        .long   0x3fc00000, 0, 0, 0

# Integer constants used only by rsqrt_hack's correction-term computation.
magic1:
        .long   0x60000000, 0, 0, 0

magic2:
        .long   0x3c000000, 0, 0, 0

magic3:
        .long   0x000047ff, 0, 0, 0
|
||
|
|
||
|
# ----------------------------------------------------------------------
# rsqrt_inaccurate / _rsqrt_inaccurate
#
# In:    edi  = single-precision value as raw bits
# Out:   eax  = raw bits of the hardware RSQRTSS approximation
# Clobb: xmm0
#
# No masking, rounding or special-case handling is applied.
# ----------------------------------------------------------------------
_rsqrt_inaccurate:
rsqrt_inaccurate:
        movd    xmm0, edi               # bits -> low lane of xmm0
        rsqrtss xmm0, xmm0              # approximate 1/sqrt(x)
        movd    eax, xmm0               # low lane -> return register
        ret
|
||
|
|
||
|
# ----------------------------------------------------------------------
# rsqrt_full / _rsqrt_full
#
# In:    edi  = single-precision value as raw bits
# Out:   eax  = reciprocal square-root estimate, raw bits, low 15 bits zero
# Clobb: xmm0, xmm1, flags
#
# Fast path: quantise the input, compute 1.0 / sqrt(x) with full-precision
# SSE instructions, then round the result's bit pattern to a multiple of
# 0x8000.  Anything the compare flags as special is handed to
# rsqrt_full_bad with edi still holding the original bits.
# ----------------------------------------------------------------------
_rsqrt_full:
rsqrt_full:
        movd    xmm0, edi

        # Quantise: keep bits 31..15, then force bit 15 on.
        pand    xmm0, xmmword ptr [rip + top_mask]
        por     xmm0, xmmword ptr [rip + penultimate_bit]

        # Mask is all-ones when the quantised input is not greater than the
        # threshold, or when the compare is unordered (NaN).
        vcmpngt_uqss xmm1, xmm0, dword ptr [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad          # any mask bit set -> slow path

        sqrtss  xmm0, xmm0              # xmm0 = sqrt(x)

        movd    xmm1, dword ptr [rip + one]
        divss   xmm1, xmm0              # xmm1 = 1.0 / sqrt(x)

        # Round to nearest on the bit pattern: add half of the lowest kept
        # bit, then drop bits 14..0.
        paddd   xmm1, xmmword ptr [rip + ultimate_bit]
        pand    xmm1, xmmword ptr [rip + top_mask]

        movd    eax, xmm1
        ret
|
||
|
|
||
|
# ----------------------------------------------------------------------
# rsqrt_full_gpr / _rsqrt_full_gpr
#
# In:    edi  = single-precision value as raw bits
# Out:   eax  = reciprocal square-root estimate, raw bits, low 15 bits zero
# Clobb: xmm0, xmm1, flags
#
# Same computation as rsqrt_full, but the input quantisation and the
# final rounding are carried out in a general-purpose register instead
# of with SSE integer instructions.  The extra movd at entry and exit
# only stand in for register-allocator moves; their results are not
# otherwise consumed here.
# ----------------------------------------------------------------------
_rsqrt_full_gpr:
rsqrt_full_gpr:
        movd    eax, xmm0               # Emulate regalloc mov

        # Quantise in eax: keep bits 31..15, then force bit 15 on.
        mov     eax, edi
        and     eax, 0xFFFF8000
        or      eax, 0x00008000

        movd    xmm0, eax
        # Mask is all-ones when the quantised input is not greater than the
        # threshold, or when the compare is unordered (NaN).
        vcmpngt_uqss xmm1, xmm0, dword ptr [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad          # any mask bit set -> slow path

        sqrtss  xmm0, xmm0              # xmm0 = sqrt(x)

        movd    xmm1, dword ptr [rip + one]
        divss   xmm1, xmm0              # xmm1 = 1.0 / sqrt(x)
        movd    eax, xmm1

        # Round to nearest on the bit pattern, then drop bits 14..0.
        add     eax, 0x00004000
        and     eax, 0xFFFF8000

        movd    xmm0, eax               # Emulate regalloc mov
        ret
|
||
|
|
||
|
# ----------------------------------------------------------------------
# rsqrt_full_nb2 / _rsqrt_full_nb2
#
# In:    edi  = single-precision value as raw bits
# Out:   eax  = reciprocal square-root estimate, raw bits, low 15 bits zero
# Clobb: xmm0, xmm1, flags
#
# Variant of rsqrt_full that classifies the input with UCOMISS + JNA
# rather than a compare mask + PTEST, and that uses
# rsqrt_full_bad_new1 as its slow path.
# ----------------------------------------------------------------------
_rsqrt_full_nb2:
rsqrt_full_nb2:
        movd    xmm0, edi

        # Quantise: keep bits 31..15, then force bit 15 on.
        pand    xmm0, xmmword ptr [rip + top_mask]
        por     xmm0, xmmword ptr [rip + penultimate_bit]

        # JNA is taken when CF or ZF is set: input below or equal to the
        # threshold, or an unordered compare (which sets both flags).
        ucomiss xmm0, dword ptr [rip + min_pos_denorm]
        jna     rsqrt_full_bad_new1

        sqrtss  xmm0, xmm0              # xmm0 = sqrt(x)

        movd    xmm1, dword ptr [rip + one]
        divss   xmm1, xmm0              # xmm1 = 1.0 / sqrt(x)

        # Round to nearest on the bit pattern, then drop bits 14..0.
        paddd   xmm1, xmmword ptr [rip + ultimate_bit]
        pand    xmm1, xmmword ptr [rip + top_mask]

        movd    eax, xmm1
        ret
|
||
|
|
||
|
# ----------------------------------------------------------------------
# rsqrt_full_nb / _rsqrt_full_nb
#
# In:    edi  = single-precision value as raw bits
# Out:   eax  = reciprocal square-root estimate, raw bits, low 15 bits zero
# Clobb: xmm0, xmm1, flags
#
# Identical fast path to rsqrt_full; the only difference is that special
# inputs are dispatched to rsqrt_full_bad_new1 instead of rsqrt_full_bad.
# ----------------------------------------------------------------------
_rsqrt_full_nb:
rsqrt_full_nb:
        movd    xmm0, edi

        # Quantise: keep bits 31..15, then force bit 15 on.
        pand    xmm0, xmmword ptr [rip + top_mask]
        por     xmm0, xmmword ptr [rip + penultimate_bit]

        # Mask is all-ones when the quantised input is not greater than the
        # threshold, or when the compare is unordered (NaN).
        vcmpngt_uqss xmm1, xmm0, dword ptr [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad_new1     # any mask bit set -> slow path

        sqrtss  xmm0, xmm0              # xmm0 = sqrt(x)

        movd    xmm1, dword ptr [rip + one]
        divss   xmm1, xmm0              # xmm1 = 1.0 / sqrt(x)

        # Round to nearest on the bit pattern, then drop bits 14..0.
        paddd   xmm1, xmmword ptr [rip + ultimate_bit]
        pand    xmm1, xmmword ptr [rip + top_mask]

        movd    eax, xmm1
        ret
|
||
|
|
||
|
# ----------------------------------------------------------------------
# rsqrt_full_bad_new1 -- slow path for rsqrt_full_nb and rsqrt_full_nb2.
#
# In:    edi  = original (unquantised) input bits
# Out:   eax  = result bits
# Clobb: xmm0, xmm1, flags (plus whatever _rsqrt_fallback clobbers)
#
# Strategy: inputs whose bit pattern is below 0x00800000 (unsigned) go to
# the external fallback; for everything else RSQRTSS is run on the
# original input and its result is returned unless it is a NaN.
# ----------------------------------------------------------------------
rsqrt_full_bad_new1:
        cmp     edi, 0x00800000
        jb      rsqrt_full_bad_new_fallback1    # unsigned: bits < 0x00800000

        movd    xmm0, edi
        rsqrtss xmm1, xmm0

        ucomiss xmm1, xmm1              # PF=1 iff the RSQRTSS result is NaN
        jp      rsqrt_full_bad_new1_nan

        movd    eax, xmm1
        ret

rsqrt_full_bad_new_fallback1:
        # Tail call: edi has not been modified and the callee's eax is our
        # return value.  Jumping (rather than call + ret) also keeps the
        # stack pointer exactly as our caller left it, so the callee sees
        # the alignment the calling convention requires at function entry.
        jmp     _rsqrt_fallback

rsqrt_full_bad_new1_nan:
        ucomiss xmm0, xmm0              # PF=1 iff the *input* is NaN
        jp      rsqrt_full_bad_new1_nan_ret

        # Input was not a NaN but the estimate is: return the default NaN.
        mov     eax, 0x7FC00000
        ret

rsqrt_full_bad_new1_nan_ret:
        # Input NaN: return it with the quiet bit set (same result as
        # rsqrt_full_nan).  Previously this label returned without
        # writing eax at all.
        mov     eax, edi
        or      eax, 0x00400000
        ret
|
||
|
|
||
|
# ----------------------------------------------------------------------
# rsqrt_full_nb_gpr / _rsqrt_full_nb_gpr
#
# In:    edi  = single-precision value as raw bits
# Out:   eax  = reciprocal square-root estimate, raw bits, low 15 bits zero
# Clobb: xmm0, xmm1, flags
#
# Same fast path as rsqrt_full_gpr (quantise and round in a general-
# purpose register), with rsqrt_full_bad_new2 as the slow path.  The movd
# at entry and exit only stand in for register-allocator moves.
# ----------------------------------------------------------------------
_rsqrt_full_nb_gpr:
rsqrt_full_nb_gpr:
        movd    eax, xmm0               # Emulate regalloc mov

        # Quantise in eax: keep bits 31..15, then force bit 15 on.
        mov     eax, edi
        and     eax, 0xFFFF8000
        or      eax, 0x00008000

        movd    xmm0, eax
        # Mask is all-ones when the quantised input is not greater than the
        # threshold, or when the compare is unordered (NaN).
        vcmpngt_uqss xmm1, xmm0, dword ptr [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad_new2     # any mask bit set -> slow path

        sqrtss  xmm0, xmm0              # xmm0 = sqrt(x)

        movd    xmm1, dword ptr [rip + one]
        divss   xmm1, xmm0              # xmm1 = 1.0 / sqrt(x)
        movd    eax, xmm1

        # Round to nearest on the bit pattern, then drop bits 14..0.
        add     eax, 0x00004000
        and     eax, 0xFFFF8000

        movd    xmm0, eax               # Emulate regalloc mov
        ret
|
||
|
|
||
|
# ----------------------------------------------------------------------
# rsqrt_full_bad_new2 -- slow path for rsqrt_full_nb_gpr.
#
# In:    edi  = original (unquantised) input bits
# Out:   eax  = result bits
# Clobb: xmm0, xmm1, flags (plus whatever _rsqrt_fallback clobbers)
#
# Strategy: inputs whose bit pattern is below 0x00800000 (unsigned) go to
# the external fallback; inputs with the sign bit set yield the default
# NaN; everything else returns the RSQRTSS result for the original input.
#
# NOTE(review): the sign-bit test also catches -0.0 and NaNs whose sign
# bit is set, so those return 0x7FC00000 here, whereas rsqrt_full_bad
# returns -inf for -0.0 and the quieted input for any NaN.  Confirm
# whether that difference is intended.
# ----------------------------------------------------------------------
rsqrt_full_bad_new2:
        cmp     edi, 0x00800000
        jb      rsqrt_full_bad_new_fallback2    # unsigned: bits < 0x00800000

        movd    xmm0, edi
        rsqrtss xmm1, xmm0

        test    edi, edi
        js      rsqrt_full_bad_new2_nan # sign bit set

        movd    eax, xmm1
        ret

rsqrt_full_bad_new_fallback2:
        # Tail call: edi has not been modified and the callee's eax is our
        # return value.  Jumping (rather than call + ret) also keeps the
        # stack pointer exactly as our caller left it, so the callee sees
        # the alignment the calling convention requires at function entry.
        jmp     _rsqrt_fallback

rsqrt_full_bad_new2_nan:
        mov     eax, 0x7FC00000         # default quiet NaN
        ret
|
||
|
|
||
|
# ----------------------------------------------------------------------
# rsqrt_full_bad -- slow path shared by rsqrt_full, rsqrt_full_gpr,
# rsqrt_newton and rsqrt_hack.
#
# In:    edi  = original (unquantised) input bits
# Out:   eax  = result bits
# Clobb: xmm0, xmm1, flags (plus whatever _rsqrt_fallback clobbers)
#
# Classifies the original input against 0.0 and dispatches:
#   NaN        -> input with the quiet bit (0x00400000) set
#   +/-0       -> input bits OR 0x7F800000 (infinity of the same sign)
#   below zero -> 0x7FC00000 (default quiet NaN)
#   +infinity  -> 0
#   otherwise  -> _rsqrt_fallback
# ----------------------------------------------------------------------
rsqrt_full_bad:
        xorps   xmm1, xmm1              # xmm1 = +0.0
        movd    xmm0, edi
        ucomiss xmm0, xmm1
        jp      rsqrt_full_nan          # unordered: input is NaN
        je      rsqrt_full_zero         # equal to zero (either sign)
        jc      rsqrt_full_neg          # below zero

        cmp     edi, 0x7F800000
        je      rsqrt_full_inf

        # TODO: Full Denormal Implementation
        # Tail call: edi has not been modified and the callee's eax is our
        # return value.  Jumping (rather than call + ret) also keeps the
        # stack pointer exactly as our caller left it, so the callee sees
        # the alignment the calling convention requires at function entry.
        jmp     _rsqrt_fallback

rsqrt_full_neg:
        mov     eax, 0x7FC00000         # default quiet NaN
        ret

rsqrt_full_inf:
        xor     eax, eax                # +0.0
        ret

rsqrt_full_nan:
        mov     eax, edi
        or      eax, 0x00400000         # set the quiet bit, keep the payload
        ret

rsqrt_full_zero:
        mov     eax, edi
        or      eax, 0x7F800000         # all exponent bits set, sign preserved
        ret
|
||
|
|
||
|
# ----------------------------------------------------------------------
# rsqrt_newton / _rsqrt_newton
#
# In:    edi  = single-precision value as raw bits
# Out:   eax  = reciprocal square-root estimate, raw bits, low 15 bits zero
# Clobb: xmm0, xmm1, xmm2, flags
#
# Takes the hardware reciprocal-square-root approximation y of the
# quantised input x and refines it with one step of
#       y' = y * (1.5 - (0.5 * x) * y * y)
# before rounding the bit pattern to a multiple of 0x8000.
# ----------------------------------------------------------------------
_rsqrt_newton:
rsqrt_newton:
        movd    xmm0, edi

        # Quantise: keep bits 31..15, then force bit 15 on.
        pand    xmm0, xmmword ptr [rip + top_mask]
        por     xmm0, xmmword ptr [rip + penultimate_bit]

        # Mask is all-ones when the quantised input is not greater than the
        # threshold, or when the compare is unordered (NaN).
        vcmpngt_uqss xmm1, xmm0, dword ptr [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad          # any mask bit set -> slow path

        rsqrtps xmm1, xmm0                              # xmm1 = y (approximation)
        mulss   xmm0, dword ptr [rip + half]            # xmm0 = 0.5 * x
        vmulss  xmm2, xmm1, xmm1                        # xmm2 = y * y
        mulss   xmm2, xmm0                              # xmm2 = 0.5 * x * y * y
        movaps  xmm0, xmmword ptr [rip + one_point_five] # xmm0 = 1.5
        subss   xmm0, xmm2                              # xmm0 = 1.5 - 0.5*x*y*y
        mulss   xmm0, xmm1                              # xmm0 = y'

        # Round to nearest on the bit pattern, then drop bits 14..0.
        paddd   xmm0, xmmword ptr [rip + ultimate_bit]
        pand    xmm0, xmmword ptr [rip + top_mask]

        movd    eax, xmm0
        ret
|
||
|
|
||
|
# ----------------------------------------------------------------------
# rsqrt_hack / _rsqrt_hack
#
# In:    edi  = single-precision value as raw bits
# Out:   eax  = reciprocal square-root estimate, raw bits, low 15 bits zero
# Clobb: xmm0, xmm1, xmm2, xmm9, flags
#
# Starts from the hardware approximation of the quantised input and adds
# an integer correction term derived from the original input bits, then
# clears the low 15 bits.  The correction is either 0x47FF or 0x4800
# (magic3 minus a 0 / -1 lane mask).
# ----------------------------------------------------------------------
_rsqrt_hack:
rsqrt_hack:
        movd    xmm9, edi               # xmm9 keeps the untouched input bits

        # Quantise into xmm0: keep bits 31..15, then force bit 15 on.
        vpand   xmm0, xmm9, xmmword ptr [rip + top_mask]
        por     xmm0, xmmword ptr [rip + penultimate_bit]

        # detect NaNs, negatives, zeros, denormals and infinities
        vcmpngt_uqss xmm1, xmm0, dword ptr [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad

        # hardware estimate of 1/sqrt(quantised input)
        rsqrtps xmm0, xmm0

        # Build the correction term from the original bits:
        #   xmm1 = input << 8
        #   xmm2 = xmm1 >> 31 (arithmetic)   -> 0 or -1
        #   xmm1 = ((xmm1 + magic1) > magic2) as a signed compare -> 0 or -1
        #   xmm1 ^= xmm2
        #   xmm2 = magic3 - xmm1             -> 0x47FF or 0x4800
        vpslld  xmm1, xmm9, 8
        vpsrad  xmm2, xmm1, 31
        paddd   xmm1, xmmword ptr [rip + magic1]
        pcmpgtd xmm1, xmmword ptr [rip + magic2]
        pxor    xmm1, xmm2
        movaps  xmm2, xmmword ptr [rip + magic3]
        psubd   xmm2, xmm1

        # apply the correction to the estimate's bit pattern and truncate
        paddd   xmm0, xmm2
        pand    xmm0, xmmword ptr [rip + top_mask]

        movd    eax, xmm0
        ret
|