.global _rsqrt_inaccurate
.global rsqrt_inaccurate
.global _rsqrt_full
.global rsqrt_full
.global _rsqrt_full_gpr
.global rsqrt_full_gpr
.global _rsqrt_full_nb
.global rsqrt_full_nb
.global _rsqrt_full_nb2
.global rsqrt_full_nb2
.global _rsqrt_full_nb_gpr
.global rsqrt_full_nb_gpr
.global _rsqrt_newton
.global rsqrt_newton
.global _rsqrt_hack
.global rsqrt_hack
.global _rsqrt_fallback

.text
.intel_syntax noprefix

.align 16
min_pos_denorm:
.long 0x00800000,0,0,0
penultimate_bit:
.long 0x00008000,0,0,0
ultimate_bit:
.long 0x00004000,0,0,0
top_mask:
.long 0xFFFF8000,0,0,0
one:
.long 0x3f800000,0,0,0
half:
.long 0x3f000000,0,0,0
one_point_five:
.long 0x3fc00000,0,0,0
magic1:
.long 0x60000000,0,0,0
magic2:
.long 0x3c000000,0,0,0
magic3:
.long 0x000047ff,0,0,0

_rsqrt_inaccurate:
rsqrt_inaccurate:
    movd xmm0, edi

    rsqrtss xmm0, xmm0

    movd eax, xmm0
    ret

_rsqrt_full:
rsqrt_full:
    movd xmm0, edi

    pand xmm0, [rip + top_mask]
    por xmm0, [rip + penultimate_bit]

    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest xmm1, xmm1
    jnz rsqrt_full_bad

    sqrtss xmm0, xmm0

    movd xmm1, [rip + one]
    divss xmm1, xmm0

    paddd xmm1, [rip + ultimate_bit]
    pand xmm1, [rip + top_mask]

    movd eax, xmm1
    ret

_rsqrt_full_gpr:
rsqrt_full_gpr:
    movd eax, xmm0 # Emulate regalloc mov

    mov eax, edi
    and eax, 0xFFFF8000
    or eax, 0x00008000

    movd xmm0, eax
    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest xmm1, xmm1
    jnz rsqrt_full_bad

    sqrtss xmm0, xmm0

    movd xmm1, [rip + one]
    divss xmm1, xmm0
    movd eax, xmm1

    add eax, 0x00004000
    and eax, 0xffff8000

    movd xmm0, eax # Emulate regalloc mov
    ret

_rsqrt_full_nb2:
rsqrt_full_nb2:
    movd xmm0, edi

    pand xmm0, [rip + top_mask]
    por xmm0, [rip + penultimate_bit]

    ucomiss xmm0, [rip + min_pos_denorm]
    jna rsqrt_full_bad_new1

    sqrtss xmm0, xmm0

    movd xmm1, [rip + one]
    divss xmm1, xmm0

    paddd xmm1, [rip + ultimate_bit]
    pand xmm1, [rip + top_mask]

    movd eax, xmm1
    ret

_rsqrt_full_nb:
rsqrt_full_nb:
    movd xmm0, edi

    pand xmm0, [rip + top_mask]
    por xmm0, [rip + penultimate_bit]

    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest xmm1, xmm1
    jnz rsqrt_full_bad_new1

    sqrtss xmm0, xmm0

    movd xmm1, [rip + one]
    divss xmm1, xmm0

    paddd xmm1, [rip + ultimate_bit]
    pand xmm1, [rip + top_mask]

    movd eax, xmm1
    ret

rsqrt_full_bad_new1:
    cmp edi, 0x00800000
    jb rsqrt_full_bad_new_fallback1

    movd xmm0, edi
    rsqrtss xmm1, xmm0

    ucomiss xmm1, xmm1
    jp rsqrt_full_bad_new1_nan

    movd eax, xmm1
    ret

rsqrt_full_bad_new_fallback1:
    call _rsqrt_fallback
    ret

rsqrt_full_bad_new1_nan:
    ucomiss xmm0, xmm0
    jp rsqrt_full_bad_new1_nan_ret

    mov eax, 0x7FC00000
    ret

rsqrt_full_bad_new1_nan_ret:
    ret

_rsqrt_full_nb_gpr:
rsqrt_full_nb_gpr:
    movd eax, xmm0 # Emulate regalloc mov

    mov eax, edi
    and eax, 0xFFFF8000
    or eax, 0x00008000

    movd xmm0, eax
    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest xmm1, xmm1
    jnz rsqrt_full_bad_new2

    sqrtss xmm0, xmm0

    movd xmm1, [rip + one]
    divss xmm1, xmm0
    movd eax, xmm1

    add eax, 0x00004000
    and eax, 0xffff8000

    movd xmm0, eax # Emulate regalloc mov
    ret

rsqrt_full_bad_new2:
    cmp edi, 0x00800000
    jb rsqrt_full_bad_new_fallback2

    movd xmm0, edi
    rsqrtss xmm1, xmm0

    test edi, edi
    js rsqrt_full_bad_new2_nan

    movd eax, xmm1
    ret

rsqrt_full_bad_new_fallback2:
    call _rsqrt_fallback
    ret

rsqrt_full_bad_new2_nan:
    mov eax, 0x7FC00000
    ret

rsqrt_full_bad:
    xorps xmm1, xmm1
    movd xmm0, edi
    ucomiss xmm0, xmm1
    jp rsqrt_full_nan
    je rsqrt_full_zero
    jc rsqrt_full_neg

    cmp edi, 0x7F800000
    je rsqrt_full_inf

    # TODO: Full Denormal Implementation
    call _rsqrt_fallback
    ret

rsqrt_full_neg:
    mov eax, 0x7FC00000
    ret

rsqrt_full_inf:
    xor eax, eax
    ret

rsqrt_full_nan:
    mov eax, edi
    or eax, 0x00400000
    ret

rsqrt_full_zero:
    mov eax, edi
    or eax, 0x7F800000
    ret

_rsqrt_newton:
rsqrt_newton:
    movd xmm0, edi

    pand xmm0, [rip + top_mask]
    por xmm0, [rip + penultimate_bit]

    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest xmm1, xmm1
    jnz rsqrt_full_bad

    rsqrtps xmm1, xmm0
    mulss xmm0, [rip + half]
    vmulss xmm2, xmm1, xmm1
    mulss xmm2, xmm0
    movaps xmm0, [rip + one_point_five]
    subss xmm0, xmm2
    mulss xmm0, xmm1

    paddd xmm0, [rip + ultimate_bit]
    pand xmm0, [rip + top_mask]

    movd eax, xmm0
    ret

_rsqrt_hack:
rsqrt_hack:
    movd xmm9, edi

    vpand xmm0, xmm9, [rip + top_mask]
    por xmm0, [rip + penultimate_bit]

    # detect NaNs, negatives, zeros, denormals and infinities
    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest xmm1, xmm1
    jnz rsqrt_full_bad

    # calculate x64 estimate
    rsqrtps xmm0, xmm0

    # calculate correction factor
    vpslld xmm1, xmm9, 8
    vpsrad xmm2, xmm1, 31
    paddd xmm1, [rip + magic1]
    pcmpgtd xmm1, [rip + magic2]
    pxor xmm1, xmm2
    movaps xmm2, [rip + magic3]
    psubd xmm2, xmm1

    # correct x64 estimate
    paddd xmm0, xmm2
    pand xmm0, [rip + top_mask]

    movd eax, xmm0
    ret