another try
This commit is contained in:
@@ -1,195 +1,195 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
|
||||
// SPDX-FileCopyrightText: Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include <array>
|
||||
#include <cstring>
|
||||
#include <iterator>
|
||||
#include <string_view>
|
||||
#include "common/bit_util.h"
|
||||
#include "common/common_types.h"
|
||||
#include "common/x64/cpu_detect.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#else
|
||||
|
||||
#if defined(__DragonFly__) || defined(__FreeBSD__)
|
||||
// clang-format off
|
||||
#include <sys/types.h>
|
||||
#include <machine/cpufunc.h>
|
||||
// clang-format on
|
||||
#endif
|
||||
|
||||
static inline void __cpuidex(int info[4], u32 function_id, u32 subfunction_id) {
|
||||
#if defined(__DragonFly__) || defined(__FreeBSD__)
|
||||
// Despite the name, this is just do_cpuid() with ECX as second input.
|
||||
cpuid_count((u_int)function_id, (u_int)subfunction_id, (u_int*)info);
|
||||
#else
|
||||
info[0] = function_id; // eax
|
||||
info[2] = subfunction_id; // ecx
|
||||
__asm__("cpuid"
|
||||
: "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
|
||||
: "a"(function_id), "c"(subfunction_id));
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void __cpuid(int info[4], u32 function_id) {
|
||||
return __cpuidex(info, function_id, 0);
|
||||
}
|
||||
|
||||
#define _XCR_XFEATURE_ENABLED_MASK 0
|
||||
static inline u64 _xgetbv(u32 index) {
|
||||
u32 eax, edx;
|
||||
__asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
|
||||
return ((u64)edx << 32) | eax;
|
||||
}
|
||||
|
||||
#endif // _MSC_VER
|
||||
|
||||
namespace Common {
|
||||
|
||||
CPUCaps::Manufacturer CPUCaps::ParseManufacturer(std::string_view brand_string) {
|
||||
if (brand_string == "GenuineIntel") {
|
||||
return Manufacturer::Intel;
|
||||
} else if (brand_string == "AuthenticAMD") {
|
||||
return Manufacturer::AMD;
|
||||
} else if (brand_string == "HygonGenuine") {
|
||||
return Manufacturer::Hygon;
|
||||
}
|
||||
return Manufacturer::Unknown;
|
||||
}
|
||||
|
||||
// Detects the various CPU features
|
||||
static CPUCaps Detect() {
|
||||
CPUCaps caps = {};
|
||||
|
||||
// Assumes the CPU supports the CPUID instruction. Those that don't would likely not support
|
||||
// yuzu at all anyway
|
||||
|
||||
int cpu_id[4];
|
||||
|
||||
// Detect CPU's CPUID capabilities and grab manufacturer string
|
||||
__cpuid(cpu_id, 0x00000000);
|
||||
const u32 max_std_fn = cpu_id[0]; // EAX
|
||||
|
||||
std::memset(caps.brand_string, 0, std::size(caps.brand_string));
|
||||
std::memcpy(&caps.brand_string[0], &cpu_id[1], sizeof(u32));
|
||||
std::memcpy(&caps.brand_string[4], &cpu_id[3], sizeof(u32));
|
||||
std::memcpy(&caps.brand_string[8], &cpu_id[2], sizeof(u32));
|
||||
|
||||
caps.manufacturer = CPUCaps::ParseManufacturer(caps.brand_string);
|
||||
|
||||
// Set reasonable default cpu string even if brand string not available
|
||||
std::strncpy(caps.cpu_string, caps.brand_string, std::size(caps.brand_string));
|
||||
|
||||
__cpuid(cpu_id, 0x80000000);
|
||||
|
||||
const u32 max_ex_fn = cpu_id[0];
|
||||
|
||||
// Detect family and other miscellaneous features
|
||||
if (max_std_fn >= 1) {
|
||||
__cpuid(cpu_id, 0x00000001);
|
||||
caps.sse = Common::Bit<25>(cpu_id[3]);
|
||||
caps.sse2 = Common::Bit<26>(cpu_id[3]);
|
||||
caps.sse3 = Common::Bit<0>(cpu_id[2]);
|
||||
caps.pclmulqdq = Common::Bit<1>(cpu_id[2]);
|
||||
caps.ssse3 = Common::Bit<9>(cpu_id[2]);
|
||||
caps.sse4_1 = Common::Bit<19>(cpu_id[2]);
|
||||
caps.sse4_2 = Common::Bit<20>(cpu_id[2]);
|
||||
caps.movbe = Common::Bit<22>(cpu_id[2]);
|
||||
caps.popcnt = Common::Bit<23>(cpu_id[2]);
|
||||
caps.aes = Common::Bit<25>(cpu_id[2]);
|
||||
caps.f16c = Common::Bit<29>(cpu_id[2]);
|
||||
|
||||
// AVX support requires 3 separate checks:
|
||||
// - Is the AVX bit set in CPUID?
|
||||
// - Is the XSAVE bit set in CPUID?
|
||||
// - XGETBV result has the XCR bit set.
|
||||
if (Common::Bit<28>(cpu_id[2]) && Common::Bit<27>(cpu_id[2])) {
|
||||
if ((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) {
|
||||
caps.avx = true;
|
||||
if (Common::Bit<12>(cpu_id[2]))
|
||||
caps.fma = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (max_std_fn >= 7) {
|
||||
__cpuidex(cpu_id, 0x00000007, 0x00000000);
|
||||
// Can't enable AVX{2,512} unless the XSAVE/XGETBV checks above passed
|
||||
if (caps.avx) {
|
||||
caps.avx2 = Common::Bit<5>(cpu_id[1]);
|
||||
caps.avx512f = Common::Bit<16>(cpu_id[1]);
|
||||
caps.avx512dq = Common::Bit<17>(cpu_id[1]);
|
||||
caps.avx512cd = Common::Bit<28>(cpu_id[1]);
|
||||
caps.avx512bw = Common::Bit<30>(cpu_id[1]);
|
||||
caps.avx512vl = Common::Bit<31>(cpu_id[1]);
|
||||
caps.avx512vbmi = Common::Bit<1>(cpu_id[2]);
|
||||
caps.avx512bitalg = Common::Bit<12>(cpu_id[2]);
|
||||
}
|
||||
|
||||
caps.bmi1 = Common::Bit<3>(cpu_id[1]);
|
||||
caps.bmi2 = Common::Bit<8>(cpu_id[1]);
|
||||
caps.sha = Common::Bit<29>(cpu_id[1]);
|
||||
|
||||
caps.gfni = Common::Bit<8>(cpu_id[2]);
|
||||
|
||||
__cpuidex(cpu_id, 0x00000007, 0x00000001);
|
||||
caps.avx_vnni = caps.avx && Common::Bit<4>(cpu_id[0]);
|
||||
}
|
||||
}
|
||||
|
||||
if (max_ex_fn >= 0x80000004) {
|
||||
// Extract CPU model string
|
||||
__cpuid(cpu_id, 0x80000002);
|
||||
std::memcpy(caps.cpu_string, cpu_id, sizeof(cpu_id));
|
||||
__cpuid(cpu_id, 0x80000003);
|
||||
std::memcpy(caps.cpu_string + 16, cpu_id, sizeof(cpu_id));
|
||||
__cpuid(cpu_id, 0x80000004);
|
||||
std::memcpy(caps.cpu_string + 32, cpu_id, sizeof(cpu_id));
|
||||
}
|
||||
|
||||
if (max_ex_fn >= 0x80000001) {
|
||||
// Check for more features
|
||||
__cpuid(cpu_id, 0x80000001);
|
||||
caps.lzcnt = Common::Bit<5>(cpu_id[2]);
|
||||
caps.fma4 = Common::Bit<16>(cpu_id[2]);
|
||||
}
|
||||
|
||||
if (max_ex_fn >= 0x80000007) {
|
||||
__cpuid(cpu_id, 0x80000007);
|
||||
caps.invariant_tsc = Common::Bit<8>(cpu_id[3]);
|
||||
}
|
||||
|
||||
if (max_std_fn >= 0x15) {
|
||||
__cpuid(cpu_id, 0x15);
|
||||
caps.tsc_crystal_ratio_denominator = cpu_id[0];
|
||||
caps.tsc_crystal_ratio_numerator = cpu_id[1];
|
||||
caps.crystal_frequency = cpu_id[2];
|
||||
// Some CPU models might not return a crystal frequency.
|
||||
// The CPU model can be detected to use the values from turbostat
|
||||
// https://github.com/torvalds/linux/blob/master/tools/power/x86/turbostat/turbostat.c#L5569
|
||||
// but it's easier to just estimate the TSC tick rate for these cases.
|
||||
if (caps.tsc_crystal_ratio_denominator) {
|
||||
caps.tsc_frequency = static_cast<u64>(caps.crystal_frequency) *
|
||||
caps.tsc_crystal_ratio_numerator /
|
||||
caps.tsc_crystal_ratio_denominator;
|
||||
}
|
||||
}
|
||||
|
||||
if (max_std_fn >= 0x16) {
|
||||
__cpuid(cpu_id, 0x16);
|
||||
caps.base_frequency = cpu_id[0];
|
||||
caps.max_frequency = cpu_id[1];
|
||||
caps.bus_frequency = cpu_id[2];
|
||||
}
|
||||
|
||||
return caps;
|
||||
}
|
||||
|
||||
const CPUCaps& GetCPUCaps() {
|
||||
static CPUCaps caps = Detect();
|
||||
return caps;
|
||||
}
|
||||
|
||||
} // namespace Common
|
||||
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
|
||||
// SPDX-FileCopyrightText: Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include <array>
|
||||
#include <cstring>
|
||||
#include <iterator>
|
||||
#include <string_view>
|
||||
#include "common/bit_util.h"
|
||||
#include "common/common_types.h"
|
||||
#include "common/x64/cpu_detect.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#else
|
||||
|
||||
#if defined(__DragonFly__) || defined(__FreeBSD__)
|
||||
// clang-format off
|
||||
#include <sys/types.h>
|
||||
#include <machine/cpufunc.h>
|
||||
// clang-format on
|
||||
#endif
|
||||
|
||||
static inline void __cpuidex(int info[4], u32 function_id, u32 subfunction_id) {
|
||||
#if defined(__DragonFly__) || defined(__FreeBSD__)
|
||||
// Despite the name, this is just do_cpuid() with ECX as second input.
|
||||
cpuid_count((u_int)function_id, (u_int)subfunction_id, (u_int*)info);
|
||||
#else
|
||||
info[0] = function_id; // eax
|
||||
info[2] = subfunction_id; // ecx
|
||||
__asm__("cpuid"
|
||||
: "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
|
||||
: "a"(function_id), "c"(subfunction_id));
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void __cpuid(int info[4], u32 function_id) {
|
||||
return __cpuidex(info, function_id, 0);
|
||||
}
|
||||
|
||||
#define _XCR_XFEATURE_ENABLED_MASK 0
|
||||
static inline u64 _xgetbv(u32 index) {
|
||||
u32 eax, edx;
|
||||
__asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
|
||||
return ((u64)edx << 32) | eax;
|
||||
}
|
||||
|
||||
#endif // _MSC_VER
|
||||
|
||||
namespace Common {
|
||||
|
||||
CPUCaps::Manufacturer CPUCaps::ParseManufacturer(std::string_view brand_string) {
|
||||
if (brand_string == "GenuineIntel") {
|
||||
return Manufacturer::Intel;
|
||||
} else if (brand_string == "AuthenticAMD") {
|
||||
return Manufacturer::AMD;
|
||||
} else if (brand_string == "HygonGenuine") {
|
||||
return Manufacturer::Hygon;
|
||||
}
|
||||
return Manufacturer::Unknown;
|
||||
}
|
||||
|
||||
// Detects the various CPU features
|
||||
static CPUCaps Detect() {
|
||||
CPUCaps caps = {};
|
||||
|
||||
// Assumes the CPU supports the CPUID instruction. Those that don't would likely not support
|
||||
// yuzu at all anyway
|
||||
|
||||
int cpu_id[4];
|
||||
|
||||
// Detect CPU's CPUID capabilities and grab manufacturer string
|
||||
__cpuid(cpu_id, 0x00000000);
|
||||
const u32 max_std_fn = cpu_id[0]; // EAX
|
||||
|
||||
std::memset(caps.brand_string, 0, std::size(caps.brand_string));
|
||||
std::memcpy(&caps.brand_string[0], &cpu_id[1], sizeof(u32));
|
||||
std::memcpy(&caps.brand_string[4], &cpu_id[3], sizeof(u32));
|
||||
std::memcpy(&caps.brand_string[8], &cpu_id[2], sizeof(u32));
|
||||
|
||||
caps.manufacturer = CPUCaps::ParseManufacturer(caps.brand_string);
|
||||
|
||||
// Set reasonable default cpu string even if brand string not available
|
||||
std::strncpy(caps.cpu_string, caps.brand_string, std::size(caps.brand_string));
|
||||
|
||||
__cpuid(cpu_id, 0x80000000);
|
||||
|
||||
const u32 max_ex_fn = cpu_id[0];
|
||||
|
||||
// Detect family and other miscellaneous features
|
||||
if (max_std_fn >= 1) {
|
||||
__cpuid(cpu_id, 0x00000001);
|
||||
caps.sse = Common::Bit<25>(cpu_id[3]);
|
||||
caps.sse2 = Common::Bit<26>(cpu_id[3]);
|
||||
caps.sse3 = Common::Bit<0>(cpu_id[2]);
|
||||
caps.pclmulqdq = Common::Bit<1>(cpu_id[2]);
|
||||
caps.ssse3 = Common::Bit<9>(cpu_id[2]);
|
||||
caps.sse4_1 = Common::Bit<19>(cpu_id[2]);
|
||||
caps.sse4_2 = Common::Bit<20>(cpu_id[2]);
|
||||
caps.movbe = Common::Bit<22>(cpu_id[2]);
|
||||
caps.popcnt = Common::Bit<23>(cpu_id[2]);
|
||||
caps.aes = Common::Bit<25>(cpu_id[2]);
|
||||
caps.f16c = Common::Bit<29>(cpu_id[2]);
|
||||
|
||||
// AVX support requires 3 separate checks:
|
||||
// - Is the AVX bit set in CPUID?
|
||||
// - Is the XSAVE bit set in CPUID?
|
||||
// - XGETBV result has the XCR bit set.
|
||||
if (Common::Bit<28>(cpu_id[2]) && Common::Bit<27>(cpu_id[2])) {
|
||||
if ((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) {
|
||||
caps.avx = true;
|
||||
if (Common::Bit<12>(cpu_id[2]))
|
||||
caps.fma = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (max_std_fn >= 7) {
|
||||
__cpuidex(cpu_id, 0x00000007, 0x00000000);
|
||||
// Can't enable AVX{2,512} unless the XSAVE/XGETBV checks above passed
|
||||
if (caps.avx) {
|
||||
caps.avx2 = Common::Bit<5>(cpu_id[1]);
|
||||
caps.avx512f = Common::Bit<16>(cpu_id[1]);
|
||||
caps.avx512dq = Common::Bit<17>(cpu_id[1]);
|
||||
caps.avx512cd = Common::Bit<28>(cpu_id[1]);
|
||||
caps.avx512bw = Common::Bit<30>(cpu_id[1]);
|
||||
caps.avx512vl = Common::Bit<31>(cpu_id[1]);
|
||||
caps.avx512vbmi = Common::Bit<1>(cpu_id[2]);
|
||||
caps.avx512bitalg = Common::Bit<12>(cpu_id[2]);
|
||||
}
|
||||
|
||||
caps.bmi1 = Common::Bit<3>(cpu_id[1]);
|
||||
caps.bmi2 = Common::Bit<8>(cpu_id[1]);
|
||||
caps.sha = Common::Bit<29>(cpu_id[1]);
|
||||
|
||||
caps.gfni = Common::Bit<8>(cpu_id[2]);
|
||||
|
||||
__cpuidex(cpu_id, 0x00000007, 0x00000001);
|
||||
caps.avx_vnni = caps.avx && Common::Bit<4>(cpu_id[0]);
|
||||
}
|
||||
}
|
||||
|
||||
if (max_ex_fn >= 0x80000004) {
|
||||
// Extract CPU model string
|
||||
__cpuid(cpu_id, 0x80000002);
|
||||
std::memcpy(caps.cpu_string, cpu_id, sizeof(cpu_id));
|
||||
__cpuid(cpu_id, 0x80000003);
|
||||
std::memcpy(caps.cpu_string + 16, cpu_id, sizeof(cpu_id));
|
||||
__cpuid(cpu_id, 0x80000004);
|
||||
std::memcpy(caps.cpu_string + 32, cpu_id, sizeof(cpu_id));
|
||||
}
|
||||
|
||||
if (max_ex_fn >= 0x80000001) {
|
||||
// Check for more features
|
||||
__cpuid(cpu_id, 0x80000001);
|
||||
caps.lzcnt = Common::Bit<5>(cpu_id[2]);
|
||||
caps.fma4 = Common::Bit<16>(cpu_id[2]);
|
||||
}
|
||||
|
||||
if (max_ex_fn >= 0x80000007) {
|
||||
__cpuid(cpu_id, 0x80000007);
|
||||
caps.invariant_tsc = Common::Bit<8>(cpu_id[3]);
|
||||
}
|
||||
|
||||
if (max_std_fn >= 0x15) {
|
||||
__cpuid(cpu_id, 0x15);
|
||||
caps.tsc_crystal_ratio_denominator = cpu_id[0];
|
||||
caps.tsc_crystal_ratio_numerator = cpu_id[1];
|
||||
caps.crystal_frequency = cpu_id[2];
|
||||
// Some CPU models might not return a crystal frequency.
|
||||
// The CPU model can be detected to use the values from turbostat
|
||||
// https://github.com/torvalds/linux/blob/master/tools/power/x86/turbostat/turbostat.c#L5569
|
||||
// but it's easier to just estimate the TSC tick rate for these cases.
|
||||
if (caps.tsc_crystal_ratio_denominator) {
|
||||
caps.tsc_frequency = static_cast<u64>(caps.crystal_frequency) *
|
||||
caps.tsc_crystal_ratio_numerator /
|
||||
caps.tsc_crystal_ratio_denominator;
|
||||
}
|
||||
}
|
||||
|
||||
if (max_std_fn >= 0x16) {
|
||||
__cpuid(cpu_id, 0x16);
|
||||
caps.base_frequency = cpu_id[0];
|
||||
caps.max_frequency = cpu_id[1];
|
||||
caps.bus_frequency = cpu_id[2];
|
||||
}
|
||||
|
||||
return caps;
|
||||
}
|
||||
|
||||
const CPUCaps& GetCPUCaps() {
|
||||
static CPUCaps caps = Detect();
|
||||
return caps;
|
||||
}
|
||||
|
||||
} // namespace Common
|
||||
|
||||
@@ -1,77 +1,77 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
|
||||
// SPDX-FileCopyrightText: Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string_view>
|
||||
#include "common/common_types.h"
|
||||
|
||||
namespace Common {
|
||||
|
||||
/// x86/x64 CPU capabilities that may be detected by this module
|
||||
struct CPUCaps {
|
||||
|
||||
enum class Manufacturer : u8 {
|
||||
Unknown = 0,
|
||||
Intel = 1,
|
||||
AMD = 2,
|
||||
Hygon = 3,
|
||||
};
|
||||
|
||||
static Manufacturer ParseManufacturer(std::string_view brand_string);
|
||||
|
||||
Manufacturer manufacturer;
|
||||
char brand_string[13];
|
||||
|
||||
char cpu_string[48];
|
||||
|
||||
u32 base_frequency;
|
||||
u32 max_frequency;
|
||||
u32 bus_frequency;
|
||||
|
||||
u32 tsc_crystal_ratio_denominator;
|
||||
u32 tsc_crystal_ratio_numerator;
|
||||
u32 crystal_frequency;
|
||||
u64 tsc_frequency; // Derived from the above three values
|
||||
|
||||
bool sse : 1;
|
||||
bool sse2 : 1;
|
||||
bool sse3 : 1;
|
||||
bool ssse3 : 1;
|
||||
bool sse4_1 : 1;
|
||||
bool sse4_2 : 1;
|
||||
|
||||
bool avx : 1;
|
||||
bool avx_vnni : 1;
|
||||
bool avx2 : 1;
|
||||
bool avx512f : 1;
|
||||
bool avx512dq : 1;
|
||||
bool avx512cd : 1;
|
||||
bool avx512bw : 1;
|
||||
bool avx512vl : 1;
|
||||
bool avx512vbmi : 1;
|
||||
bool avx512bitalg : 1;
|
||||
|
||||
bool aes : 1;
|
||||
bool bmi1 : 1;
|
||||
bool bmi2 : 1;
|
||||
bool f16c : 1;
|
||||
bool fma : 1;
|
||||
bool fma4 : 1;
|
||||
bool gfni : 1;
|
||||
bool invariant_tsc : 1;
|
||||
bool lzcnt : 1;
|
||||
bool movbe : 1;
|
||||
bool pclmulqdq : 1;
|
||||
bool popcnt : 1;
|
||||
bool sha : 1;
|
||||
};
|
||||
|
||||
/**
|
||||
* Gets the supported capabilities of the host CPU
|
||||
* @return Reference to a CPUCaps struct with the detected host CPU capabilities
|
||||
*/
|
||||
const CPUCaps& GetCPUCaps();
|
||||
|
||||
} // namespace Common
|
||||
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
|
||||
// SPDX-FileCopyrightText: Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string_view>
|
||||
#include "common/common_types.h"
|
||||
|
||||
namespace Common {
|
||||
|
||||
/// x86/x64 CPU capabilities that may be detected by this module
|
||||
struct CPUCaps {
|
||||
|
||||
enum class Manufacturer : u8 {
|
||||
Unknown = 0,
|
||||
Intel = 1,
|
||||
AMD = 2,
|
||||
Hygon = 3,
|
||||
};
|
||||
|
||||
static Manufacturer ParseManufacturer(std::string_view brand_string);
|
||||
|
||||
Manufacturer manufacturer;
|
||||
char brand_string[13];
|
||||
|
||||
char cpu_string[48];
|
||||
|
||||
u32 base_frequency;
|
||||
u32 max_frequency;
|
||||
u32 bus_frequency;
|
||||
|
||||
u32 tsc_crystal_ratio_denominator;
|
||||
u32 tsc_crystal_ratio_numerator;
|
||||
u32 crystal_frequency;
|
||||
u64 tsc_frequency; // Derived from the above three values
|
||||
|
||||
bool sse : 1;
|
||||
bool sse2 : 1;
|
||||
bool sse3 : 1;
|
||||
bool ssse3 : 1;
|
||||
bool sse4_1 : 1;
|
||||
bool sse4_2 : 1;
|
||||
|
||||
bool avx : 1;
|
||||
bool avx_vnni : 1;
|
||||
bool avx2 : 1;
|
||||
bool avx512f : 1;
|
||||
bool avx512dq : 1;
|
||||
bool avx512cd : 1;
|
||||
bool avx512bw : 1;
|
||||
bool avx512vl : 1;
|
||||
bool avx512vbmi : 1;
|
||||
bool avx512bitalg : 1;
|
||||
|
||||
bool aes : 1;
|
||||
bool bmi1 : 1;
|
||||
bool bmi2 : 1;
|
||||
bool f16c : 1;
|
||||
bool fma : 1;
|
||||
bool fma4 : 1;
|
||||
bool gfni : 1;
|
||||
bool invariant_tsc : 1;
|
||||
bool lzcnt : 1;
|
||||
bool movbe : 1;
|
||||
bool pclmulqdq : 1;
|
||||
bool popcnt : 1;
|
||||
bool sha : 1;
|
||||
};
|
||||
|
||||
/**
|
||||
* Gets the supported capabilities of the host CPU
|
||||
* @return Reference to a CPUCaps struct with the detected host CPU capabilities
|
||||
*/
|
||||
const CPUCaps& GetCPUCaps();
|
||||
|
||||
} // namespace Common
|
||||
|
||||
@@ -1,136 +1,136 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include <array>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
|
||||
#include "common/atomic_ops.h"
|
||||
#include "common/uint128.h"
|
||||
#include "common/x64/native_clock.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
namespace Common {
|
||||
|
||||
#ifdef _MSC_VER
|
||||
__forceinline static u64 FencedRDTSC() {
|
||||
_mm_lfence();
|
||||
_ReadWriteBarrier();
|
||||
const u64 result = __rdtsc();
|
||||
_mm_lfence();
|
||||
_ReadWriteBarrier();
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
static u64 FencedRDTSC() {
|
||||
u64 result;
|
||||
asm volatile("lfence\n\t"
|
||||
"rdtsc\n\t"
|
||||
"shl $32, %%rdx\n\t"
|
||||
"or %%rdx, %0\n\t"
|
||||
"lfence"
|
||||
: "=a"(result)
|
||||
:
|
||||
: "rdx", "memory", "cc");
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
u64 EstimateRDTSCFrequency() {
|
||||
// Discard the first result measuring the rdtsc.
|
||||
FencedRDTSC();
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds{1});
|
||||
FencedRDTSC();
|
||||
|
||||
// Get the current time.
|
||||
const auto start_time = std::chrono::steady_clock::now();
|
||||
const u64 tsc_start = FencedRDTSC();
|
||||
// Wait for 200 milliseconds.
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds{200});
|
||||
const auto end_time = std::chrono::steady_clock::now();
|
||||
const u64 tsc_end = FencedRDTSC();
|
||||
// Calculate differences.
|
||||
const u64 timer_diff = static_cast<u64>(
|
||||
std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count());
|
||||
const u64 tsc_diff = tsc_end - tsc_start;
|
||||
const u64 tsc_freq = MultiplyAndDivide64(tsc_diff, 1000000000ULL, timer_diff);
|
||||
return tsc_freq;
|
||||
}
|
||||
|
||||
namespace X64 {
|
||||
NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequency_,
|
||||
u64 rtsc_frequency_)
|
||||
: WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{
|
||||
rtsc_frequency_} {
|
||||
time_point.inner.last_measure = FencedRDTSC();
|
||||
time_point.inner.accumulated_ticks = 0U;
|
||||
ns_rtsc_factor = GetFixedPoint64Factor(NS_RATIO, rtsc_frequency);
|
||||
us_rtsc_factor = GetFixedPoint64Factor(US_RATIO, rtsc_frequency);
|
||||
ms_rtsc_factor = GetFixedPoint64Factor(MS_RATIO, rtsc_frequency);
|
||||
clock_rtsc_factor = GetFixedPoint64Factor(emulated_clock_frequency, rtsc_frequency);
|
||||
cpu_rtsc_factor = GetFixedPoint64Factor(emulated_cpu_frequency, rtsc_frequency);
|
||||
}
|
||||
|
||||
u64 NativeClock::GetRTSC() {
|
||||
TimePoint new_time_point{};
|
||||
TimePoint current_time_point{};
|
||||
|
||||
current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
|
||||
do {
|
||||
const u64 current_measure = FencedRDTSC();
|
||||
u64 diff = current_measure - current_time_point.inner.last_measure;
|
||||
diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0)
|
||||
new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure
|
||||
? current_measure
|
||||
: current_time_point.inner.last_measure;
|
||||
new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff;
|
||||
} while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
|
||||
current_time_point.pack, current_time_point.pack));
|
||||
return new_time_point.inner.accumulated_ticks;
|
||||
}
|
||||
|
||||
void NativeClock::Pause(bool is_paused) {
|
||||
if (!is_paused) {
|
||||
TimePoint current_time_point{};
|
||||
TimePoint new_time_point{};
|
||||
|
||||
current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
|
||||
do {
|
||||
new_time_point.pack = current_time_point.pack;
|
||||
new_time_point.inner.last_measure = FencedRDTSC();
|
||||
} while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
|
||||
current_time_point.pack, current_time_point.pack));
|
||||
}
|
||||
}
|
||||
|
||||
std::chrono::nanoseconds NativeClock::GetTimeNS() {
|
||||
const u64 rtsc_value = GetRTSC();
|
||||
return std::chrono::nanoseconds{MultiplyHigh(rtsc_value, ns_rtsc_factor)};
|
||||
}
|
||||
|
||||
std::chrono::microseconds NativeClock::GetTimeUS() {
|
||||
const u64 rtsc_value = GetRTSC();
|
||||
return std::chrono::microseconds{MultiplyHigh(rtsc_value, us_rtsc_factor)};
|
||||
}
|
||||
|
||||
std::chrono::milliseconds NativeClock::GetTimeMS() {
|
||||
const u64 rtsc_value = GetRTSC();
|
||||
return std::chrono::milliseconds{MultiplyHigh(rtsc_value, ms_rtsc_factor)};
|
||||
}
|
||||
|
||||
u64 NativeClock::GetClockCycles() {
|
||||
const u64 rtsc_value = GetRTSC();
|
||||
return MultiplyHigh(rtsc_value, clock_rtsc_factor);
|
||||
}
|
||||
|
||||
u64 NativeClock::GetCPUCycles() {
|
||||
const u64 rtsc_value = GetRTSC();
|
||||
return MultiplyHigh(rtsc_value, cpu_rtsc_factor);
|
||||
}
|
||||
|
||||
} // namespace X64
|
||||
|
||||
} // namespace Common
|
||||
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include <array>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
|
||||
#include "common/atomic_ops.h"
|
||||
#include "common/uint128.h"
|
||||
#include "common/x64/native_clock.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
namespace Common {
|
||||
|
||||
#ifdef _MSC_VER
|
||||
__forceinline static u64 FencedRDTSC() {
|
||||
_mm_lfence();
|
||||
_ReadWriteBarrier();
|
||||
const u64 result = __rdtsc();
|
||||
_mm_lfence();
|
||||
_ReadWriteBarrier();
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
static u64 FencedRDTSC() {
|
||||
u64 result;
|
||||
asm volatile("lfence\n\t"
|
||||
"rdtsc\n\t"
|
||||
"shl $32, %%rdx\n\t"
|
||||
"or %%rdx, %0\n\t"
|
||||
"lfence"
|
||||
: "=a"(result)
|
||||
:
|
||||
: "rdx", "memory", "cc");
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
u64 EstimateRDTSCFrequency() {
|
||||
// Discard the first result measuring the rdtsc.
|
||||
FencedRDTSC();
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds{1});
|
||||
FencedRDTSC();
|
||||
|
||||
// Get the current time.
|
||||
const auto start_time = std::chrono::steady_clock::now();
|
||||
const u64 tsc_start = FencedRDTSC();
|
||||
// Wait for 200 milliseconds.
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds{200});
|
||||
const auto end_time = std::chrono::steady_clock::now();
|
||||
const u64 tsc_end = FencedRDTSC();
|
||||
// Calculate differences.
|
||||
const u64 timer_diff = static_cast<u64>(
|
||||
std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count());
|
||||
const u64 tsc_diff = tsc_end - tsc_start;
|
||||
const u64 tsc_freq = MultiplyAndDivide64(tsc_diff, 1000000000ULL, timer_diff);
|
||||
return tsc_freq;
|
||||
}
|
||||
|
||||
namespace X64 {
|
||||
NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequency_,
|
||||
u64 rtsc_frequency_)
|
||||
: WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{
|
||||
rtsc_frequency_} {
|
||||
time_point.inner.last_measure = FencedRDTSC();
|
||||
time_point.inner.accumulated_ticks = 0U;
|
||||
ns_rtsc_factor = GetFixedPoint64Factor(NS_RATIO, rtsc_frequency);
|
||||
us_rtsc_factor = GetFixedPoint64Factor(US_RATIO, rtsc_frequency);
|
||||
ms_rtsc_factor = GetFixedPoint64Factor(MS_RATIO, rtsc_frequency);
|
||||
clock_rtsc_factor = GetFixedPoint64Factor(emulated_clock_frequency, rtsc_frequency);
|
||||
cpu_rtsc_factor = GetFixedPoint64Factor(emulated_cpu_frequency, rtsc_frequency);
|
||||
}
|
||||
|
||||
u64 NativeClock::GetRTSC() {
|
||||
TimePoint new_time_point{};
|
||||
TimePoint current_time_point{};
|
||||
|
||||
current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
|
||||
do {
|
||||
const u64 current_measure = FencedRDTSC();
|
||||
u64 diff = current_measure - current_time_point.inner.last_measure;
|
||||
diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0)
|
||||
new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure
|
||||
? current_measure
|
||||
: current_time_point.inner.last_measure;
|
||||
new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff;
|
||||
} while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
|
||||
current_time_point.pack, current_time_point.pack));
|
||||
return new_time_point.inner.accumulated_ticks;
|
||||
}
|
||||
|
||||
void NativeClock::Pause(bool is_paused) {
|
||||
if (!is_paused) {
|
||||
TimePoint current_time_point{};
|
||||
TimePoint new_time_point{};
|
||||
|
||||
current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
|
||||
do {
|
||||
new_time_point.pack = current_time_point.pack;
|
||||
new_time_point.inner.last_measure = FencedRDTSC();
|
||||
} while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
|
||||
current_time_point.pack, current_time_point.pack));
|
||||
}
|
||||
}
|
||||
|
||||
std::chrono::nanoseconds NativeClock::GetTimeNS() {
|
||||
const u64 rtsc_value = GetRTSC();
|
||||
return std::chrono::nanoseconds{MultiplyHigh(rtsc_value, ns_rtsc_factor)};
|
||||
}
|
||||
|
||||
std::chrono::microseconds NativeClock::GetTimeUS() {
|
||||
const u64 rtsc_value = GetRTSC();
|
||||
return std::chrono::microseconds{MultiplyHigh(rtsc_value, us_rtsc_factor)};
|
||||
}
|
||||
|
||||
std::chrono::milliseconds NativeClock::GetTimeMS() {
|
||||
const u64 rtsc_value = GetRTSC();
|
||||
return std::chrono::milliseconds{MultiplyHigh(rtsc_value, ms_rtsc_factor)};
|
||||
}
|
||||
|
||||
u64 NativeClock::GetClockCycles() {
|
||||
const u64 rtsc_value = GetRTSC();
|
||||
return MultiplyHigh(rtsc_value, clock_rtsc_factor);
|
||||
}
|
||||
|
||||
u64 NativeClock::GetCPUCycles() {
|
||||
const u64 rtsc_value = GetRTSC();
|
||||
return MultiplyHigh(rtsc_value, cpu_rtsc_factor);
|
||||
}
|
||||
|
||||
} // namespace X64
|
||||
|
||||
} // namespace Common
|
||||
|
||||
@@ -1,55 +1,55 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common/wall_clock.h"
|
||||
|
||||
namespace Common {
|
||||
|
||||
namespace X64 {
|
||||
class NativeClock final : public WallClock {
|
||||
public:
|
||||
explicit NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequency_,
|
||||
u64 rtsc_frequency_);
|
||||
|
||||
std::chrono::nanoseconds GetTimeNS() override;
|
||||
|
||||
std::chrono::microseconds GetTimeUS() override;
|
||||
|
||||
std::chrono::milliseconds GetTimeMS() override;
|
||||
|
||||
u64 GetClockCycles() override;
|
||||
|
||||
u64 GetCPUCycles() override;
|
||||
|
||||
void Pause(bool is_paused) override;
|
||||
|
||||
private:
|
||||
u64 GetRTSC();
|
||||
|
||||
union alignas(16) TimePoint {
|
||||
TimePoint() : pack{} {}
|
||||
u128 pack{};
|
||||
struct Inner {
|
||||
u64 last_measure{};
|
||||
u64 accumulated_ticks{};
|
||||
} inner;
|
||||
};
|
||||
|
||||
TimePoint time_point;
|
||||
|
||||
// factors
|
||||
u64 clock_rtsc_factor{};
|
||||
u64 cpu_rtsc_factor{};
|
||||
u64 ns_rtsc_factor{};
|
||||
u64 us_rtsc_factor{};
|
||||
u64 ms_rtsc_factor{};
|
||||
|
||||
u64 rtsc_frequency;
|
||||
};
|
||||
} // namespace X64
|
||||
|
||||
u64 EstimateRDTSCFrequency();
|
||||
|
||||
} // namespace Common
|
||||
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common/wall_clock.h"
|
||||
|
||||
namespace Common {
|
||||
|
||||
namespace X64 {
|
||||
class NativeClock final : public WallClock {
|
||||
public:
|
||||
explicit NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequency_,
|
||||
u64 rtsc_frequency_);
|
||||
|
||||
std::chrono::nanoseconds GetTimeNS() override;
|
||||
|
||||
std::chrono::microseconds GetTimeUS() override;
|
||||
|
||||
std::chrono::milliseconds GetTimeMS() override;
|
||||
|
||||
u64 GetClockCycles() override;
|
||||
|
||||
u64 GetCPUCycles() override;
|
||||
|
||||
void Pause(bool is_paused) override;
|
||||
|
||||
private:
|
||||
u64 GetRTSC();
|
||||
|
||||
union alignas(16) TimePoint {
|
||||
TimePoint() : pack{} {}
|
||||
u128 pack{};
|
||||
struct Inner {
|
||||
u64 last_measure{};
|
||||
u64 accumulated_ticks{};
|
||||
} inner;
|
||||
};
|
||||
|
||||
TimePoint time_point;
|
||||
|
||||
// factors
|
||||
u64 clock_rtsc_factor{};
|
||||
u64 cpu_rtsc_factor{};
|
||||
u64 ns_rtsc_factor{};
|
||||
u64 us_rtsc_factor{};
|
||||
u64 ms_rtsc_factor{};
|
||||
|
||||
u64 rtsc_frequency;
|
||||
};
|
||||
} // namespace X64
|
||||
|
||||
u64 EstimateRDTSCFrequency();
|
||||
|
||||
} // namespace Common
|
||||
|
||||
@@ -1,228 +1,228 @@
|
||||
// SPDX-FileCopyrightText: 2016 Citra Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <bitset>
|
||||
#include <initializer_list>
|
||||
#include <xbyak/xbyak.h>
|
||||
#include "common/assert.h"
|
||||
|
||||
namespace Common::X64 {
|
||||
|
||||
constexpr size_t RegToIndex(const Xbyak::Reg& reg) {
|
||||
using Kind = Xbyak::Reg::Kind;
|
||||
ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0,
|
||||
"RegSet only support GPRs and XMM registers.");
|
||||
ASSERT_MSG(reg.getIdx() < 16, "RegSet only supports XXM0-15.");
|
||||
return static_cast<size_t>(reg.getIdx()) + (reg.getKind() == Kind::REG ? 0 : 16);
|
||||
}
|
||||
|
||||
constexpr Xbyak::Reg64 IndexToReg64(size_t reg_index) {
|
||||
ASSERT(reg_index < 16);
|
||||
return Xbyak::Reg64(static_cast<int>(reg_index));
|
||||
}
|
||||
|
||||
constexpr Xbyak::Xmm IndexToXmm(size_t reg_index) {
|
||||
ASSERT(reg_index >= 16 && reg_index < 32);
|
||||
return Xbyak::Xmm(static_cast<int>(reg_index - 16));
|
||||
}
|
||||
|
||||
constexpr Xbyak::Reg IndexToReg(size_t reg_index) {
|
||||
if (reg_index < 16) {
|
||||
return IndexToReg64(reg_index);
|
||||
} else {
|
||||
return IndexToXmm(reg_index);
|
||||
}
|
||||
}
|
||||
|
||||
constexpr std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) {
|
||||
size_t bits = 0;
|
||||
for (const Xbyak::Reg& reg : regs) {
|
||||
bits |= size_t{1} << RegToIndex(reg);
|
||||
}
|
||||
return {bits};
|
||||
}
|
||||
|
||||
constexpr inline std::bitset<32> ABI_ALL_GPRS(0x0000FFFF);
|
||||
constexpr inline std::bitset<32> ABI_ALL_XMMS(0xFFFF0000);
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
// Microsoft x64 ABI
|
||||
constexpr inline Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM1 = Xbyak::util::rcx;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9;
|
||||
|
||||
constexpr inline std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
|
||||
// GPRs
|
||||
Xbyak::util::rcx,
|
||||
Xbyak::util::rdx,
|
||||
Xbyak::util::r8,
|
||||
Xbyak::util::r9,
|
||||
Xbyak::util::r10,
|
||||
Xbyak::util::r11,
|
||||
// XMMs
|
||||
Xbyak::util::xmm0,
|
||||
Xbyak::util::xmm1,
|
||||
Xbyak::util::xmm2,
|
||||
Xbyak::util::xmm3,
|
||||
Xbyak::util::xmm4,
|
||||
Xbyak::util::xmm5,
|
||||
});
|
||||
|
||||
constexpr inline std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
|
||||
// GPRs
|
||||
Xbyak::util::rbx,
|
||||
Xbyak::util::rsi,
|
||||
Xbyak::util::rdi,
|
||||
Xbyak::util::rbp,
|
||||
Xbyak::util::r12,
|
||||
Xbyak::util::r13,
|
||||
Xbyak::util::r14,
|
||||
Xbyak::util::r15,
|
||||
// XMMs
|
||||
Xbyak::util::xmm6,
|
||||
Xbyak::util::xmm7,
|
||||
Xbyak::util::xmm8,
|
||||
Xbyak::util::xmm9,
|
||||
Xbyak::util::xmm10,
|
||||
Xbyak::util::xmm11,
|
||||
Xbyak::util::xmm12,
|
||||
Xbyak::util::xmm13,
|
||||
Xbyak::util::xmm14,
|
||||
Xbyak::util::xmm15,
|
||||
});
|
||||
|
||||
constexpr size_t ABI_SHADOW_SPACE = 0x20;
|
||||
|
||||
#else
|
||||
|
||||
// System V x86-64 ABI
|
||||
constexpr inline Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM1 = Xbyak::util::rdi;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx;
|
||||
|
||||
constexpr inline std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
|
||||
// GPRs
|
||||
Xbyak::util::rcx,
|
||||
Xbyak::util::rdx,
|
||||
Xbyak::util::rdi,
|
||||
Xbyak::util::rsi,
|
||||
Xbyak::util::r8,
|
||||
Xbyak::util::r9,
|
||||
Xbyak::util::r10,
|
||||
Xbyak::util::r11,
|
||||
// XMMs
|
||||
Xbyak::util::xmm0,
|
||||
Xbyak::util::xmm1,
|
||||
Xbyak::util::xmm2,
|
||||
Xbyak::util::xmm3,
|
||||
Xbyak::util::xmm4,
|
||||
Xbyak::util::xmm5,
|
||||
Xbyak::util::xmm6,
|
||||
Xbyak::util::xmm7,
|
||||
Xbyak::util::xmm8,
|
||||
Xbyak::util::xmm9,
|
||||
Xbyak::util::xmm10,
|
||||
Xbyak::util::xmm11,
|
||||
Xbyak::util::xmm12,
|
||||
Xbyak::util::xmm13,
|
||||
Xbyak::util::xmm14,
|
||||
Xbyak::util::xmm15,
|
||||
});
|
||||
|
||||
constexpr inline std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
|
||||
// GPRs
|
||||
Xbyak::util::rbx,
|
||||
Xbyak::util::rbp,
|
||||
Xbyak::util::r12,
|
||||
Xbyak::util::r13,
|
||||
Xbyak::util::r14,
|
||||
Xbyak::util::r15,
|
||||
});
|
||||
|
||||
constexpr size_t ABI_SHADOW_SPACE = 0;
|
||||
|
||||
#endif
|
||||
|
||||
struct ABIFrameInfo {
|
||||
s32 subtraction;
|
||||
s32 xmm_offset;
|
||||
};
|
||||
|
||||
inline ABIFrameInfo ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
|
||||
size_t needed_frame_size) {
|
||||
const auto count = (regs & ABI_ALL_GPRS).count();
|
||||
rsp_alignment -= count * 8;
|
||||
size_t subtraction = 0;
|
||||
const auto xmm_count = (regs & ABI_ALL_XMMS).count();
|
||||
if (xmm_count) {
|
||||
// If we have any XMMs to save, we must align the stack here.
|
||||
subtraction = rsp_alignment & 0xF;
|
||||
}
|
||||
subtraction += 0x10 * xmm_count;
|
||||
size_t xmm_base_subtraction = subtraction;
|
||||
subtraction += needed_frame_size;
|
||||
subtraction += ABI_SHADOW_SPACE;
|
||||
// Final alignment.
|
||||
rsp_alignment -= subtraction;
|
||||
subtraction += rsp_alignment & 0xF;
|
||||
|
||||
return ABIFrameInfo{static_cast<s32>(subtraction),
|
||||
static_cast<s32>(subtraction - xmm_base_subtraction)};
|
||||
}
|
||||
|
||||
inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
|
||||
size_t rsp_alignment, size_t needed_frame_size = 0) {
|
||||
auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size);
|
||||
|
||||
for (size_t i = 0; i < regs.size(); ++i) {
|
||||
if (regs[i] && ABI_ALL_GPRS[i]) {
|
||||
code.push(IndexToReg64(i));
|
||||
}
|
||||
}
|
||||
|
||||
if (frame_info.subtraction != 0) {
|
||||
code.sub(code.rsp, frame_info.subtraction);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < regs.size(); ++i) {
|
||||
if (regs[i] && ABI_ALL_XMMS[i]) {
|
||||
code.movaps(code.xword[code.rsp + frame_info.xmm_offset], IndexToXmm(i));
|
||||
frame_info.xmm_offset += 0x10;
|
||||
}
|
||||
}
|
||||
|
||||
return ABI_SHADOW_SPACE;
|
||||
}
|
||||
|
||||
inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
|
||||
size_t rsp_alignment, size_t needed_frame_size = 0) {
|
||||
auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size);
|
||||
|
||||
for (size_t i = 0; i < regs.size(); ++i) {
|
||||
if (regs[i] && ABI_ALL_XMMS[i]) {
|
||||
code.movaps(IndexToXmm(i), code.xword[code.rsp + frame_info.xmm_offset]);
|
||||
frame_info.xmm_offset += 0x10;
|
||||
}
|
||||
}
|
||||
|
||||
if (frame_info.subtraction != 0) {
|
||||
code.add(code.rsp, frame_info.subtraction);
|
||||
}
|
||||
|
||||
// GPRs need to be popped in reverse order
|
||||
for (size_t j = 0; j < regs.size(); ++j) {
|
||||
const size_t i = regs.size() - j - 1;
|
||||
if (regs[i] && ABI_ALL_GPRS[i]) {
|
||||
code.pop(IndexToReg64(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Common::X64
|
||||
// SPDX-FileCopyrightText: 2016 Citra Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <bitset>
|
||||
#include <initializer_list>
|
||||
#include <xbyak/xbyak.h>
|
||||
#include "common/assert.h"
|
||||
|
||||
namespace Common::X64 {
|
||||
|
||||
constexpr size_t RegToIndex(const Xbyak::Reg& reg) {
|
||||
using Kind = Xbyak::Reg::Kind;
|
||||
ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0,
|
||||
"RegSet only support GPRs and XMM registers.");
|
||||
ASSERT_MSG(reg.getIdx() < 16, "RegSet only supports XXM0-15.");
|
||||
return static_cast<size_t>(reg.getIdx()) + (reg.getKind() == Kind::REG ? 0 : 16);
|
||||
}
|
||||
|
||||
constexpr Xbyak::Reg64 IndexToReg64(size_t reg_index) {
|
||||
ASSERT(reg_index < 16);
|
||||
return Xbyak::Reg64(static_cast<int>(reg_index));
|
||||
}
|
||||
|
||||
constexpr Xbyak::Xmm IndexToXmm(size_t reg_index) {
|
||||
ASSERT(reg_index >= 16 && reg_index < 32);
|
||||
return Xbyak::Xmm(static_cast<int>(reg_index - 16));
|
||||
}
|
||||
|
||||
constexpr Xbyak::Reg IndexToReg(size_t reg_index) {
|
||||
if (reg_index < 16) {
|
||||
return IndexToReg64(reg_index);
|
||||
} else {
|
||||
return IndexToXmm(reg_index);
|
||||
}
|
||||
}
|
||||
|
||||
constexpr std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) {
|
||||
size_t bits = 0;
|
||||
for (const Xbyak::Reg& reg : regs) {
|
||||
bits |= size_t{1} << RegToIndex(reg);
|
||||
}
|
||||
return {bits};
|
||||
}
|
||||
|
||||
constexpr inline std::bitset<32> ABI_ALL_GPRS(0x0000FFFF);
|
||||
constexpr inline std::bitset<32> ABI_ALL_XMMS(0xFFFF0000);
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
// Microsoft x64 ABI
|
||||
constexpr inline Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM1 = Xbyak::util::rcx;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9;
|
||||
|
||||
constexpr inline std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
|
||||
// GPRs
|
||||
Xbyak::util::rcx,
|
||||
Xbyak::util::rdx,
|
||||
Xbyak::util::r8,
|
||||
Xbyak::util::r9,
|
||||
Xbyak::util::r10,
|
||||
Xbyak::util::r11,
|
||||
// XMMs
|
||||
Xbyak::util::xmm0,
|
||||
Xbyak::util::xmm1,
|
||||
Xbyak::util::xmm2,
|
||||
Xbyak::util::xmm3,
|
||||
Xbyak::util::xmm4,
|
||||
Xbyak::util::xmm5,
|
||||
});
|
||||
|
||||
constexpr inline std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
|
||||
// GPRs
|
||||
Xbyak::util::rbx,
|
||||
Xbyak::util::rsi,
|
||||
Xbyak::util::rdi,
|
||||
Xbyak::util::rbp,
|
||||
Xbyak::util::r12,
|
||||
Xbyak::util::r13,
|
||||
Xbyak::util::r14,
|
||||
Xbyak::util::r15,
|
||||
// XMMs
|
||||
Xbyak::util::xmm6,
|
||||
Xbyak::util::xmm7,
|
||||
Xbyak::util::xmm8,
|
||||
Xbyak::util::xmm9,
|
||||
Xbyak::util::xmm10,
|
||||
Xbyak::util::xmm11,
|
||||
Xbyak::util::xmm12,
|
||||
Xbyak::util::xmm13,
|
||||
Xbyak::util::xmm14,
|
||||
Xbyak::util::xmm15,
|
||||
});
|
||||
|
||||
constexpr size_t ABI_SHADOW_SPACE = 0x20;
|
||||
|
||||
#else
|
||||
|
||||
// System V x86-64 ABI
|
||||
constexpr inline Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM1 = Xbyak::util::rdi;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx;
|
||||
constexpr inline Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx;
|
||||
|
||||
constexpr inline std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
|
||||
// GPRs
|
||||
Xbyak::util::rcx,
|
||||
Xbyak::util::rdx,
|
||||
Xbyak::util::rdi,
|
||||
Xbyak::util::rsi,
|
||||
Xbyak::util::r8,
|
||||
Xbyak::util::r9,
|
||||
Xbyak::util::r10,
|
||||
Xbyak::util::r11,
|
||||
// XMMs
|
||||
Xbyak::util::xmm0,
|
||||
Xbyak::util::xmm1,
|
||||
Xbyak::util::xmm2,
|
||||
Xbyak::util::xmm3,
|
||||
Xbyak::util::xmm4,
|
||||
Xbyak::util::xmm5,
|
||||
Xbyak::util::xmm6,
|
||||
Xbyak::util::xmm7,
|
||||
Xbyak::util::xmm8,
|
||||
Xbyak::util::xmm9,
|
||||
Xbyak::util::xmm10,
|
||||
Xbyak::util::xmm11,
|
||||
Xbyak::util::xmm12,
|
||||
Xbyak::util::xmm13,
|
||||
Xbyak::util::xmm14,
|
||||
Xbyak::util::xmm15,
|
||||
});
|
||||
|
||||
constexpr inline std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
|
||||
// GPRs
|
||||
Xbyak::util::rbx,
|
||||
Xbyak::util::rbp,
|
||||
Xbyak::util::r12,
|
||||
Xbyak::util::r13,
|
||||
Xbyak::util::r14,
|
||||
Xbyak::util::r15,
|
||||
});
|
||||
|
||||
constexpr size_t ABI_SHADOW_SPACE = 0;
|
||||
|
||||
#endif
|
||||
|
||||
struct ABIFrameInfo {
|
||||
s32 subtraction;
|
||||
s32 xmm_offset;
|
||||
};
|
||||
|
||||
inline ABIFrameInfo ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
|
||||
size_t needed_frame_size) {
|
||||
const auto count = (regs & ABI_ALL_GPRS).count();
|
||||
rsp_alignment -= count * 8;
|
||||
size_t subtraction = 0;
|
||||
const auto xmm_count = (regs & ABI_ALL_XMMS).count();
|
||||
if (xmm_count) {
|
||||
// If we have any XMMs to save, we must align the stack here.
|
||||
subtraction = rsp_alignment & 0xF;
|
||||
}
|
||||
subtraction += 0x10 * xmm_count;
|
||||
size_t xmm_base_subtraction = subtraction;
|
||||
subtraction += needed_frame_size;
|
||||
subtraction += ABI_SHADOW_SPACE;
|
||||
// Final alignment.
|
||||
rsp_alignment -= subtraction;
|
||||
subtraction += rsp_alignment & 0xF;
|
||||
|
||||
return ABIFrameInfo{static_cast<s32>(subtraction),
|
||||
static_cast<s32>(subtraction - xmm_base_subtraction)};
|
||||
}
|
||||
|
||||
inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
|
||||
size_t rsp_alignment, size_t needed_frame_size = 0) {
|
||||
auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size);
|
||||
|
||||
for (size_t i = 0; i < regs.size(); ++i) {
|
||||
if (regs[i] && ABI_ALL_GPRS[i]) {
|
||||
code.push(IndexToReg64(i));
|
||||
}
|
||||
}
|
||||
|
||||
if (frame_info.subtraction != 0) {
|
||||
code.sub(code.rsp, frame_info.subtraction);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < regs.size(); ++i) {
|
||||
if (regs[i] && ABI_ALL_XMMS[i]) {
|
||||
code.movaps(code.xword[code.rsp + frame_info.xmm_offset], IndexToXmm(i));
|
||||
frame_info.xmm_offset += 0x10;
|
||||
}
|
||||
}
|
||||
|
||||
return ABI_SHADOW_SPACE;
|
||||
}
|
||||
|
||||
inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
|
||||
size_t rsp_alignment, size_t needed_frame_size = 0) {
|
||||
auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size);
|
||||
|
||||
for (size_t i = 0; i < regs.size(); ++i) {
|
||||
if (regs[i] && ABI_ALL_XMMS[i]) {
|
||||
code.movaps(IndexToXmm(i), code.xword[code.rsp + frame_info.xmm_offset]);
|
||||
frame_info.xmm_offset += 0x10;
|
||||
}
|
||||
}
|
||||
|
||||
if (frame_info.subtraction != 0) {
|
||||
code.add(code.rsp, frame_info.subtraction);
|
||||
}
|
||||
|
||||
// GPRs need to be popped in reverse order
|
||||
for (size_t j = 0; j < regs.size(); ++j) {
|
||||
const size_t i = regs.size() - j - 1;
|
||||
if (regs[i] && ABI_ALL_GPRS[i]) {
|
||||
code.pop(IndexToReg64(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Common::X64
|
||||
|
||||
@@ -1,46 +1,46 @@
|
||||
// SPDX-FileCopyrightText: 2016 Citra Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <type_traits>
|
||||
#include <xbyak/xbyak.h>
|
||||
#include "common/x64/xbyak_abi.h"
|
||||
|
||||
namespace Common::X64 {
|
||||
|
||||
// Constants for use with cmpps/cmpss
|
||||
enum {
|
||||
CMP_EQ = 0,
|
||||
CMP_LT = 1,
|
||||
CMP_LE = 2,
|
||||
CMP_UNORD = 3,
|
||||
CMP_NEQ = 4,
|
||||
CMP_NLT = 5,
|
||||
CMP_NLE = 6,
|
||||
CMP_ORD = 7,
|
||||
};
|
||||
|
||||
constexpr bool IsWithin2G(uintptr_t ref, uintptr_t target) {
|
||||
const u64 distance = target - (ref + 5);
|
||||
return !(distance >= 0x8000'0000ULL && distance <= ~0x8000'0000ULL);
|
||||
}
|
||||
|
||||
inline bool IsWithin2G(const Xbyak::CodeGenerator& code, uintptr_t target) {
|
||||
return IsWithin2G(reinterpret_cast<uintptr_t>(code.getCurr()), target);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void CallFarFunction(Xbyak::CodeGenerator& code, const T f) {
|
||||
static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer.");
|
||||
size_t addr = reinterpret_cast<size_t>(f);
|
||||
if (IsWithin2G(code, addr)) {
|
||||
code.call(f);
|
||||
} else {
|
||||
// ABI_RETURN is a safe temp register to use before a call
|
||||
code.mov(ABI_RETURN, addr);
|
||||
code.call(ABI_RETURN);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Common::X64
|
||||
// SPDX-FileCopyrightText: 2016 Citra Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <type_traits>
|
||||
#include <xbyak/xbyak.h>
|
||||
#include "common/x64/xbyak_abi.h"
|
||||
|
||||
namespace Common::X64 {
|
||||
|
||||
// Constants for use with cmpps/cmpss
|
||||
enum {
|
||||
CMP_EQ = 0,
|
||||
CMP_LT = 1,
|
||||
CMP_LE = 2,
|
||||
CMP_UNORD = 3,
|
||||
CMP_NEQ = 4,
|
||||
CMP_NLT = 5,
|
||||
CMP_NLE = 6,
|
||||
CMP_ORD = 7,
|
||||
};
|
||||
|
||||
constexpr bool IsWithin2G(uintptr_t ref, uintptr_t target) {
|
||||
const u64 distance = target - (ref + 5);
|
||||
return !(distance >= 0x8000'0000ULL && distance <= ~0x8000'0000ULL);
|
||||
}
|
||||
|
||||
inline bool IsWithin2G(const Xbyak::CodeGenerator& code, uintptr_t target) {
|
||||
return IsWithin2G(reinterpret_cast<uintptr_t>(code.getCurr()), target);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void CallFarFunction(Xbyak::CodeGenerator& code, const T f) {
|
||||
static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer.");
|
||||
size_t addr = reinterpret_cast<size_t>(f);
|
||||
if (IsWithin2G(code, addr)) {
|
||||
code.call(f);
|
||||
} else {
|
||||
// ABI_RETURN is a safe temp register to use before a call
|
||||
code.mov(ABI_RETURN, addr);
|
||||
code.call(ABI_RETURN);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Common::X64
|
||||
|
||||
Reference in New Issue
Block a user