Do not use the _umul128() intrinsic with MSVC, as it is not faster at all + some code clean-up.

This commit is contained in:
LoRd_MuldeR 2021-04-09 01:00:29 +02:00
parent 3cdfc9d9ab
commit d76d0f6e4b
Signed by: mulder
GPG Key ID: 2B5913365F57E03F

View File

@ -12,12 +12,6 @@
#include <limits.h>
#include <assert.h>
/* Intrinsic */
#if defined(_MSC_VER) && defined(_M_X64)
# include <intrin.h>
# pragma intrinsic(_umul128)
#endif
/* Compiler compatibility */
#if defined(_MSC_VER)
# define FORCE_INLINE __forceinline
@ -103,17 +97,14 @@ static FORCE_INLINE uint8_t byte_u64(const uint64_t value, const size_t off)
// 128-Bit math support
// ==========================================================================
static FORCE_INLINE void mult_u128(uint128_t *const out, const uint128_t lhs, const uint128_t rhs)
#define READ_U128(X) ((((__uint128_t)(X).hi) << 64U) | ((__uint128_t)(X).lo))
static FORCE_INLINE void multiply_u128(uint128_t *const out, const uint128_t lhs, const uint128_t rhs)
{
#if defined(__GNUC__) && defined(__SIZEOF_INT128__)
const __uint128_t lhs_128 = ((__uint128_t)lhs.hi << 64U) | lhs.lo;
const __uint128_t rhs_128 = ((__uint128_t)rhs.hi << 64U) | rhs.lo;
const __uint128_t out_128 = lhs_128 * rhs_128;
out->hi = (uint64_t)(out_128 >> 64U);
out->lo = (uint64_t)(out_128 & 0xFFFFFFFFFFFFFFFF);
#else
#if defined(_MSC_VER) && defined(_M_X64)
out->lo = _umul128(lhs.lo, rhs.lo, &out->hi);
const __uint128_t tmp = READ_U128(lhs) * READ_U128(rhs);
out->hi = (uint64_t)(tmp >> 64U);
out->lo = (uint64_t)(tmp & 0xFFFFFFFFFFFFFFFF);
#else
const uint64_t lolo = (lhs.lo & 0xFFFFFFFF) * (rhs.lo & 0xFFFFFFFF);
const uint64_t hilo = (lhs.lo >> 32U) * (rhs.lo & 0xFFFFFFFF);
@ -122,7 +113,6 @@ static FORCE_INLINE void mult_u128(uint128_t *const out, const uint128_t lhs, co
const uint64_t crss = (lolo >> 32U) + (hilo & 0xFFFFFFFF) + lohi;
out->hi = (hilo >> 32U) + (crss >> 32) + hihi;
out->lo = (crss << 32U) | (lolo & 0xFFFFFFFF);
#endif
out->hi += (lhs.hi * rhs.lo) + (lhs.lo * rhs.hi); /* 128x128=128 */
#endif
}
@ -140,7 +130,7 @@ static FORCE_INLINE void hash_update_str(uint128_t *const hash, const uint8_t *c
for (i = 0U; i < data_len; ++i)
{
hash->lo ^= data[i];
mult_u128(hash, *hash, HASH_MAGIC_PRIME);
multiply_u128(hash, *hash, HASH_MAGIC_PRIME);
}
}
@ -150,7 +140,7 @@ static FORCE_INLINE void hash_update_u64(uint128_t *const hash, const uint64_t v
for (i = 0U; i < sizeof(uint64_t); ++i)
{
hash->lo ^= byte_u64(value, i);
mult_u128(hash, *hash, HASH_MAGIC_PRIME);
multiply_u128(hash, *hash, HASH_MAGIC_PRIME);
}
}
@ -179,9 +169,9 @@ static FORCE_INLINE uint64_t keygen_loop(uint64_t salt, const uint8_t *const pas
static void generate_key(uint64_t *const key, const uint64_t salt, const uint16_t pepper, const uint8_t *const passwd, const size_t passwd_len)
{
key[0U] = keygen_loop(0x243F6A8885A308D3 + salt + pepper, passwd, passwd_len);
key[1U] = keygen_loop(0x13198A2E03707344 + salt + pepper, passwd, passwd_len);
key[2U] = keygen_loop(0xA4093822299F31D0 + salt + pepper, passwd, passwd_len);
key[0U] = keygen_loop(0x162603FA1CDA99D3 + salt + pepper, passwd, passwd_len);
key[1U] = keygen_loop(0xBFDEC4A6C1A46E09 + salt + pepper, passwd, passwd_len);
key[2U] = keygen_loop(0x6BA17D11624973EE + salt + pepper, passwd, passwd_len);
}
// ==========================================================================