2014/01/18(土)x86 で cntlz64
Counting of Leading Zeros.
Number of Leading Zeros. ともいう。
64bitのnlzをx86で求めるにはどうすりゃええんか? と思っていろいろいじってた。
ptn.2 は、__asmがかえって最適化を阻害するので、結局ptn.3の形に戻る。
ptn.3 は、どうも、わざわざスタック上に変数retを作って代入(そのまま使われない)しやがるのでやや無駄っぽいが、
それを無理に削ろうとして、ptn.4までいくと、inline化できなくなっちゃう。
結論、悩まず素直に書け?
ptn.5は分岐を後にもってきたバージョン。
無駄命令は減ってる気がするけど、やや遅くなった。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// BitScanForward.cpp | |
// compile with: /EHsc | |
#include <Windows.h> | |
#include <iostream> | |
#include <intrin.h> | |
static inline unsigned long bsr64_1(unsigned long long x) | |
{ | |
// ptn.1 -> 20171ms | |
struct lf { static inline unsigned long bsr(unsigned long x) { __asm { bsr eax, x } } }; | |
return x ? x >> 32 ? 31 - lf::bsr(x >> 32) : 63 - lf::bsr(x) : 64; | |
} | |
static inline unsigned long bsr64_2(unsigned long long x) | |
{ | |
#pragma warning(push) | |
#pragma warning(disable: 4035) | |
// ptn.2 -> 21719ms | |
struct X { unsigned long b, a; }; | |
X& xx = *reinterpret_cast<X*>(&x); | |
if (xx.a) { __asm bsr eax, xx.a __asm not eax __asm and eax, 31 } | |
else if (xx.b) { __asm bsr eax, xx.b __asm not eax __asm and eax, 63 } | |
else return 64; | |
#pragma warning(pop) | |
} | |
static inline unsigned long bsr64_3(unsigned long long x) | |
{ | |
// ptn.3 -> 19781ms | |
register unsigned long a = x >> 32; | |
register unsigned long b = x; | |
if (a) | |
{ | |
register unsigned long ret; | |
_BitScanReverse(&ret, a); | |
return ~ret & 31; | |
} | |
else if (b) | |
{ | |
register unsigned long ret; | |
_BitScanReverse(&ret, b); | |
return ~ret & 63; | |
} | |
else | |
{ | |
return 64; | |
} | |
} | |
// ptn.4 -> 23531ms | |
_declspec(naked) static long bsr64_4(unsigned long long) | |
{ | |
__asm | |
{ | |
push ebp; | |
mov ebp, esp; | |
mov eax, DWORD PTR[ebp + 4]; | |
test eax, eax; | |
je SHORT $LOWER32; | |
bsr eax, eax; | |
not eax; | |
and eax, 31; | |
pop ebp; | |
ret 0; | |
$LOWER32: | |
mov eax, DWORD PTR[ebp]; | |
test eax, eax; | |
je SHORT $ZERO; | |
bsr eax, eax; | |
not eax; | |
and eax, 63; | |
pop ebp; | |
ret 0; | |
$ZERO: | |
mov eax, 64; | |
pop ebp; | |
ret 0; | |
} | |
} | |
static inline unsigned long bsr64_5(unsigned long long x) | |
{ | |
// ptn.5-> 20187ms | |
register unsigned long ret; | |
return _BitScanReverse(&ret, x >> 32) | |
? ~ret & 31 | |
: _BitScanReverse(&ret, x) | |
? ~ret & 63 | |
: 64; | |
#if 0 // 生成物 | |
push ebp | |
mov ebp, esp | |
bsr eax, DWORD PTR _x$[ebp + 4] | |
je SHORT $LN5@bsr64_5 | |
not eax | |
and eax, 31 | |
pop ebp | |
ret 0 | |
$LN5@bsr64_5: | |
bsr eax, DWORD PTR _x$[ebp] | |
je SHORT $LN3@bsr64_5 | |
not eax | |
and eax, 63 | |
pop ebp | |
ret 0 | |
$LN3@bsr64_5: | |
mov eax, 64 | |
pop ebp | |
ret 0 | |
#endif | |
} | |
static int GetNumberOfLeadingZeros(unsigned long long n) | |
{ | |
// ptn.x -> 19504ms * 10 | |
n = n | (n >> 1); | |
n = n | (n >> 2); | |
n = n | (n >> 4); | |
n = n | (n >> 8); | |
n = n | (n >> 16); | |
n = n | (n >> 32); | |
n = ~n; | |
n = (n & 0x5555555555555555) + ((n >> 1) & 0x5555555555555555); | |
n = (n & 0x3333333333333333) + ((n >> 2) & 0x3333333333333333); | |
n = (n & 0x0F0F0F0F0F0F0F0F) + ((n >> 4) & 0x0F0F0F0F0F0F0F0F); | |
n = (n & 0x00FF00FF00FF00FF) + ((n >> 8) & 0x00FF00FF00FF00FF); | |
n = (n & 0x0000FFFF0000FFFF) + ((n >> 16) & 0x0000FFFF0000FFFF); | |
n = (n & 0x00000000FFFFFFFF) + ((n >> 32) & 0x00000000FFFFFFFF); | |
return n; | |
} | |
using namespace std; | |
int main() | |
{ | |
unsigned long long mask = 0x1000; | |
unsigned long index; | |
unsigned char isNonzero; | |
Sleep(1000); | |
DWORD x = GetTickCount(); | |
#define F(i) bsr64_5(i) | |
volatile int a = 0; | |
for (int i = 0; i < 1e9; i++) | |
{ | |
a = F(i); | |
a = F(i); | |
a = F(i); | |
a = F(i); | |
a = F(i); | |
a = F(i); | |
a = F(i); | |
a = F(i); | |
a = F(i); | |
a = F(i); | |
} | |
cout << (GetTickCount() - x) << endl; | |
} |