Why not use the standard library?
#include <bitset>
int bits_in(std::uint64_t u)
{
auto bs = std::bitset<64>(u);
return bs.count();
}
resulting assembler (Compiled with -O2 -march=native
):
bits_in(unsigned long):
xor eax, eax
popcnt rax, rdi
ret
It is worth mentioning at this point that not all x86 processors have this instruction so (at least with gcc) you will need to let it know what architecture to compile for.
@tambre mentioned that in reality, when it can, the optimiser will go further:
volatile int results[3];
int main()
{
results[0] = bits_in(255);
results[1] = bits_in(1023);
results[2] = bits_in(0x8000800080008000);
}
resulting assembler:
main:
mov DWORD PTR results[rip], 8
xor eax, eax
mov DWORD PTR results[rip+4], 10
mov DWORD PTR results[rip+8], 4
ret
Old-school bit-twiddlers like me need to find new problems to solve :)
Update
Not everyone was happy that the solution relies on cpu help to compute the bitcount. So what if we used an autogenerated table but allowed the developer to configure the size of it? (warning - long compile time for the 16-bit table version)
#include <utility>
#include <cstdint>
#include <array>
#include <numeric>
#include <bitset>
template<std::size_t word_size, std::size_t...Is>
constexpr auto generate(std::integral_constant<std::size_t, word_size>, std::index_sequence<Is...>) {
struct popcount_type {
constexpr auto operator()(int i) const {
int bits = 0;
while (i) {
i &= i - 1;
++bits;
}
return bits;
}
};
constexpr auto popcnt = popcount_type();
return std::array<int, sizeof...(Is)>
{
{popcnt(Is)...}
};
}
template<class T>
constexpr auto power2(T x) {
T result = 1;
for (T i = 0; i < x; ++i)
result *= 2;
return result;
}
template<class TableWord>
struct table {
static constexpr auto word_size = std::numeric_limits<TableWord>::digits;
static constexpr auto table_length = power2(word_size);
using array_type = std::array<int, table_length>;
static const array_type& get_data() {
static const array_type data = generate(std::integral_constant<std::size_t, word_size>(),
std::make_index_sequence<table_length>());
return data;
};
};
template<class Word>
struct use_table_word {
};
template<class Word, class TableWord = std::uint8_t>
int bits_in(Word val, use_table_word<TableWord> = use_table_word<TableWord>()) {
constexpr auto table_word_size = std::numeric_limits<TableWord>::digits;
constexpr auto word_size = std::numeric_limits<Word>::digits;
constexpr auto times = word_size / table_word_size;
static_assert(times > 0, "incompatible");
auto reduce = [val](auto times) {
return (val >> (table_word_size * times)) & (power2(table_word_size) - 1);
};
auto const& data = table<TableWord>::get_data();
auto result = 0;
for (int i = 0; i < times; ++i) {
result += data[reduce(i)];
}
return result;
}
volatile int results[3];
#include <iostream>
int main() {
auto input = std::uint64_t(1023);
results[0] = bits_in(input);
results[0] = bits_in(input, use_table_word<std::uint16_t>());
results[1] = bits_in(0x8000800080008000);
results[2] = bits_in(34567890);
for (int i = 0; i < 3; ++i) {
std::cout << results[i] << std::endl;
}
return 0;
}
Final Update
This version allows the use of any number of bits in the lookup table and supports any input type, even if it's smaller than the number of bits in the lookup table.
It also short-circuits if the high bits are zero.
#include <utility>
#include <cstdint>
#include <array>
#include <numeric>
#include <algorithm>
namespace detail {
template<std::size_t bits, typename = void>
struct smallest_word;
template<std::size_t bits>
struct smallest_word<bits, std::enable_if_t<(bits <= 8)>>
{
using type = std::uint8_t;
};
template<std::size_t bits>
struct smallest_word<bits, std::enable_if_t<(bits > 8 and bits <= 16)>>
{
using type = std::uint16_t;
};
template<std::size_t bits>
struct smallest_word<bits, std::enable_if_t<(bits > 16 and bits <= 32)>>
{
using type = std::uint32_t;
};
template<std::size_t bits>
struct smallest_word<bits, std::enable_if_t<(bits > 32 and bits <= 64)>>
{
using type = std::uint64_t;
};
}
template<std::size_t bits> using smallest_word = typename detail::smallest_word<bits>::type;
template<class WordType, std::size_t bits, std::size_t...Is>
constexpr auto generate(std::index_sequence<Is...>) {
using word_type = WordType;
struct popcount_type {
constexpr auto operator()(word_type i) const {
int result = 0;
while (i) {
i &= i - 1;
++result;
}
return result;
}
};
constexpr auto popcnt = popcount_type();
return std::array<word_type, sizeof...(Is)>
{
{popcnt(Is)...}
};
}
template<class T>
constexpr auto power2(T x) {
return T(1) << x;
}
template<std::size_t word_size>
struct table {
static constexpr auto table_length = power2(word_size);
using word_type = smallest_word<word_size>;
using array_type = std::array<word_type, table_length>;
static const array_type& get_data() {
static const array_type data = generate<word_type, word_size>(std::make_index_sequence<table_length>());
return data;
};
template<class Type, std::size_t bits>
static constexpr auto n_bits() {
auto result = Type();
auto b = bits;
while(b--) {
result = (result << 1) | Type(1);
}
return result;
};
template<class Uint>
int operator()(Uint i) const {
constexpr auto mask = n_bits<Uint, word_size>();
return get_data()[i & mask];
}
};
template<int bits>
struct use_bits {
static constexpr auto digits = bits;
};
template<class T>
constexpr auto minimum(T x, T y) { return x < y ? x : y; }
template<class Word, class UseBits = use_bits<8>>
int bits_in(Word val, UseBits = UseBits()) {
using word_type = std::make_unsigned_t<Word>;
auto uval = static_cast<word_type>(val);
constexpr auto table_word_size = UseBits::digits;
constexpr auto word_size = std::numeric_limits<word_type>::digits;
auto const& mytable = table<table_word_size>();
int result = 0;
while (uval)
{
result += mytable(uval);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wshift-count-overflow"
uval >>= minimum(table_word_size, word_size);
#pragma clang diagnostic pop
}
return result;
}
volatile int results[4];
#include <iostream>
int main() {
auto input = std::uint8_t(127);
results[0] = bits_in(input);
results[1] = bits_in(input, use_bits<4>());
results[2] = bits_in(input, use_bits<11>());
results[3] = bits_in(input, use_bits<15>());
for (auto&& i : results) {
std::cout << i << std::endl;
}
auto input2 = 0xabcdef;
results[0] = bits_in(input2);
results[1] = bits_in(input2, use_bits<4>());
results[2] = bits_in(input2, use_bits<11>());
results[3] = bits_in(input2, use_bits<15>());
for (auto&& i : results) {
std::cout << i << std::endl;
}
auto input3 = -1;
results[0] = bits_in(input3);
results[1] = bits_in(input3, use_bits<4>());
results[2] = bits_in(input3, use_bits<11>());
results[3] = bits_in(input3, use_bits<15>());
for (auto&& i : results) {
std::cout << i << std::endl;
}
return 0;
}
example output:
7
7
7
7
17
17
17
17
32
32
32
32
The resulting assembly output for a call to bits_in(int, use_bits<11>())
for example, becomes:
.L16:
mov edx, edi
and edx, 2047
movzx edx, WORD PTR table<11ul>::get_data()::data[rdx+rdx]
add eax, edx
shr edi, 11
jne .L16
Which seems reasonable to me.