using arm neon inline assembly and using it only if in 32 bits (#1201)
using arm neon inline assembly
This commit is contained in:
parent
7e0a2dbcbb
commit
e0d7729f87
|
@ -37,10 +37,6 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#if TORRENT_HAS_ARM_NEON
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
namespace libtorrent
|
||||
{
|
||||
bool bitfield::all_set() const
|
||||
|
@ -85,18 +81,20 @@ namespace libtorrent
|
|||
}
|
||||
#endif // TORRENT_HAS_SSE
|
||||
|
||||
#if TORRENT_HAS_ARM_NEON
|
||||
#if TORRENT_HAS_ARM_NEON && defined __arm__
|
||||
if (aux::arm_neon_support)
|
||||
{
|
||||
for (int i = 1; i < words + 1; ++i)
|
||||
{
|
||||
uint8x8_t const in_val = vld1_u8(reinterpret_cast<unsigned char*>(&m_buf[i]));
|
||||
uint8x8_t const cnt8x8_val = vcnt_u8(in_val);
|
||||
uint16x4_t const cnt16x4_val = vpaddl_u8(cnt8x8_val);
|
||||
uint32x2_t const cnt32x2_val = vpaddl_u16(cnt16x4_val);
|
||||
uint32_t cnt;
|
||||
vst1_u32(&cnt, cnt32x2_val);
|
||||
|
||||
std::uint32_t cnt;
|
||||
__asm__(
|
||||
"vld1.u32 d0[0], [%1] \n"
|
||||
"vcnt.u8 d0, d0 \n"
|
||||
"vpaddl.u8 d0, d0 \n"
|
||||
"vpaddl.u16 d0, d0 \n"
|
||||
"vst1.u32 d0[0], [%0]"
|
||||
:: "r"(&cnt), "r"(&m_buf[i])
|
||||
: "d0", "memory");
|
||||
ret += cnt;
|
||||
}
|
||||
|
||||
|
|
|
@ -101,7 +101,9 @@ namespace libtorrent { namespace aux
|
|||
return (getauxval(16) & (1 << 12));
|
||||
#elif defined __aarch64__
|
||||
//return (getauxval(AT_HWCAP) & HWCAP_ASIMD);
|
||||
return (getauxval(16) & (1 << 1));
|
||||
//return (getauxval(16) & (1 << 1));
|
||||
// TODO: enable when aarch64 is really tested
|
||||
return false;
|
||||
#endif
|
||||
#else
|
||||
return false;
|
||||
|
|
Loading…
Reference in New Issue