Load all 4 bytes at once with a single lane load (vld1 <register>[<lane>], [<address>]), then widen them into a q-register with vmovl, first to 16-bit and then to 32-bit elements. The syntax below is GNU assembler syntax.
vld1.32 {d0[0]}, [<address>]  @ Now d0 = (*<addr>, *<addr+1>, *<addr+2>, *<addr+3>, <junk>, ... <junk>)
vmovl.u8 q1, d0               @ Now q1 = (d2, d3) = ((uint16_t)*<addr>, ... (uint16_t)*<addr+3>, <junk>, ... <junk>)
vmovl.u16 q0, d2              @ Now q0 = (d0, d1) = ((uint32_t)*<addr>, ... (uint32_t)*<addr+3>)
Note that <address> must be 4-byte aligned here; if you can guarantee that, you can also write the operand as [<address>:32] to give an explicit alignment hint, which can make the load slightly faster.
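For reference, here is how the three instructions might be packaged as a complete, assemblable GNU-as function. This is only a sketch of mine, not part of the original answer: it assumes the byte pointer arrives in r0 and that the uint32x4_t result is returned in q0 under the hard-float AAPCS, and the function name is made up.

@ Sketch only: byte pointer in r0, result returned in q0 = (d0, d1)
    .syntax unified
    .fpu    neon
    .global load_u8x4_to_u32x4
load_u8x4_to_u32x4:
    vld1.32 {d0[0]}, [r0]    @ 4 bytes -> low half of d0
    vmovl.u8  q1, d0         @ widen to uint16_t; the useful values land in d2
    vmovl.u16 q0, d2         @ widen to uint32_t in q0 = (d0, d1)
    bx lr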
Ah, I just realized that you want to use intrinsics rather than assembly, so here is the same thing written with intrinsics.
#include <arm_neon.h>

uint32x2_t v8 = vdup_n_u32(0);   // note: uint32x2_t, not uint32x4_t -- vld1_lane_u32 works on 64-bit vectors
v8 = vld1_lane_u32(ptr, v8, 0);  // load the 4 bytes into lane 0 (ptr is a const uint32_t * to those bytes)
const uint16x4_t v16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(v8)));  // widen u8 -> u16, keep the 4 useful lanes
const uint32x4_t v32 = vmovl_u16(v16);                                   // widen u16 -> u32
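If it helps, the sequence could be wrapped into a complete, compilable helper like the one below. This is a sketch of mine, not part of the original answer: the function name is made up, and the memcpy is just one way to sidestep the strict-aliasing and alignment questions of casting a uint8_t* to uint32_t*.

#include <arm_neon.h>
#include <string.h>

static inline uint32x4_t load_u8x4_to_u32x4(const uint8_t *p)
{
    uint32_t word;
    memcpy(&word, p, sizeof word);                         // copy the 4 bytes into one 32-bit word
    uint32x2_t v8 = vset_lane_u32(word, vdup_n_u32(0), 0); // place the word in lane 0
    const uint16x4_t v16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(v8)));  // u8 -> u16
    return vmovl_u16(v16);                                 // u16 -> u32, 4 lanes
}

Usage would then just be something like: uint32x4_t v = load_u8x4_to_u32x4(src);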