What is a time-efficient algorithm for copying unaligned bit arrays?

I've had to do this many times in the past, and I've never been happy with the results.

Can anyone suggest a fast way to copy a contiguous array of bits from a source to a destination, where both the source and the destination may be offset (shifted right) from convenient processor boundaries?

If neither the source nor the destination is aligned, the problem can quickly be reduced to one where only one of them is unaligned (after the first copy).
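
For example, a naive bit-at-a-time head copy is already enough for that reduction. A rough sketch (hypothetical helper, not part of my code; bit 0 is taken to be the most significant bit of byte 0, matching the masks below):

#include <limits.h>

/* Copy single bits until the destination index reaches a byte boundary;
 * afterwards only the source remains unaligned. */
static void align_dst_head(const unsigned char *src, int *src_off,
                           unsigned char *dst, int *dst_off, int *len)
{
    while (*len > 0 && (*dst_off % CHAR_BIT) != 0) {
        int sbit = CHAR_BIT - 1 - *src_off % CHAR_BIT;
        int dbit = CHAR_BIT - 1 - *dst_off % CHAR_BIT;
        int bit  = (src[*src_off / CHAR_BIT] >> sbit) & 1;

        dst[*dst_off / CHAR_BIT] &= (unsigned char)~(1u << dbit); /* clear */
        dst[*dst_off / CHAR_BIT] |= (unsigned char)(bit << dbit); /* set   */
        ++*src_off, ++*dst_off, --*len;
    }
}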

As a starting point, my code inevitably ends up looking something like this (untested, ignore side effects, it's just an off-the-cuff example):

const unsigned char mask[9] = { 0, 1, 3, 7, 15, 31, 63, 127, 255 }; /* mask[i]: low i bits set */
/* Assume:
 * - the destination is already zeroed,
 * - offsets are right shifts,
 * - the number of bits to copy is large (say > 32)
 */
void bitarray_copy(unsigned char *src, int src_bit_offset, int src_bit_len,
                   unsigned char *dst, int dst_bit_offset) {
    if (src_bit_offset == dst_bit_offset) { /* Not very interesting */
    } else {
        int bit_diff_offset = src_bit_offset - dst_bit_offset; /* assume positive */
        int loop_count;
        unsigned char c;
        unsigned char mask_val = mask[bit_diff_offset];

        /* Get started: line up the destination. */
        c  = *src++ << bit_diff_offset;
        c |= (*src >> (8 - bit_diff_offset)) & mask_val;
        c &= mask[8 - dst_bit_offset];

        *dst++ |= c;

        src_bit_len -= 8 - dst_bit_offset;
        loop_count = src_bit_len >> 3;

        while (--loop_count >= 0) {
            c  = *src++ << bit_diff_offset;
            c |= (*src >> (8 - bit_diff_offset)) & mask_val;
            *dst++ = c;
        }

        /* Trailing tail copy, etc. */
        if (src_bit_len % 8) { /* ... */ }
    }
}

(Actually, this is already better than what I've done before. It doesn't look so bad.)

Here is what I ended up with. (Edited 8/21/2014.)

#include <limits.h>
#include <string.h>
#include <stddef.h>

#define PREPARE_FIRST_COPY()                                      \
    do {                                                          \
    if (src_len >= (CHAR_BIT - dst_offset_modulo)) {              \
        /* Copy spills past the first destination byte: */        \
        /* clear the bits from dst_offset_modulo onward. */       \
        *dst     &= reverse_mask[dst_offset_modulo];              \
        src_len -= CHAR_BIT - dst_offset_modulo;                  \
    } else {                                                      \
        /* Whole copy fits in the first destination byte: */      \
        /* clear just the target window, trim c to match. */      \
        *dst     &= reverse_mask[dst_offset_modulo]               \
              | reverse_mask_xor[dst_offset_modulo + src_len];    \
        c        &= reverse_mask[dst_offset_modulo + src_len];    \
        src_len = 0;                                              \
    } } while (0)


static void
bitarray_copy(const unsigned char *src_org, int src_offset, int src_len,
                    unsigned char *dst_org, int dst_offset)
{
    static const unsigned char mask[] =
        { 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff };
    static const unsigned char reverse_mask[] =
        { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff };
    static const unsigned char reverse_mask_xor[] =
        { 0xff, 0x7f, 0x3f, 0x1f, 0x0f, 0x07, 0x03, 0x01, 0x00 };

    if (src_len) {
        const unsigned char *src;
              unsigned char *dst;
        int                  src_offset_modulo,
                             dst_offset_modulo;

        src = src_org + (src_offset / CHAR_BIT);
        dst = dst_org + (dst_offset / CHAR_BIT);

        src_offset_modulo = src_offset % CHAR_BIT;
        dst_offset_modulo = dst_offset % CHAR_BIT;

        if (src_offset_modulo == dst_offset_modulo) {
            int              byte_len;
            int              src_len_modulo;
            if (src_offset_modulo) {
                unsigned char   c;

                c = reverse_mask_xor[dst_offset_modulo]     & *src++;

                PREPARE_FIRST_COPY();
                *dst++ |= c;
            }

            byte_len = src_len / CHAR_BIT;
            src_len_modulo = src_len % CHAR_BIT;

            if (byte_len) {
                memcpy(dst, src, byte_len);
                src += byte_len;
                dst += byte_len;
            }
            if (src_len_modulo) {
                *dst     &= reverse_mask_xor[src_len_modulo];
                *dst |= reverse_mask[src_len_modulo]     & *src;
            }
        } else {
            int             bit_diff_ls,
                            bit_diff_rs;
            int             byte_len;
            int             src_len_modulo;
            unsigned char   c;
            /*
             * Begin: Line things up on destination. 
             */
            if (src_offset_modulo > dst_offset_modulo) {
                bit_diff_ls = src_offset_modulo - dst_offset_modulo;
                bit_diff_rs = CHAR_BIT - bit_diff_ls;

                c = *src++ << bit_diff_ls;
                c |= *src >> bit_diff_rs;
                c     &= reverse_mask_xor[dst_offset_modulo];
            } else {
                bit_diff_rs = dst_offset_modulo - src_offset_modulo;
                bit_diff_ls = CHAR_BIT - bit_diff_rs;

                c = (*src >> bit_diff_rs) &
                    reverse_mask_xor[dst_offset_modulo];
            }
            PREPARE_FIRST_COPY();
            *dst++ |= c;

            /*
             * Middle: copy with only shifting the source. 
             */
            byte_len = src_len / CHAR_BIT;

            while (--byte_len >= 0) {
                c = *src++ << bit_diff_ls;
                c |= *src >> bit_diff_rs;
                *dst++ = c;
            }

            /*
             * End: copy the remaining bits. 
             */
            src_len_modulo = src_len % CHAR_BIT;
            if (src_len_modulo) {
                c = *src++ << bit_diff_ls;
                c |= *src >> bit_diff_rs;
                c     &= reverse_mask[src_len_modulo];

                *dst     &= reverse_mask_xor[src_len_modulo];
                *dst |= c;
            }
        }
    }
}
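
A quick harness one might use to exercise it (a test sketch of mine, not part of the routine above; note the routine may read up to one byte past the last source byte it actually needs, so size the source buffer with a little slack):

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>

/* Naive reference: one bit at a time, MSB-first within each byte. */
static int get_bit(const unsigned char *p, int i)
{
    return (p[i / CHAR_BIT] >> (CHAR_BIT - 1 - i % CHAR_BIT)) & 1;
}

static void set_bit(unsigned char *p, int i, int v)
{
    unsigned char m = (unsigned char)(1u << (CHAR_BIT - 1 - i % CHAR_BIT));
    if (v) p[i / CHAR_BIT] |= m; else p[i / CHAR_BIT] &= (unsigned char)~m;
}

int main(void)
{
    const unsigned char src[8] = { 0xde, 0xad, 0xbe, 0xef, 0x01, 0x23, 0x45, 0x67 };
    unsigned char a[8], b[8];

    for (int sofs = 0; sofs < 8; sofs++)
        for (int dofs = 0; dofs < 8; dofs++)
            for (int len = 1; len <= 40; len++) {
                memset(a, 0xaa, sizeof a); /* nonzero background catches clobbering */
                memcpy(b, a, sizeof a);
                bitarray_copy(src, sofs, len, a, dofs);
                for (int i = 0; i < len; i++)
                    set_bit(b, dofs + i, get_bit(src, sofs + i));
                assert(memcmp(a, b, sizeof a) == 0);
            }
    puts("all cases match");
    return 0;
}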

What is fastest will depend on the target. On a processor without a barrel shifter, shifting the whole buffer one bit at a time, n times, can be the quickest approach for n < 3 (on a PIC18, an 8x-unrolled one-bit shift loop costs about 11 instructions per 8 bytes). Otherwise, a two-register loop like the following works well (src2 must hold the previous source byte on entry):

  src1 = *src++;
  src2 = (src1 shl shiftamount1) | (src2 shr shiftamount2);
  *dest++ = src2;
  src2 = *src++;
  src1 = (src2 shl shiftamount1) | (src1 shr shiftamount2);
  *dest++ = src1;
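
In C, the same alternating two-register loop might be written as follows (a transliteration of the pseudocode above, under the same assumptions: src2 is preloaded with the previous source byte, and shiftamount1 + shiftamount2 == 8):

#include <stddef.h>

static void shifted_copy_pairs(const unsigned char *src, unsigned char *dest,
                               size_t pairs, unsigned shiftamount1,
                               unsigned shiftamount2, unsigned char src2)
{
    unsigned char src1;

    while (pairs--) {
        src1 = *src++;
        src2 = (unsigned char)((src1 << shiftamount1) | (src2 >> shiftamount2));
        *dest++ = src2;
        src2 = *src++;
        src1 = (unsigned char)((src2 << shiftamount1) | (src1 >> shiftamount2));
        *dest++ = src1;
    }
}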

On an ARM, the best bet is probably to keep src, dest, src1, src2, shiftamount1, and shiftamount2 all in registers and unroll the loop so that several words are read and written per iteration. Something like this (register pressure permitting, unroll further):

  src0 = *src++;
  src1 = *src++;
  src2 = *src++;
  src3 = *src++;
  tmp  = src0;
  src0 = src0 shr shiftamount1;
  src0 = src0 | src1 shl shiftamount2;
  src1 = src1 shr shiftamount1;
  src1 = src1 | src2 shl shiftamount2;
  src2 = src2 shr shiftamount1;
  src2 = src2 | src3 shl shiftamount2;
  src3 = src3 shr shiftamount1;
  src3 = src3 | tmp shl shiftamount2;
  *dest++ = src0;
  *dest++ = src1;
  *dest++ = src2;
  *dest++ = src3;

That works out to eleven instructions per 16 bytes rotated.


Your solution is similar to what I've seen elsewhere: do the ragged work at the beginning and at the end, with the main loop in the middle operating on aligned data. If you really need efficiency and run this over very long bit streams, I would suggest using something like SSE2 in the main loop.
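
Short of full SSE2, the same structure already speeds up considerably if the middle loop moves 64 bits at a time. A hedged sketch (the helper names are mine; big-endian word loads preserve the MSB-first bit order used in the code above, and the caller must guarantee one extra chunk of source is readable):

#include <stdint.h>
#include <stddef.h>

static uint64_t load_be64(const unsigned char *p)
{
    uint64_t w = 0;
    for (int i = 0; i < 8; i++)
        w = (w << 8) | p[i];          /* byte 0 becomes the most significant */
    return w;
}

static void store_be64(unsigned char *p, uint64_t w)
{
    for (int i = 7; i >= 0; i--) {
        p[i] = (unsigned char)(w & 0xff);
        w >>= 8;
    }
}

/* 64-bit analogue of the byte-wide middle loop:
 * out = (cur << ls) | (next >> (64 - ls)), one chunk per iteration. */
static void shifted_middle_64(const unsigned char *src, unsigned char *dst,
                              size_t chunks, unsigned ls /* 0 < ls < 64 */)
{
    uint64_t cur = load_be64(src);

    while (chunks--) {
        uint64_t next = load_be64(src + 8);
        store_be64(dst, (cur << ls) | (next >> (64 - ls)));
        cur = next;
        src += 8;
        dst += 8;
    }
}

An SSE2 version would follow the same pattern with 128-bit loads, though SSE2's lack of a full cross-register bit shift makes the shift/OR step a little more involved.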
