What is a better method for packing 4 bytes into 3 than this?

Question

What is a better method for packing 4 bytes into 3 than this?

I have an array of values well within the range of 0-63, and I decided to pack every 4 bytes in 3, because the values only require 6 bits, and I could use an extra 2 bits to store the first two bits of the next value, etc. .

I never did this before I used the switch and the nextbit variable (a device with a state machine like a device) to do the packing and keep track of the start bit. I am convinced, however, that there must be a better way.

Suggestions / tips, please, but do not ruin my pleasure; -)

What are the portability issues regarding big / small endian?

btw: I checked that this code works by unpacking it again and comparing it with the input. And no, this is not homework, just an exercise that I set for myself.

 /* build with gcc -std=c99 -Wconversion */ #define ASZ 400 typedef unsigned char uc_; uc_ data[ASZ]; int i; for (i = 0; i < ASZ; ++i) { data[i] = (uc_)(i % 0x40); } size_t dl = sizeof(data); printf("sizeof(data):%z\n",dl); float fpl = ((float)dl / 4.0f) * 3.0f; size_t pl = (size_t)(fpl > (float)((int)fpl) ? fpl + 1 : fpl); printf("length of packed data:%z\n",pl); for (i = 0; i < dl; ++i) printf("%02d ", data[i]); printf("\n"); uc_ * packeddata = calloc(pl, sizeof(uc_)); uc_ * byte = packeddata; uc_ nextbit = 1; for (int i = 0; i < dl; ++i) { uc_ m = (uc_)(data[i] & 0x3f); switch(nextbit) { case 1: /* all 6 bits of m into first 6 bits of byte: */ *byte = m; nextbit = 7; break; case 3: /* all 6 bits of m into last 6 bits of byte: */ *byte++ = (uc_)(*byte | (m << 2)); nextbit = 1; break; case 5: /* 1st 4 bits of m into last 4 bits of byte: */ *byte++ = (uc_)(*byte | ((m & 0x0f) << 4)); /* 5th and 6th bits of m into 1st and 2nd bits of byte: */ *byte = (uc_)(*byte | ((m & 0x30) >> 4)); nextbit = 3; break; case 7: /* 1st 2 bits of m into last 2 bits of byte: */ *byte++ = (uc_)(*byte | ((m & 0x03) << 6)); /* next (last) 4 bits of m into 1st 4 bits of byte: */ *byte = (uc_)((m & 0x3c) >> 2); nextbit = 5; break; } }

+4

c arrays bit-manipulation bit-packing

James morris Nov 03 '09 at 0:30

source share

5 answers

Check out IETF RFC 4648 for “Base16, Base32, and Base64 Data Codes.”

Partial Code Criticism:

 size_t dl = sizeof(data); printf("sizeof(data):%d\n",dl); float fpl = ((float)dl / 4.0f) * 3.0f; size_t pl = (size_t)(fpl > (float)((int)fpl) ? fpl + 1 : fpl); printf("length of packed data:%d\n",pl);

Do not use floating point things - just use integers. And use "% z" to print the "size_t" values - if you have a C99 library.

 size_t pl = ((dl + 3) / 4) * 3;

I think your loop could be simplified by dealing with 3-byte input modules until you have a partial unit left, and then deal with the remainder of 1 or 2 bytes as special cases. I note that the standard mention says that you use one or two "=" signs to fill in at the end.

I have a Base64 encoder and decoding that does some of them. You describe the “decoded” part of Base64, where the Base64 code has 4 bytes of data, which should be stored in only 3 - as a packaging code. The Base64 encoder matches the unpacker you need.

Base-64 Decoder

Note: base_64_inv is an array of 256 values, one for each possible input byte value; it determines the correct decoded value for each encoded byte. Base64 encoded is a sparse array - 3/4 zeros. Similarly, base_64_map is a mapping between the value 0..63 and the corresponding storage value.

 enum { DC_PAD = -1, DC_ERR = -2 }; static int decode_b64(int c) { int b64 = base_64_inv[c]; if (c == base64_pad) b64 = DC_PAD; else if (b64 == 0 && c != base_64_map[0]) b64 = DC_ERR; return(b64); } /* Decode 4 bytes into 3 */ static int decode_quad(const char *b64_data, char *bin_data) { int b0 = decode_b64(b64_data[0]); int b1 = decode_b64(b64_data[1]); int b2 = decode_b64(b64_data[2]); int b3 = decode_b64(b64_data[3]); int bytes; if (b0 < 0 || b1 < 0 || b2 == DC_ERR || b3 == DC_ERR || (b2 == DC_PAD && b3 != DC_PAD)) return(B64_ERR_INVALID_ENCODED_DATA); if (b2 == DC_PAD && (b1 & 0x0F) != 0) /* 3rd byte is '='; 2nd byte must end with 4 zero bits */ return(B64_ERR_INVALID_TRAILING_BYTE); if (b2 >= 0 && b3 == DC_PAD && (b2 & 0x03) != 0) /* 4th byte is '='; 3rd byte is not '=' and must end with 2 zero bits */ return(B64_ERR_INVALID_TRAILING_BYTE); bin_data[0] = (b0 << 2) | (b1 >> 4); bytes = 1; if (b2 >= 0) { bin_data[1] = ((b1 & 0x0F) << 4) | (b2 >> 2); bytes = 2; } if (b3 >= 0) { bin_data[2] = ((b2 & 0x03) << 6) | (b3); bytes = 3; } return(bytes); } /* Decode input Base-64 string to original data. Output length returned, or negative error */ int base64_decode(const char *data, size_t datalen, char *buffer, size_t buflen) { size_t outlen = 0; if (datalen % 4 != 0) return(B64_ERR_INVALID_ENCODED_LENGTH); if (BASE64_DECLENGTH(datalen) > buflen) return(B64_ERR_OUTPUT_BUFFER_TOO_SMALL); while (datalen >= 4) { int nbytes = decode_quad(data, buffer + outlen); if (nbytes < 0) return(nbytes); outlen += nbytes; data += 4; datalen -= 4; } assert(datalen == 0); /* By virtue of the %4 check earlier */ return(outlen); }

Base-64 Encoder

 /* Encode 3 bytes of data into 4 */ static void encode_triplet(const char *triplet, char *quad) { quad[0] = base_64_map[(triplet[0] >> 2) & 0x3F]; quad[1] = base_64_map[((triplet[0] & 0x03) << 4) | ((triplet[1] >> 4) & 0x0F)]; quad[2] = base_64_map[((triplet[1] & 0x0F) << 2) | ((triplet[2] >> 6) & 0x03)]; quad[3] = base_64_map[triplet[2] & 0x3F]; } /* Encode 2 bytes of data into 4 */ static void encode_doublet(const char *doublet, char *quad, char pad) { quad[0] = base_64_map[(doublet[0] >> 2) & 0x3F]; quad[1] = base_64_map[((doublet[0] & 0x03) << 4) | ((doublet[1] >> 4) & 0x0F)]; quad[2] = base_64_map[((doublet[1] & 0x0F) << 2)]; quad[3] = pad; } /* Encode 1 byte of data into 4 */ static void encode_singlet(const char *singlet, char *quad, char pad) { quad[0] = base_64_map[(singlet[0] >> 2) & 0x3F]; quad[1] = base_64_map[((singlet[0] & 0x03) << 4)]; quad[2] = pad; quad[3] = pad; } /* Encode input data as Base-64 string. Output length returned, or negative error */ static int base64_encode_internal(const char *data, size_t datalen, char *buffer, size_t buflen, char pad) { size_t outlen = BASE64_ENCLENGTH(datalen); const char *bin_data = (const void *)data; char *b64_data = (void *)buffer; if (outlen > buflen) return(B64_ERR_OUTPUT_BUFFER_TOO_SMALL); while (datalen >= 3) { encode_triplet(bin_data, b64_data); bin_data += 3; b64_data += 4; datalen -= 3; } b64_data[0] = '\0'; if (datalen == 2) encode_doublet(bin_data, b64_data, pad); else if (datalen == 1) encode_singlet(bin_data, b64_data, pad); b64_data[4] = '\0'; return((b64_data - buffer) + strlen(b64_data)); }

I complicate my life by dealing with a product that uses the alphabet variant for Base64 encoding, and also does not allow processing data - therefore, the argument is "pad" (which can be zero for "zero fill" or "= 'for the standard complement. Array base_64_map contains the alphabet to use for 6-bit values in the range 0..63.

+4

Jonathan leffler Nov 03 '09 at 12:38

source share

Another easy way to do this is to use bit fields. One of the lesser-known corners of C struct syntax is a large field. Let's say you have the following structure:

 struct packed_bytes { byte chunk1 : 6; byte chunk2 : 6; byte chunk3 : 6; byte chunk4 : 6; };

This declares chunk1 , chunk2 , chunk3 and chunk4 to be of type byte , with only 6 bits in the structure. As a result, we get sizeof(struct packed_bytes) == 3 . Now you need a small function to take your array and unload it into the structure as follows:

 void dump_to_struct(byte *in, struct packed_bytes *out, int count) { int i, j; for (i = 0; i < (count / 4); ++i) { out[i].chunk1 = in[i * 4]; out[i].chunk2 = in[i * 4 + 1]; out[i].chunk3 = in[i * 4 + 2]; out[i].chunk4 = in[i * 4 + 3]; } // Finish up switch(struct % 4) { case 3: out[count / 4].chunk3 = in[(count / 4) * 4 + 2]; case 2: out[count / 4].chunk2 = in[(count / 4) * 4 + 1]; case 1: out[count / 4].chunk1 = in[(count / 4) * 4]; } }

Here you are, now you have an array of struct packed_bytes , which can be easily read using the above structure.

+1

Jason e Nov 03 '09 at 1:16

source share

Instead of using statemachine, you can simply use a counter to see how many bits are already in use in the current byte, from which you can directly infer shift offsets and whether you overflow the next byte. As for endianess: as long as you use only one data type (i.e. you do not reinterpret the pointer to types of different sizes (for example, int* a =...;short* b=(short*) a; ) you should not have problems with endianess in most cases

0

Grizzly Nov 03 '09 at 12:47

source share

Taking in the elements of DigitalRoss compact code, Grizzly's suggestions, and my own code, I finally wrote my own answer. Although DigitalRoss provides a useful working answer, my use of it without understanding would not provide the same satisfaction as learning something. For this reason, I decided to base my answer on my source code.

I also preferred to ignore the advice Jonathan Leffler gives to avoid using floating point arithmetic to calculate the length of the packed data. Both recommended methods - the same DigitalRoss also uses, increases the length of the packed data by as much as three bytes. Of course, this is not so much, but also can be avoided with the help of floating point mathematics.

Here is the code, criticism is welcome:

 /* built with gcc -std=c99 */ #include <stdio.h> #include <stdlib.h> #include <string.h> unsigned char * pack(const unsigned char * data, size_t len, size_t * packedlen) { float fpl = ((float)len / 4.0f) * 3.0f; *packedlen = (size_t)(fpl > (float)((int)fpl) ? fpl + 1 : fpl); unsigned char * packed = malloc(*packedlen); if (!packed) return 0; const unsigned char * in = data; const unsigned char * in_end = in + len; unsigned char * out; for (out = packed; in + 4 <= in_end; in += 4) { *out++ = in[0] | ((in[1] & 0x03) << 6); *out++ = ((in[1] & 0x3c) >> 2) | ((in[2] & 0x0f) << 4); *out++ = ((in[2] & 0x30) >> 4) | (in[3] << 2); } size_t lastlen = in_end - in; if (lastlen > 0) { *out = in[0]; if (lastlen > 1) { *out++ |= ((in[1] & 0x03) << 6); *out = ((in[1] & 0x3c) >> 2); if (lastlen > 2) { *out++ |= ((in[2] & 0x0f) << 4); *out = ((in[2] & 0x30) >> 4); if (lastlen > 3) *out |= (in[3] << 2); } } } return packed; } int main() { size_t i; unsigned char data[] = { 12, 15, 40, 18, 26, 32, 50, 3, 7, 19, 46, 10, 25, 37, 2, 39, 60, 59, 0, 17, 9, 29, 13, 54, 5, 6, 47, 32 }; size_t datalen = sizeof(data); printf("unpacked datalen: %td\nunpacked data\n", datalen); for (i = 0; i < datalen; ++i) printf("%02d ", data[i]); printf("\n"); size_t packedlen; unsigned char * packed = pack(data, sizeof(data), &packedlen); if (!packed) { fprintf(stderr, "Packing failed!\n"); return EXIT_FAILURE; } printf("packedlen: %td\npacked data\n", packedlen); for (i = 0; i < packedlen; ++i) printf("0x%02x ", packed[i]); printf("\n"); free(packed); return EXIT_SUCCESS; }

0

James morris Nov 08 '09 at 1:09

source share

Digitaloss · Accepted Answer · 2009-11-03T01:26:24+0000

So this looks like `code-golf` , right?

 #include <stdlib.h> #include <string.h> static void pack2(unsigned char *r, unsigned char *n) { unsigned v = n[0] + (n[1] << 6) + (n[2] << 12) + (n[3] << 18); *r++ = v; *r++ = v >> 8; *r++ = v >> 16; } unsigned char *apack(const unsigned char *s, int len) { unsigned char *s_end = s + len, *r, *result = malloc(len/4*3+3), lastones[4] = { 0 }; if (result == NULL) return NULL; for(r = result; s + 4 <= s_end; s += 4, r += 3) pack2(r, s); memcpy(lastones, s, s_end - s); pack2(r, lastones); return result; }

What is a better method for packing 4 bytes into 3 than this?

So this looks like code-golf , right?

Base-64 Decoder

Base-64 Encoder

More articles:

So this looks like `code-golf` , right?