Accelerate base64 encode using AVX2

Sort of an exercise.
The four LUTs for converting numeric values to ASCII chars are ugly. Besides, since both _mm256_shuffle_epi8
and _mm256_blendv_epi8
suffer from low throughput (1 CPI / 2 CPI on most Intel platforms, respectively), the conversion step can be a bottleneck.
Note that:
- It assumes [input, input + 28) is valid for reading (the two 16-byte loads touch up to input + 27), even though only 24 bytes are actually encoded.
- You need to handle smaller blocks (smaller than 24 bytes) using some other algorithm (e.g., the “classical” scalar one).
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 // Encodes 24B binary into 32B base64-encodeded.void Encode(const char* input, char* output) { auto low = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(input)); auto high = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(input + 12)); // Now we have (`x` means garbage.): // // xxxxBA9876543210 xxxxBA9876543210 // // Note that bytes from lower address is the least significant digit (this is // how little endian works). auto data = _mm256_setr_m128i(low, high); // We need convert it into: // // x012 x345 x678 x9AB x012 x345 x678 x9AB // // Note the endian conversion here. We do need big endian here because of how // base64 works: we need lower bits of byte in lower address to be adjacent to // higher bits of higher byte. const auto kIndicesForGrouping = _mm256_setr_epi8( 2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1, // -1 sets to zero. 2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); auto four_byte_aligned = _mm256_shuffle_epi8(data, kIndicesForGrouping); // Now for each 4 byte part (with the least 24 bits containing meaningful // value), we'd like to rearrange them so that the first meaningful 12 bits // are in the higher 2-byte, and the second 12 bits in the rest 2-byte. // // We ignore the garbage in bits we're not interested in for now. auto t1 = _mm256_slli_epi32(four_byte_aligned, 16 - 12); auto two_byte_aligned = _mm256_blend_epi16(four_byte_aligned, t1, 0b1010'1010); // Unfortunately there's no `_mm256_blend_epi8` for us so that we can do the // same trick to make a one-byte-aligned vector. // // So this time we do simple bit-tricks to make the vector we want by: // // - Moving every 16-bit left by 2 bits, saving the result to a temporary // variable `t2`. // - Masking off uninteresting bits from `t2` and `two_byte_aligned`. // - And `or`-ing them together. 
auto t2 = _mm256_slli_epi16(two_byte_aligned, 8 - 6); const auto kMaskForTwoByteAligned = _mm256_set1_epi16(0b0000'0000'0011'1111); const auto kMaskForTemporaryVector = _mm256_set1_epi16(0b0011'1111'0000'0000); auto t3 = _mm256_and_si256(t2, kMaskForTemporaryVector); auto t4 = _mm256_and_si256(two_byte_aligned, kMaskForTwoByteAligned); auto one_byte_aligned = _mm256_or_si256(t3, t4); // Now get the endianness right (see comments on `kIndicesForGrouping` for // more details). const auto kToLittleEndian = _mm256_setr_epi8( 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, // ... 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); auto values = _mm256_shuffle_epi8(one_byte_aligned, kToLittleEndian); // LUT for value in range [0, 16), [16, 32), [32, 48), [48, 64), respectively. const auto kLookupTable0 = _mm256_setr_epi8('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P'); const auto kLookupTable1 = _mm256_setr_epi8('Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f'); const auto kLookupTable2 = _mm256_setr_epi8('g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v'); const auto kLookupTable3 = _mm256_setr_epi8('w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'); // Now tests each byte and use the right LUT to convert it. 
const auto kLut01Threshold = _mm256_set1_epi8(16); const auto kLut23Threshold = _mm256_set1_epi8(48); const auto kLutThreshold = _mm256_set1_epi8(32); auto comp1 = _mm256_cmpgt_epi8(kLut01Threshold, values); auto t5 = _mm256_shuffle_epi8(kLookupTable0, values); auto t6 = _mm256_shuffle_epi8(kLookupTable1, values); auto t7 = _mm256_blendv_epi8(t6, t5, comp1); auto comp2 = _mm256_cmpgt_epi8(kLut23Threshold, values); auto t8 = _mm256_shuffle_epi8(kLookupTable2, values); auto t9 = _mm256_shuffle_epi8(kLookupTable3, values); auto t10 = _mm256_blendv_epi8(t9, t8, comp2); auto comp = _mm256_cmpgt_epi8(kLutThreshold, values); auto result = _mm256_blendv_epi8(t10, t7, comp); _mm256_storeu_si256(reinterpret_cast<__m256i_u*>(output), result);}
Sort of an exercise.
The four LUTs for converting numeric values to ASCII chars are ugly. Besides, since both _mm256_shuffle_epi8
and _mm256_blendv_epi8
suffer from low throughput (1 CPI / 2 CPI on most Intel platforms, respectively), the conversion step can be a bottleneck.
Note that:
- It assumes [input, input + 28) is valid for reading (the two 16-byte loads touch up to input + 27), even though only 24 bytes are actually encoded.
- You need to handle smaller blocks (smaller than 24 bytes) using some other algorithm (e.g., the “classical” scalar one).
// Encodes 24B of binary data into 32B of base64-encoded output.
//
// Preconditions:
// - [input, input + 28) must be valid for reading: the two 16-byte loads
//   below touch up to input + 27, even though only 24 bytes are encoded.
// - [output, output + 32) must be valid for writing.
void Encode(const char* input, char* output) {
  // Two unaligned 16-byte loads; only the low 12 bytes of each are used.
  auto low = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(input));
  auto high = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(input + 12));

  // Now we have (`x` means garbage):
  //
  //   xxxxBA9876543210 xxxxBA9876543210
  //
  // Note that bytes at lower addresses are the least significant digits (this
  // is how little endian works).
  auto data = _mm256_setr_m128i(low, high);

  // We need to convert it into:
  //
  //   x012 x345 x678 x9AB x012 x345 x678 x9AB
  //
  // Note the endian conversion here. We do need big endian here because of how
  // base64 works: we need the low bits of the byte at the lower address to be
  // adjacent to the high bits of the next byte.
  const auto kIndicesForGrouping = _mm256_setr_epi8(
      2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1,  // -1 sets to zero.
      2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
  auto four_byte_aligned = _mm256_shuffle_epi8(data, kIndicesForGrouping);

  // Now for each 4-byte part (with the least significant 24 bits containing
  // the meaningful value), we'd like to rearrange it so that the first
  // meaningful 12 bits are in the high 2 bytes, and the second 12 bits in the
  // remaining 2 bytes.
  //
  // We ignore the garbage in the bits we're not interested in for now.
  auto t1 = _mm256_slli_epi32(four_byte_aligned, 16 - 12);
  // Blend mask 0b1010'1010 keeps the odd (high) 16-bit word of each 32-bit
  // group from the shifted copy, the even word from the original.
  auto two_byte_aligned = _mm256_blend_epi16(four_byte_aligned, t1, 0b1010'1010);

  // Unfortunately there's no `_mm256_blend_epi8` for us so that we can do the
  // same trick to make a one-byte-aligned vector.
  //
  // So this time we do simple bit-tricks to make the vector we want by:
  //
  // - Shifting every 16-bit word left by 2 bits, saving the result in a
  //   temporary variable `t2`.
  // - Masking off uninteresting bits from `t2` and `two_byte_aligned`.
  // - `Or`-ing them together.
  auto t2 = _mm256_slli_epi16(two_byte_aligned, 8 - 6);
  const auto kMaskForTwoByteAligned = _mm256_set1_epi16(0b0000'0000'0011'1111);
  const auto kMaskForTemporaryVector = _mm256_set1_epi16(0b0011'1111'0000'0000);
  auto t3 = _mm256_and_si256(t2, kMaskForTemporaryVector);
  auto t4 = _mm256_and_si256(two_byte_aligned, kMaskForTwoByteAligned);
  // Each byte now holds one 6-bit base64 value in [0, 64).
  auto one_byte_aligned = _mm256_or_si256(t3, t4);

  // Now get the endianness right (see comments on `kIndicesForGrouping` for
  // more details).
  const auto kToLittleEndian = _mm256_setr_epi8(
      3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,  // ...
      3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
  auto values = _mm256_shuffle_epi8(one_byte_aligned, kToLittleEndian);

  // LUTs for values in ranges [0, 16), [16, 32), [32, 48), [48, 64),
  // respectively. `_mm256_shuffle_epi8` indexes with only the low 4 bits of
  // each byte (within each 128-bit lane), hence four tables of 16 entries,
  // each duplicated across both lanes.
  const auto kLookupTable0 = _mm256_setr_epi8('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P');
  const auto kLookupTable1 = _mm256_setr_epi8('Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f');
  const auto kLookupTable2 = _mm256_setr_epi8('g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v');
  const auto kLookupTable3 = _mm256_setr_epi8('w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/');

  // Now test each byte and use the right LUT to convert it.
  const auto kLut01Threshold = _mm256_set1_epi8(16);
  const auto kLut23Threshold = _mm256_set1_epi8(48);
  const auto kLutThreshold = _mm256_set1_epi8(32);
  // comp1 is all-ones per byte iff values < 16, i.e. LUT0 applies.
  auto comp1 = _mm256_cmpgt_epi8(kLut01Threshold, values);
  auto t5 = _mm256_shuffle_epi8(kLookupTable0, values);
  auto t6 = _mm256_shuffle_epi8(kLookupTable1, values);
  // t7: correct characters for values in [0, 32).
  auto t7 = _mm256_blendv_epi8(t6, t5, comp1);
  // comp2 is all-ones per byte iff values < 48, i.e. LUT2 applies.
  auto comp2 = _mm256_cmpgt_epi8(kLut23Threshold, values);
  auto t8 = _mm256_shuffle_epi8(kLookupTable2, values);
  auto t9 = _mm256_shuffle_epi8(kLookupTable3, values);
  // t10: correct characters for values in [32, 64).
  auto t10 = _mm256_blendv_epi8(t9, t8, comp2);
  // Final select between the two partial results on values < 32.
  auto comp = _mm256_cmpgt_epi8(kLutThreshold, values);
  auto result = _mm256_blendv_epi8(t10, t7, comp);
  _mm256_storeu_si256(reinterpret_cast<__m256i_u*>(output), result);
}