Accelerate base64 encode using AVX2. Sort of an exercise.
The four LUTs for converting numeric values to ASCII chars are ugly. Besides, since both _mm256_shuffle_epi8
and _mm256_blendv_epi8
suffer from low throughput (1 CPI / 2 CPI on most Intel platforms, respectively), they can be a bottleneck.
Note that:
- It assumes [input, input + 30) is valid for reading.
- You need to handle smaller blocks (smaller than 24 bytes) using another (e.g., the “classical” one) algorithm.
// Encodes 24B binary into 32B base64-encodeded.
void Encode(const char* input, char* output) {
auto low = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(input));
auto high = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(input + 12));
// Now we have (`x` means garbage.):
//
// xxxxBA9876543210 xxxxBA9876543210
//
// Note that bytes from lower address is the least significant digit (this is
// how little endian works).
auto data = _mm256_setr_m128i(low, high);
// We need convert it into:
//
// x012 x345 x678 x9AB x012 x345 x678 x9AB
//
// Note the endian conversion here. We do need big endian here because of how
// base64 works: we need lower bits of byte in lower address to be adjacent to
// higher bits of higher byte.
const auto kIndicesForGrouping = _mm256_setr_epi8(
2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1, // -1 sets to zero.
2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
auto four_byte_aligned = _mm256_shuffle_epi8(data, kIndicesForGrouping);
// Now for each 4 byte part (with the least 24 bits containing meaningful
// value), we'd like to rearrange them so that the first meaningful 12 bits
// are in the higher 2-byte, and the second 12 bits in the rest 2-byte.
//
// We ignore the garbage in bits we're not interested in for now.
auto t1 = _mm256_slli_epi32(four_byte_aligned, 16 - 12);
auto two_byte_aligned =
_mm256_blend_epi16(four_byte_aligned, t1, 0b1010'1010);
// Unfortunately there's no `_mm256_blend_epi8` for us so that we can do the
// same trick to make a one-byte-aligned vector.
//
// So this time we do simple bit-tricks to make the vector we want by:
//
// - Moving every 16-bit left by 2 bits, saving the result to a temporary
// variable `t2`.
// - Masking off uninteresting bits from `t2` and `two_byte_aligned`.
// - And `or`-ing them together.
auto t2 = _mm256_slli_epi16(two_byte_aligned, 8 - 6);
const auto kMaskForTwoByteAligned = _mm256_set1_epi16(0b0000'0000'0011'1111);
const auto kMaskForTemporaryVector = _mm256_set1_epi16(0b0011'1111'0000'0000);
auto t3 = _mm256_and_si256(t2, kMaskForTemporaryVector);
auto t4 = _mm256_and_si256(two_byte_aligned, kMaskForTwoByteAligned);
auto one_byte_aligned = _mm256_or_si256(t3, t4);
// Now get the endianness right (see comments on `kIndicesForGrouping` for
// more details).
const auto kToLittleEndian = _mm256_setr_epi8(
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, // ...
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
auto values = _mm256_shuffle_epi8(one_byte_aligned, kToLittleEndian);
// LUT for value in range [0, 16), [16, 32), [32, 48), [48, 64), respectively.
const auto kLookupTable0 =
_mm256_setr_epi8('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
'L', 'M', 'N', 'O', 'P', 'A', 'B', 'C', 'D', 'E', 'F',
'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P');
const auto kLookupTable1 =
_mm256_setr_epi8('Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a',
'b', 'c', 'd', 'e', 'f', 'Q', 'R', 'S', 'T', 'U', 'V',
'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f');
const auto kLookupTable2 =
_mm256_setr_epi8('g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
'r', 's', 't', 'u', 'v', 'g', 'h', 'i', 'j', 'k', 'l',
'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v');
const auto kLookupTable3 =
_mm256_setr_epi8('w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6',
'7', '8', '9', '+', '/', 'w', 'x', 'y', 'z', '0', '1',
'2', '3', '4', '5', '6', '7', '8', '9', '+', '/');
// Now tests each byte and use the right LUT to convert it.
const auto kLut01Threshold = _mm256_set1_epi8(16);
const auto kLut23Threshold = _mm256_set1_epi8(48);
const auto kLutThreshold = _mm256_set1_epi8(32);
auto comp1 = _mm256_cmpgt_epi8(kLut01Threshold, values);
auto t5 = _mm256_shuffle_epi8(kLookupTable0, values);
auto t6 = _mm256_shuffle_epi8(kLookupTable1, values);
auto t7 = _mm256_blendv_epi8(t6, t5, comp1);
auto comp2 = _mm256_cmpgt_epi8(kLut23Threshold, values);
auto t8 = _mm256_shuffle_epi8(kLookupTable2, values);
auto t9 = _mm256_shuffle_epi8(kLookupTable3, values);
auto t10 = _mm256_blendv_epi8(t9, t8, comp2);
auto comp = _mm256_cmpgt_epi8(kLutThreshold, values);
auto result = _mm256_blendv_epi8(t10, t7, comp);
_mm256_storeu_si256(reinterpret_cast<__m256i_u*>(output), result);
}
Sort of an exercise.
The four LUTs for converting numeric values to ASCII chars are ugly. Besides, since both _mm256_shuffle_epi8
and _mm256_blendv_epi8
suffer from low throughput (1 CPI / 2 CPI on most Intel platforms, respectively), they can be a bottleneck.
Note that:
- It assumes [input, input + 30) is valid for reading.
- You need to handle smaller blocks (smaller than 24 bytes) using another (e.g., the “classical” one) algorithm.
// Encodes 24B binary into 32B base64-encoded output.
//
// Preconditions:
// - Reads 16 bytes at `input` and 16 bytes at `input + 12`, i.e. the range
//   [input, input + 28) must be valid for reading.
// - Blocks smaller than 24 bytes must be handled by another (scalar) encoder.
void Encode(const char* input, char* output) {
// Two overlapping 16-byte loads cover the 24 payload bytes; the top 4 bytes
// of each load are garbage and get dropped by the grouping shuffle below.
auto low = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(input));
auto high = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(input + 12));
// Now we have (`x` means garbage):
//
// xxxxBA9876543210 xxxxBA9876543210
//
// Note that the byte at the lower address is the least significant digit
// (this is how little endian works).
auto data = _mm256_setr_m128i(low, high);
// We need to convert it into:
//
// x012 x345 x678 x9AB x012 x345 x678 x9AB
//
// Note the endian conversion here. We do need big endian here because of how
// base64 works: we need lower bits of the byte at the lower address to be
// adjacent to higher bits of the next byte.
const auto kIndicesForGrouping = _mm256_setr_epi8(
2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1, // -1 sets to zero.
2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
auto four_byte_aligned = _mm256_shuffle_epi8(data, kIndicesForGrouping);
// Now for each 4-byte part (with the least 24 bits containing meaningful
// value), we'd like to rearrange them so that the first meaningful 12 bits
// are in the higher 2-byte, and the second 12 bits in the rest 2-byte.
//
// We ignore the garbage in bits we're not interested in for now.
auto t1 = _mm256_slli_epi32(four_byte_aligned, 16 - 12);
// Immediate 0b1010'1010 takes the odd 16-bit elements from `t1` and the even
// ones from `four_byte_aligned`.
auto two_byte_aligned =
_mm256_blend_epi16(four_byte_aligned, t1, 0b1010'1010);
// Unfortunately there's no `_mm256_blend_epi8` for us so that we can do the
// same trick to make a one-byte-aligned vector.
//
// So this time we do simple bit-tricks to make the vector we want by:
//
// - Moving every 16-bit element left by 2 bits, saving the result to a
// temporary variable `t2`.
// - Masking off uninteresting bits from `t2` and `two_byte_aligned`.
// - And `or`-ing them together.
auto t2 = _mm256_slli_epi16(two_byte_aligned, 8 - 6);
const auto kMaskForTwoByteAligned = _mm256_set1_epi16(0b0000'0000'0011'1111);
const auto kMaskForTemporaryVector = _mm256_set1_epi16(0b0011'1111'0000'0000);
auto t3 = _mm256_and_si256(t2, kMaskForTemporaryVector);
auto t4 = _mm256_and_si256(two_byte_aligned, kMaskForTwoByteAligned);
// Each byte now holds one 6-bit value in [0, 64).
auto one_byte_aligned = _mm256_or_si256(t3, t4);
// Now get the endianness right (see comments on `kIndicesForGrouping` for
// more details).
const auto kToLittleEndian = _mm256_setr_epi8(
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, // ...
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
auto values = _mm256_shuffle_epi8(one_byte_aligned, kToLittleEndian);
// LUT for value in range [0, 16), [16, 32), [32, 48), [48, 64), respectively.
// Each 128-bit lane repeats the same 16 characters because
// `_mm256_shuffle_epi8` shuffles within lanes.
const auto kLookupTable0 =
_mm256_setr_epi8('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
'L', 'M', 'N', 'O', 'P', 'A', 'B', 'C', 'D', 'E', 'F',
'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P');
const auto kLookupTable1 =
_mm256_setr_epi8('Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a',
'b', 'c', 'd', 'e', 'f', 'Q', 'R', 'S', 'T', 'U', 'V',
'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f');
const auto kLookupTable2 =
_mm256_setr_epi8('g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
'r', 's', 't', 'u', 'v', 'g', 'h', 'i', 'j', 'k', 'l',
'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v');
const auto kLookupTable3 =
_mm256_setr_epi8('w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6',
'7', '8', '9', '+', '/', 'w', 'x', 'y', 'z', '0', '1',
'2', '3', '4', '5', '6', '7', '8', '9', '+', '/');
// Now test each byte and use the right LUT to convert it. Since every value
// is < 64 (bit 7 clear), `_mm256_shuffle_epi8` indexes each table with the
// low 4 bits of the value; the compares below pick which table's result to
// keep. `_mm256_cmpgt_epi8` yields 0xFF/0x00 per byte, and
// `_mm256_blendv_epi8` selects by that mask byte's sign bit.
const auto kLut01Threshold = _mm256_set1_epi8(16);
const auto kLut23Threshold = _mm256_set1_epi8(48);
const auto kLutThreshold = _mm256_set1_epi8(32);
auto comp1 = _mm256_cmpgt_epi8(kLut01Threshold, values);
auto t5 = _mm256_shuffle_epi8(kLookupTable0, values);
auto t6 = _mm256_shuffle_epi8(kLookupTable1, values);
// `t7` holds the correct character wherever the value is in [0, 32).
auto t7 = _mm256_blendv_epi8(t6, t5, comp1);
auto comp2 = _mm256_cmpgt_epi8(kLut23Threshold, values);
auto t8 = _mm256_shuffle_epi8(kLookupTable2, values);
auto t9 = _mm256_shuffle_epi8(kLookupTable3, values);
// `t10` holds the correct character wherever the value is in [32, 64).
auto t10 = _mm256_blendv_epi8(t9, t8, comp2);
auto comp = _mm256_cmpgt_epi8(kLutThreshold, values);
auto result = _mm256_blendv_epi8(t10, t7, comp);
_mm256_storeu_si256(reinterpret_cast<__m256i_u*>(output), result);
}