Accelerate base64 encode using AVX2. Sort of an exercise.
The four LUTs for converting numeric values to ASCII chars are ugly. Besides, since both _mm256_shuffle_epi8
and _mm256_blendv_epi8
suffer from low throughput (1 CPI / 2 CPI on most Intel platforms, respectively), they can be a bottleneck.
Note that:
- It assumes [input, input + 30) is valid for reading.
- You need to handle smaller blocks (smaller than 24 bytes) using another (e.g., the “classical” one) algorithm.
// Encodes 24B binary into 32B base64-encodeded.
void Encode(const char* input, char* output) {
auto low = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(input));
auto high = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(input + 12));
// Now we have (`x` means garbage.):
//
// xxxxBA9876543210 xxxxBA9876543210
//
// Note that bytes from lower address is the least significant digit (this is
// how little endian works).
auto data = _mm256_setr_m128i(low, high);
// We need convert it into:
//
// x012 x345 x678 x9AB x012 x345 x678 x9AB
//
// Note the endian conversion here. We do need big endian here because of how
// base64 works: we need lower bits of byte in lower address to be adjacent to
// higher bits of higher byte.
const auto kIndicesForGrouping = _mm256_setr_epi8(
2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1, // -1 sets to zero.
2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
auto four_byte_aligned = _mm256_shuffle_epi8(data, kIndicesForGrouping);
// Now for each 4 byte part (with the least 24 bits containing meaningful
// value), we'd like to rearrange them so that the first meaningful 12 bits
// are in the higher 2-byte, and the second 12 bits in the rest 2-byte.
//
// We ignore the garbage in bits we're not interested in for now.
auto t1 = _mm256_slli_epi32(four_byte_aligned, 16 - 12);
auto two_byte_aligned =
_mm256_blend_epi16(four_byte_aligned, t1, 0b1010'1010);
// Unfortunately there's no `_mm256_blend_epi8` for us so that we can do the
// same trick to make a one-byte-aligned vector.
//
// So this time we do simple bit-tricks to make the vector we want by:
//
// - Moving every 16-bit left by 2 bits, saving the result to a temporary
// variable `t2`.
// - Masking off uninteresting bits from `t2` and `two_byte_aligned`.
// - And `or`-ing them together.
auto t2 = _mm256_slli_epi16(two_byte_aligned, 8 - 6);
const auto kMaskForTwoByteAligned = _mm256_set1_epi16(0b0000'0000'0011'1111);
const auto kMaskForTemporaryVector = _mm256_set1_epi16(0b0011'1111'0000'0000);
auto t3 = _mm256_and_si256(t2, kMaskForTemporaryVector);
auto t4 = _mm256_and_si256(two_byte_aligned, kMaskForTwoByteAligned);
auto one_byte_aligned = _mm256_or_si256(t3, t4);
// Now get the endianness right (see comments on `kIndicesForGrouping` for
// more details).
const auto kToLittleEndian = _mm256_setr_epi8(
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, // ...
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
auto values = _mm256_shuffle_epi8(one_byte_aligned, kToLittleEndian);
// LUT for value in range [0, 16), [16, 32), [32, 48), [48, 64), respectively.
const auto kLookupTable0 =
_mm256_setr_epi8('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
'L', 'M', 'N', 'O', 'P', 'A', 'B', 'C', 'D', 'E', 'F',
'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P');
const auto kLookupTable1 =
_mm256_setr_epi8('Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a',
'b', 'c', 'd', 'e', 'f', 'Q', 'R', 'S', 'T', 'U', 'V',
'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f');
const auto kLookupTable2 =
_mm256_setr_epi8('g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
'r', 's', 't', 'u', 'v', 'g', 'h', 'i', 'j', 'k', 'l',
'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v');
const auto kLookupTable3 =
_mm256_setr_epi8('w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6',
'7', '8', '9', '+', '/', 'w', 'x', 'y', 'z', '0', '1',
'2', '3', '4', '5', '6', '7', '8', '9', '+', '/');
// Now tests each byte and use the right LUT to convert it.
const auto kLut01Threshold = _mm256_set1_epi8(16);
const auto kLut23Threshold = _mm256_set1_epi8(48);
const auto kLutThreshold = _mm256_set1_epi8(32);
auto comp1 = _mm256_cmpgt_epi8(kLut01Threshold, values);
auto t5 = _mm256_shuffle_epi8(kLookupTable0, values);
auto t6 = _mm256_shuffle_epi8(kLookupTable1, values);
auto t7 = _mm256_blendv_epi8(t6, t5, comp1);
auto comp2 = _mm256_cmpgt_epi8(kLut23Threshold, values);
auto t8 = _mm256_shuffle_epi8(kLookupTable2, values);
auto t9 = _mm256_shuffle_epi8(kLookupTable3, values);
auto t10 = _mm256_blendv_epi8(t9, t8, comp2);
auto comp = _mm256_cmpgt_epi8(kLutThreshold, values);
auto result = _mm256_blendv_epi8(t10, t7, comp);
_mm256_storeu_si256(reinterpret_cast<__m256i_u*>(output), result);
}
Sort of an exercise.
The four LUTs for converting numeric values to ASCII chars are ugly. Besides, since both _mm256_shuffle_epi8
and _mm256_blendv_epi8
suffer from low throughput (1 CPI / 2 CPI on most Intel platforms, respectively), they can be a bottleneck.
Note that:
- It assumes [input, input + 30) is valid for reading.
- You need to handle smaller blocks (smaller than 24 bytes) using another (e.g., the “classical” one) algorithm.
// Encodes 24B binary into 32B base64-encoded output.
//
// Preconditions:
// - Reads 16 bytes at `input` and 16 bytes at `input + 12`, i.e. the range
//   [input, input + 28) must be valid for reading.
// - Blocks smaller than 24 bytes must be handled by another (scalar) encoder.
void Encode(const char* input, char* output) {
// Two overlapping 16-byte loads cover the 24 payload bytes; the top 4 bytes
// of each load are garbage and get dropped by the grouping shuffle below.
auto low = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(input));
auto high = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(input + 12));
// Now we have (`x` means garbage):
//
// xxxxBA9876543210 xxxxBA9876543210
//
// Note that the byte at the lower address is the least significant digit
// (this is how little endian works).
auto data = _mm256_setr_m128i(low, high);
// We need to convert it into:
//
// x012 x345 x678 x9AB x012 x345 x678 x9AB
//
// Note the endian conversion here. We do need big endian here because of how
// base64 works: we need lower bits of the byte at the lower address to be
// adjacent to higher bits of the next byte.
const auto kIndicesForGrouping = _mm256_setr_epi8(
2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1, // -1 sets to zero.
2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
auto four_byte_aligned = _mm256_shuffle_epi8(data, kIndicesForGrouping);
// Now for each 4-byte part (with the least 24 bits containing meaningful
// value), we'd like to rearrange them so that the first meaningful 12 bits
// are in the higher 2-byte, and the second 12 bits in the rest 2-byte.
//
// We ignore the garbage in bits we're not interested in for now.
auto t1 = _mm256_slli_epi32(four_byte_aligned, 16 - 12);
// Immediate 0b1010'1010 takes the odd 16-bit elements from `t1` and the even
// ones from `four_byte_aligned`.
auto two_byte_aligned =
_mm256_blend_epi16(four_byte_aligned, t1, 0b1010'1010);
// Unfortunately there's no `_mm256_blend_epi8` for us so that we can do the
// same trick to make a one-byte-aligned vector.
//
// So this time we do simple bit-tricks to make the vector we want by:
//
// - Moving every 16-bit element left by 2 bits, saving the result to a
// temporary variable `t2`.
// - Masking off uninteresting bits from `t2` and `two_byte_aligned`.
// - And `or`-ing them together.
auto t2 = _mm256_slli_epi16(two_byte_aligned, 8 - 6);
const auto kMaskForTwoByteAligned = _mm256_set1_epi16(0b0000'0000'0011'1111);
const auto kMaskForTemporaryVector = _mm256_set1_epi16(0b0011'1111'0000'0000);
auto t3 = _mm256_and_si256(t2, kMaskForTemporaryVector);
auto t4 = _mm256_and_si256(two_byte_aligned, kMaskForTwoByteAligned);
// Each byte now holds one 6-bit value in [0, 64).
auto one_byte_aligned = _mm256_or_si256(t3, t4);
// Now get the endianness right (see comments on `kIndicesForGrouping` for
// more details).
const auto kToLittleEndian = _mm256_setr_epi8(
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, // ...
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
auto values = _mm256_shuffle_epi8(one_byte_aligned, kToLittleEndian);
// LUT for value in range [0, 16), [16, 32), [32, 48), [48, 64), respectively.
// Each 128-bit lane repeats the same 16 characters because
// `_mm256_shuffle_epi8` shuffles within lanes.
const auto kLookupTable0 =
_mm256_setr_epi8('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
'L', 'M', 'N', 'O', 'P', 'A', 'B', 'C', 'D', 'E', 'F',
'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P');
const auto kLookupTable1 =
_mm256_setr_epi8('Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a',
'b', 'c', 'd', 'e', 'f', 'Q', 'R', 'S', 'T', 'U', 'V',
'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f');
const auto kLookupTable2 =
_mm256_setr_epi8('g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
'r', 's', 't', 'u', 'v', 'g', 'h', 'i', 'j', 'k', 'l',
'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v');
const auto kLookupTable3 =
_mm256_setr_epi8('w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6',
'7', '8', '9', '+', '/', 'w', 'x', 'y', 'z', '0', '1',
'2', '3', '4', '5', '6', '7', '8', '9', '+', '/');
// Now test each byte and use the right LUT to convert it. Since every value
// is < 64 (bit 7 clear), `_mm256_shuffle_epi8` indexes each table with the
// low 4 bits of the value; the compares below pick which table's result to
// keep. `_mm256_cmpgt_epi8` yields 0xFF/0x00 per byte, and
// `_mm256_blendv_epi8` selects by that mask byte's sign bit.
const auto kLut01Threshold = _mm256_set1_epi8(16);
const auto kLut23Threshold = _mm256_set1_epi8(48);
const auto kLutThreshold = _mm256_set1_epi8(32);
auto comp1 = _mm256_cmpgt_epi8(kLut01Threshold, values);
auto t5 = _mm256_shuffle_epi8(kLookupTable0, values);
auto t6 = _mm256_shuffle_epi8(kLookupTable1, values);
// `t7` holds the correct character wherever the value is in [0, 32).
auto t7 = _mm256_blendv_epi8(t6, t5, comp1);
auto comp2 = _mm256_cmpgt_epi8(kLut23Threshold, values);
auto t8 = _mm256_shuffle_epi8(kLookupTable2, values);
auto t9 = _mm256_shuffle_epi8(kLookupTable3, values);
// `t10` holds the correct character wherever the value is in [32, 64).
auto t10 = _mm256_blendv_epi8(t9, t8, comp2);
auto comp = _mm256_cmpgt_epi8(kLutThreshold, values);
auto result = _mm256_blendv_epi8(t10, t7, comp);
_mm256_storeu_si256(reinterpret_cast<__m256i_u*>(output), result);
}