128 bit integer on cuda?

For best performance, one would want to map the 128-bit type on top of a suitable CUDA vector type, such as uint4, and implement the functionality using PTX inline assembly. The addition would look something like this:

typedef uint4 my_uint128_t;
__device__ my_uint128_t add_uint128 (my_uint128_t addend, my_uint128_t augend)
{
    my_uint128_t res;
    asm ("add.cc.u32      %0, %4, %8;\n\t"
         "addc.cc.u32     %1, %5, %9;\n\t"
         "addc.cc.u32     %2, %6, %10;\n\t"
         "addc.u32        %3, %7, %11;\n\t"
         : "=r"(res.x), "=r"(res.y), "=r"(res.z), "=r"(res.w)
         : "r"(addend.x), "r"(addend.y), "r"(addend.z), "r"(addend.w),
           "r"(augend.x), "r"(augend.y), "r"(augend.z), "r"(augend.w));
    return res;
}

The multiplication can similarly be constructed using PTX inline assembly by breaking the 128-bit numbers into 32-bit chunks, computing the 64-bit partial products and adding them appropriately. Obviously this takes a bit of work. One might get reasonable performance at the C level by breaking the number into 64-bit chunks and using __umul64hi() in conjuction with regular 64-bit multiplication and some additions. This would result in the following:

__device__ my_uint128_t mul_uint128 (my_uint128_t multiplicand, 
                                     my_uint128_t multiplier)
{
    my_uint128_t res;
    unsigned long long ahi, alo, bhi, blo, phi, plo;
    alo = ((unsigned long long)multiplicand.y << 32) | multiplicand.x;
    ahi = ((unsigned long long)multiplicand.w << 32) | multiplicand.z;
    blo = ((unsigned long long)multiplier.y << 32) | multiplier.x;
    bhi = ((unsigned long long)multiplier.w << 32) | multiplier.z;
    plo = alo * blo;
    phi = __umul64hi (alo, blo) + alo * bhi + ahi * blo;
    res.x = (unsigned int)(plo & 0xffffffff);
    res.y = (unsigned int)(plo >> 32);
    res.z = (unsigned int)(phi & 0xffffffff);
    res.w = (unsigned int)(phi >> 32);
    return res;
}

Below is a version of the 128-bit multiplication that uses PTX inline assembly. It requires PTX 3.0, which shipped with CUDA 4.2, and the code requires a GPU with at least compute capability 2.0, i.e. a Fermi or Kepler class device. The code uses the minimal number of instructions, as sixteen 32-bit multiplies are needed to implement a 128-bit multiplication. By comparison, the variant above using CUDA intrinsics compiles to 23 instructions for an sm_20 target.

__device__ my_uint128_t mul_uint128 (my_uint128_t a, my_uint128_t b)
{
    my_uint128_t res;
    asm ("{\n\t"
         "mul.lo.u32      %0, %4, %8;    \n\t"
         "mul.hi.u32      %1, %4, %8;    \n\t"
         "mad.lo.cc.u32   %1, %4, %9, %1;\n\t"
         "madc.hi.u32     %2, %4, %9,  0;\n\t"
         "mad.lo.cc.u32   %1, %5, %8, %1;\n\t"
         "madc.hi.cc.u32  %2, %5, %8, %2;\n\t"
         "madc.hi.u32     %3, %4,%10,  0;\n\t"
         "mad.lo.cc.u32   %2, %4,%10, %2;\n\t"
         "madc.hi.u32     %3, %5, %9, %3;\n\t"
         "mad.lo.cc.u32   %2, %5, %9, %2;\n\t"
         "madc.hi.u32     %3, %6, %8, %3;\n\t"
         "mad.lo.cc.u32   %2, %6, %8, %2;\n\t"
         "madc.lo.u32     %3, %4,%11, %3;\n\t"
         "mad.lo.u32      %3, %5,%10, %3;\n\t"
         "mad.lo.u32      %3, %6, %9, %3;\n\t"
         "mad.lo.u32      %3, %7, %8, %3;\n\t"
         "}"
         : "=r"(res.x), "=r"(res.y), "=r"(res.z), "=r"(res.w)
         : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w),
           "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w));
    return res;
}

Leave a Comment