Templating inline asm in Rust

luzero

Luca Barbato

Posted on September 5, 2022

Templating inline asm in Rust

Kostya wrote about it and asked me to figure out whether what he is missing isn't already available one way or another.

Before we start

For those not used to Multimedia, Rust or Assembly

  • Multimedia is all about processing data and serving/rendering it to the user at the right moment.
  • That requires good control over latency and using as little CPU as possible.
  • This leads to using architecture-specific extensions such as x86_64 AVX2 or ARM NEON.
  • Higher-level languages such as C (or Rust) offer access to those extensions via intrinsics, but quite often they are cumbersome enough that writing assembly as-is ends up being more pleasant.
  • You may look at dav1d and rav1e for examples.

Rust and assembly

Rust had a fairly weak point in supporting assembly:

  • rustc does not compile .s or .S as gcc and clang do, and that makes you rely on cc-rs or nasm-rs
  • until recently rustc did not have stable support for inline assembly, and right now some useful parts of it are still nightly-only.

How far are we?

Kostya tried the current stable and he managed to write some assembly for his h264 decoder and get a 20% gain, but he had 3 issues:

  • He couldn't figure out how to manage sub-registers, and the compiler warning wasn't that helpful for him since it assumes you know the format! terminology and that asm! relates to it. This is being addressed.
  • asm! operands in stable do not include sym and const.
  • Kostya couldn't figure out how to deal with templating the assembly using macro_rules!() as he is used to do with gcc.

I asked on zulip since I couldn't think of better ways than munching tokens and usually this means I'm missing something much simpler and obvious.
Luckily Amanieu helped us and this blog post is more or less about keeping notes.

Use cases for inline asm templating

In multimedia software you often write tiny kernels that operate over blocks of pixels (4x4, 8x8, 16x16 and so on). Usually the same inner logic is shared across the block sizes and ideally you would like not to repeat yourself, even more so if the very same logic can be shared across the many, many extensions x86 has.

Kostya used this as example

/// Averages a 4-byte-wide block of `src` into `dst` in place.
///
/// For every row `r` in `0..bh`, each of the first 4 bytes of the `dst` row is replaced by the
/// `pavgb` rounding average `(dst + src + 1) >> 1` of itself and the matching `src` byte.
/// Row `r` starts at byte offset `r * dstride` in `dst` and `r * sstride` in `src`.
///
/// A `bh` of zero is a no-op.
///
/// # Panics
///
/// Panics if `bh` is odd (the loop consumes two rows per iteration) or if either slice is too
/// short to hold `bh` rows of 4 bytes at its stride.
fn avg_4(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) {
    const WIDTH: usize = 4;

    if bh == 0 {
        // Entering the loop would make `sub {h}, 2` wrap around instead of reaching zero.
        return;
    }
    assert!(bh % 2 == 0, "avg_4: block height must be even, got {}", bh);

    // One past the last byte the loop touches in a buffer with the given row stride,
    // or `None` if that offset does not fit in a `usize`.
    let extent = |stride: usize| {
        (bh - 1)
            .checked_mul(stride)
            .and_then(|last_row| last_row.checked_add(WIDTH))
    };
    assert!(
        matches!(extent(sstride), Some(end) if end <= src.len()),
        "avg_4: src is too short for {} rows with stride {}",
        bh,
        sstride
    );
    assert!(
        matches!(extent(dstride), Some(end) if end <= dst.len()),
        "avg_4: dst is too short for {} rows with stride {}",
        bh,
        dstride
    );

    // SAFETY: the checks above guarantee that every 4-byte access for rows `0..bh` stays inside
    // `src` and `dst`, and that the even, non-zero row counter reaches exactly zero. Every
    // register the template writes is declared as an output or a clobber.
    unsafe {
        asm!(
            "2:",
            // Load two rows of source and destination pixels.
            "movd   xmm1, [{src}]",
            "movd   xmm3, [{src} + {sstride}]",
            "movd   xmm0, [{dst}]",
            "movd   xmm2, [{dst} + {dstride}]",
            "lea    {src}, [{src} + {sstride} * 2]",
            // Rounding byte-wise average, result kept in the destination registers.
            "pavgb  xmm0, xmm1",
            "pavgb  xmm2, xmm3",
            "movd   [{dst}], xmm0",
            "movd   [{dst} + {dstride}], xmm2",
            "lea    {dst}, [{dst} + {dstride} * 2]",
            // Two rows done; loop until the counter hits zero.
            "sub    {h}, 2",
            "jnz    2b",
            src = inout(reg) src.as_ptr() => _,
            sstride = in(reg) sstride,
            dst = inout(reg) dst.as_mut_ptr() => _,
            dstride = in(reg) dstride,
            h = inout(reg) bh => _,
            out("xmm0") _,
            out("xmm1") _,
            out("xmm2") _,
            out("xmm3") _,
        );
    }
}

/// Averages an 8-byte-wide block of `src` into `dst` in place.
///
/// For every row `r` in `0..bh`, each of the first 8 bytes of the `dst` row is replaced by the
/// `pavgb` rounding average `(dst + src + 1) >> 1` of itself and the matching `src` byte.
/// Row `r` starts at byte offset `r * dstride` in `dst` and `r * sstride` in `src`.
///
/// A `bh` of zero is a no-op.
///
/// # Panics
///
/// Panics if `bh` is odd (the loop consumes two rows per iteration) or if either slice is too
/// short to hold `bh` rows of 8 bytes at its stride.
fn avg_8(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) {
    const WIDTH: usize = 8;

    if bh == 0 {
        // Entering the loop would make `sub {h}, 2` wrap around instead of reaching zero.
        return;
    }
    assert!(bh % 2 == 0, "avg_8: block height must be even, got {}", bh);

    // One past the last byte the loop touches in a buffer with the given row stride,
    // or `None` if that offset does not fit in a `usize`.
    let extent = |stride: usize| {
        (bh - 1)
            .checked_mul(stride)
            .and_then(|last_row| last_row.checked_add(WIDTH))
    };
    assert!(
        matches!(extent(sstride), Some(end) if end <= src.len()),
        "avg_8: src is too short for {} rows with stride {}",
        bh,
        sstride
    );
    assert!(
        matches!(extent(dstride), Some(end) if end <= dst.len()),
        "avg_8: dst is too short for {} rows with stride {}",
        bh,
        dstride
    );

    // SAFETY: the checks above guarantee that every 8-byte access for rows `0..bh` stays inside
    // `src` and `dst`, and that the even, non-zero row counter reaches exactly zero. Every
    // register the template writes is declared as an output or a clobber.
    unsafe {
        asm!(
            "2:",
            // Load two rows of source (xmm0/xmm1) and destination (xmm2/xmm3) pixels.
            "movq   xmm0, [{src}]",
            "movq   xmm1, [{src} + {sstride}]",
            "movq   xmm2, [{dst}]",
            "movq   xmm3, [{dst} + {dstride}]",
            "lea    {src}, [{src} + {sstride} * 2]",
            // Rounding byte-wise average, result kept in the source registers.
            "pavgb  xmm0, xmm2",
            "pavgb  xmm1, xmm3",
            "movq   [{dst}], xmm0",
            "movq   [{dst} + {dstride}], xmm1",
            "lea    {dst}, [{dst} + {dstride} * 2]",
            // Two rows done; loop until the counter hits zero.
            "sub    {h}, 2",
            "jnz    2b",
            src = inout(reg) src.as_ptr() => _,
            sstride = in(reg) sstride,
            dst = inout(reg) dst.as_mut_ptr() => _,
            dstride = in(reg) dstride,
            h = inout(reg) bh => _,
            out("xmm0") _,
            out("xmm1") _,
            out("xmm2") _,
            out("xmm3") _,
        );
    }
}

/// Averages a 16-byte-wide block of `src` into `dst` in place.
///
/// For every row `r` in `0..bh`, each of the first 16 bytes of the `dst` row is replaced by the
/// `pavgb` rounding average `(dst + src + 1) >> 1` of itself and the matching `src` byte.
/// Row `r` starts at byte offset `r * dstride` in `dst` and `r * sstride` in `src`.
///
/// A `bh` of zero is a no-op.
///
/// # Panics
///
/// Panics if `bh` is odd (the loop consumes two rows per iteration), if either slice is too
/// short to hold `bh` rows of 16 bytes at its stride, or if a buffer start or stride is not a
/// multiple of 16 (`movaps` and memory-operand `pavgb` fault on unaligned addresses).
fn avg_16(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) {
    const WIDTH: usize = 16;
    const ALIGN: usize = 16;

    if bh == 0 {
        // Entering the loop would make `sub {h}, 2` wrap around instead of reaching zero.
        return;
    }
    assert!(bh % 2 == 0, "avg_16: block height must be even, got {}", bh);

    // One past the last byte the loop touches in a buffer with the given row stride,
    // or `None` if that offset does not fit in a `usize`.
    let extent = |stride: usize| {
        (bh - 1)
            .checked_mul(stride)
            .and_then(|last_row| last_row.checked_add(WIDTH))
    };
    assert!(
        matches!(extent(sstride), Some(end) if end <= src.len()),
        "avg_16: src is too short for {} rows with stride {}",
        bh,
        sstride
    );
    assert!(
        matches!(extent(dstride), Some(end) if end <= dst.len()),
        "avg_16: dst is too short for {} rows with stride {}",
        bh,
        dstride
    );
    assert!(
        (src.as_ptr() as usize) % ALIGN == 0 && sstride % ALIGN == 0,
        "avg_16: src rows must be 16-byte aligned"
    );
    assert!(
        (dst.as_ptr() as usize) % ALIGN == 0 && dstride % ALIGN == 0,
        "avg_16: dst rows must be 16-byte aligned"
    );

    // SAFETY: the checks above guarantee that every 16-byte access for rows `0..bh` stays
    // inside `src` and `dst` and is 16-byte aligned, and that the even, non-zero row counter
    // reaches exactly zero. Every register the template writes is declared as an output or a
    // clobber.
    unsafe {
        asm!(
            "2:",
            "movaps xmm0, [{src}]",
            "movaps xmm1, [{src} + {sstride}]",
            // Average straight against the destination rows in memory.
            "pavgb  xmm0, [{dst}]",
            "pavgb  xmm1, [{dst} + {dstride}]",
            "lea    {src}, [{src} + {sstride} * 2]",
            // Store the full 16 bytes; `movq` would only write back the low 8.
            "movaps [{dst}], xmm0",
            "movaps [{dst} + {dstride}], xmm1",
            "lea    {dst}, [{dst} + {dstride} * 2]",
            // Two rows done; loop until the counter hits zero.
            "sub    {h}, 2",
            "jnz    2b",
            src = inout(reg) src.as_ptr() => _,
            sstride = in(reg) sstride,
            dst = inout(reg) dst.as_mut_ptr() => _,
            dstride = in(reg) dstride,
            h = inout(reg) bh => _,
            out("xmm0") _,
            out("xmm1") _,
        );
    }
}
Enter fullscreen mode Exit fullscreen mode

Between avg_4 and avg_8 there is just a movd vs movq, and for this Amanieu suggested using the concat! pattern he uses in corosensei.

avg_4 and avg_8 would end up being

/// Generates an in-place rounding-average kernel whose rows fit a single `movd`/`movq`.
///
/// `avg! { name, "movd" }` and `avg! { name, "movq" }` define
/// `fn name(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize)` for
/// 4-byte and 8-byte rows respectively. The three-argument form `avg! { name, mov, width }`
/// spells out the row width in bytes for any other move instruction; `width` must be the
/// number of bytes that instruction transfers, because it drives the bounds checks.
macro_rules! avg {
    ($name:ident, $mov:literal, $width:literal) => {
        /// Replaces each byte of the block in `dst` with the `pavgb` rounding average
        /// `(dst + src + 1) >> 1` of itself and the matching `src` byte.
        ///
        /// Row `r` starts at `r * dstride` in `dst` and `r * sstride` in `src`. A `bh` of
        /// zero is a no-op.
        ///
        /// # Panics
        ///
        /// Panics if `bh` is odd or if either slice is too short for `bh` rows at its stride.
        fn $name(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) {
            const WIDTH: usize = $width;

            if bh == 0 {
                // Entering the loop would make `sub {h}, 2` wrap instead of reaching zero.
                return;
            }
            assert!(
                bh % 2 == 0,
                "{}: block height must be even, got {}",
                stringify!($name),
                bh
            );

            // One past the last byte touched in a buffer with the given row stride,
            // or `None` if that offset does not fit in a `usize`.
            let extent = |stride: usize| {
                (bh - 1)
                    .checked_mul(stride)
                    .and_then(|last_row| last_row.checked_add(WIDTH))
            };
            assert!(
                matches!(extent(sstride), Some(end) if end <= src.len()),
                "{}: src is too short for {} rows with stride {}",
                stringify!($name),
                bh,
                sstride
            );
            assert!(
                matches!(extent(dstride), Some(end) if end <= dst.len()),
                "{}: dst is too short for {} rows with stride {}",
                stringify!($name),
                bh,
                dstride
            );

            // SAFETY: the checks above keep every `WIDTH`-byte access for rows `0..bh`
            // inside `src` and `dst`, and the even, non-zero row counter reaches exactly
            // zero. Every register the template writes is an output or a clobber.
            unsafe {
                asm!(
                    "2:",
                    concat!($mov, " xmm1, [{src}]"),
                    concat!($mov, " xmm3, [{src} + {sstride}]"),
                    concat!($mov, " xmm0, [{dst}]"),
                    concat!($mov, " xmm2, [{dst} + {dstride}]"),
                    "lea    {src}, [{src} + {sstride} * 2]",
                    "pavgb  xmm0, xmm1",
                    "pavgb  xmm2, xmm3",
                    concat!($mov, " [{dst}], xmm0"),
                    concat!($mov, " [{dst} + {dstride}], xmm2"),
                    "lea    {dst}, [{dst} + {dstride} * 2]",
                    "sub    {h}, 2",
                    "jnz    2b",
                    src = inout(reg) src.as_ptr() => _,
                    sstride = in(reg) sstride,
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    dstride = in(reg) dstride,
                    h = inout(reg) bh => _,
                    out("xmm0") _,
                    out("xmm1") _,
                    out("xmm2") _,
                    out("xmm3") _,
                );
            }
        }
    };
    // Shorthands: the row width is implied by the move instruction.
    ($name:ident, "movd") => {
        avg! { $name, "movd", 4 }
    };
    ($name:ident, "movq") => {
        avg! { $name, "movq", 8 }
    };
}

// The move mnemonic is the only thing that differs between the two kernels.
avg! { avg_4, "movd" }
avg! { avg_8, "movq" }
Enter fullscreen mode Exit fullscreen mode

To factorize away avg_8 and avg_16, you need a way to deal with the operands and while the asm statements are literals, the operands can be only expressed as tt in a macro_rules!.

The normal way to deal with this use case in C is to rely on preprocessor #if directives; with macro_rules! you have to be a bit more creative.

Amanieu gave me another example and I ended up crafting:

/// Emits a rounding-average kernel around a shared two-rows-per-iteration loop skeleton.
///
/// Invocation shape:
/// `avg_common! { name, width = W, align = A, { load lines } { store lines } clobbers }`
///
/// * `name` is the function to define, with signature
///   `fn name(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize)`.
/// * `width` is the number of bytes per row the instruction groups read and write; it drives
///   the bounds checks.
/// * `align` is the alignment in bytes that the groups require of every row address
///   (use `1` when they have no alignment requirement).
/// * The first group must load two rows AND leave their averages in the registers the second
///   group stores. The averaging is not part of the shared skeleton because variants differ
///   in where the second `pavgb` operand lives (register vs. memory).
/// * The trailing tokens are pasted verbatim as extra `asm!` operands, typically the
///   `out("xmmN") _` clobbers of the registers the groups use.
macro_rules! avg_common {
    (
        $name:ident, width = $width:literal, align = $align:literal,
        { $($load:literal),+ }
        { $($store:literal),+ }
        $($out:tt)*
    ) => {
        /// Replaces each byte of the block in `dst` with the `pavgb` rounding average
        /// `(dst + src + 1) >> 1` of itself and the matching `src` byte.
        ///
        /// Row `r` starts at `r * dstride` in `dst` and `r * sstride` in `src`. A `bh` of
        /// zero is a no-op.
        ///
        /// # Panics
        ///
        /// Panics if `bh` is odd, if either slice is too short for `bh` rows at its stride,
        /// or if a buffer start or stride violates the kernel's alignment requirement.
        fn $name(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) {
            const WIDTH: usize = $width;
            const ALIGN: usize = $align;

            if bh == 0 {
                // Entering the loop would make `sub {h}, 2` wrap instead of reaching zero.
                return;
            }
            assert!(
                bh % 2 == 0,
                "{}: block height must be even, got {}",
                stringify!($name),
                bh
            );

            // One past the last byte touched in a buffer with the given row stride,
            // or `None` if that offset does not fit in a `usize`.
            let extent = |stride: usize| {
                (bh - 1)
                    .checked_mul(stride)
                    .and_then(|last_row| last_row.checked_add(WIDTH))
            };
            assert!(
                matches!(extent(sstride), Some(end) if end <= src.len()),
                "{}: src is too short for {} rows with stride {}",
                stringify!($name),
                bh,
                sstride
            );
            assert!(
                matches!(extent(dstride), Some(end) if end <= dst.len()),
                "{}: dst is too short for {} rows with stride {}",
                stringify!($name),
                bh,
                dstride
            );
            assert!(
                (src.as_ptr() as usize) % ALIGN == 0 && sstride % ALIGN == 0,
                "{}: src rows must be {}-byte aligned",
                stringify!($name),
                ALIGN
            );
            assert!(
                (dst.as_ptr() as usize) % ALIGN == 0 && dstride % ALIGN == 0,
                "{}: dst rows must be {}-byte aligned",
                stringify!($name),
                ALIGN
            );

            // SAFETY: the checks above keep every `WIDTH`-byte access for rows `0..bh`
            // inside `src` and `dst` and suitably aligned, and the even, non-zero row
            // counter reaches exactly zero. The caller-supplied operand list must declare
            // every register the instruction groups write.
            unsafe {
                asm!(
                    "2:",
                    $($load,)+
                    "lea    {src}, [{src} + {sstride} * 2]",
                    $($store,)+
                    "lea    {dst}, [{dst} + {dstride} * 2]",
                    "sub    {h}, 2",
                    "jnz    2b",
                    src = inout(reg) src.as_ptr() => _,
                    sstride = in(reg) sstride,
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    dstride = in(reg) dstride,
                    h = inout(reg) bh => _,
                    $($out)*
                );
            }
        }
    };
}

/// Defines one of the known averaging kernels: `avg! { avg_8 }` or `avg! { avg_16 }`.
macro_rules! avg {
    (avg_8) => {
        avg_common! {
            avg_8, width = 8, align = 1,
            {
                "movq   xmm0, [{src}]",
                "movq   xmm1, [{src} + {sstride}]",
                "movq   xmm2, [{dst}]",
                "movq   xmm3, [{dst} + {dstride}]",
                "pavgb  xmm0, xmm2",
                "pavgb  xmm1, xmm3"
            }
            {
                "movq   [{dst}], xmm0",
                "movq   [{dst} + {dstride}], xmm1"
            }
            out("xmm0") _,
            out("xmm1") _,
            out("xmm2") _,
            out("xmm3") _,
        }
    };
    (avg_16) => {
        avg_common! {
            avg_16, width = 16, align = 16,
            {
                "movaps xmm0, [{src}]",
                "movaps xmm1, [{src} + {sstride}]",
                "pavgb  xmm0, [{dst}]",
                "pavgb  xmm1, [{dst} + {dstride}]"
            }
            {
                "movaps [{dst}], xmm0",
                "movaps [{dst} + {dstride}], xmm1"
            }
            out("xmm0") _,
            out("xmm1") _,
        }
    };
}
Enter fullscreen mode Exit fullscreen mode

This more or less solves Kostya's problem, even if supporting multiple blocks of operands would require extra care.

Coming next

I have been quite busy with the SIFIS-Home project, in particular writing a new implementation of WebOfThings in Rust. Soon we'll release the first version supporting wot-1.1, and I'll probably write a bit about it.

💖 💪 🙅 🚩
luzero
Luca Barbato

Posted on September 5, 2022

Join Our Newsletter. No Spam, Only the good stuff.

Sign up to receive the latest update from our blog.

Related