Cool Zig Patterns - Comptime String Interning

#comptime #patterns #snippets

Comptime is cool, but you might have noticed that it has some quirks. One of these quirks is that the memory behind every pointer that is still referenced will make it into the final binary.
Also, if you compute strings at comptime and some of them repeat, each copy receives its own unique memory address.

Consider this example where we collect all capitalized words from Lorem Ipsum:

const std = @import("std");

const input = @embedFile("input.txt");

/// Slice of every token from `input` whose first byte is an ASCII uppercase
/// letter, built while evaluating this initializer.
export const capitalized = blk: {
    // Raise the evaluation branch limit; the loop below iterates once per token.
    @setEvalBranchQuota(100_000);

    // Start with an empty slice of strings and grow it by concatenation.
    var result: []const []const u8 = &.{};

    // Tokens are separated by commas, periods, CR, LF and spaces.
    var iter = std.mem.tokenize(u8, input, ",.\r\n ");
    while (iter.next()) |word| {
        if (std.ascii.isUpper(word[0])) {
            // `word` is stored as-is, i.e. as a slice handed out by the
            // tokenizer over `input`.
            result = result ++ [1][]const u8{word};
        }
    }

    break :blk result;
};

The input.txt is several kilobytes in size, and as we reference it via the tokenizer, the whole file gets put into the final binary. This is not optimal, as we are only interested in the capitalized words; the rest is garbage.

I figured out a pretty easy way to intern strings such that each one appears exactly once in the final executable, no matter how many source memory regions it came from. This is done by converting the string into an array, then returning a reference to that array:

/// Returns an interned version of `str`.
/// The bytes of `str` are dereferenced into a `[str.len]u8` array value and
/// handed to `internStringBuffer` together with the length, so the helper
/// receives the string contents by value rather than a pointer to them.
fn internString(comptime str: []const u8) []const u8 {
    return internStringBuffer(str.len, str[0..str.len].*);
}

/// Copies `items` into a comptime array of `len` bytes and returns a slice
/// referencing that array.
/// Both parameters are comptime values, so the call is memoized on them:
/// calls with equal `len` and `items` yield the same memory reference.
fn internStringBuffer(comptime len: comptime_int, comptime items: [len]u8) []const u8 {
    // Dedicated storage for this particular (len, items) combination.
    comptime var storage: [len]u8 = items;
    return &storage;
}

This uses the fact that comptime calls are memoized: as we pass only concrete values to internStringBuffer, it is evaluated only once per distinct set of arguments, and subsequent calls with the same arguments always return the same memory reference.

So we can adjust our example like this:

const std = @import("std");

const input = @embedFile("input.txt");

/// Slice of every token from `input` whose first byte is an ASCII uppercase
/// letter, with each stored word passed through `internString` first.
export const capitalized = blk: {
    // Raise the evaluation branch limit; the loop below iterates once per token.
    @setEvalBranchQuota(100_000);

    // Start with an empty slice of strings and grow it by concatenation.
    var result: []const []const u8 = &.{};

    // Tokens are separated by commas, periods, CR, LF and spaces.
    var iter = std.mem.tokenize(u8, input, ",.\r\n ");
    while (iter.next()) |word| {
        if (std.ascii.isUpper(word[0])) {
            result = result ++ [1][]const u8{
                // Store the interned slice instead of the tokenizer's
                // slice over `input`.
                internString(word),
            };
        }
    }

    break :blk result;
};

/// Returns an interned version of `str`.
/// The bytes of `str` are dereferenced into a `[str.len]u8` array value and
/// handed to `internStringBuffer` together with the length, so the helper
/// receives the string contents by value rather than a pointer to them.
fn internString(comptime str: []const u8) []const u8 {
    return internStringBuffer(str.len, str[0..str.len].*);
}

/// Copies `items` into a comptime array of `len` bytes and returns a slice
/// referencing that array.
/// Both parameters are comptime values, so the call is memoized on them:
/// calls with equal `len` and `items` yield the same memory reference.
fn internStringBuffer(comptime len: comptime_int, comptime items: [len]u8) []const u8 {
    // Dedicated storage for this particular (len, items) combination.
    comptime var storage: [len]u8 = items;
    return &storage;
}

But did it work as expected? Let's check!

First, let's compile each example into a shared object, then dump the .rodata section of each (which is where our strings are put):

# Build each example as a dynamic library, optimized for size and without compiler-rt.
zig build-lib -dynamic -O ReleaseSmall -fno-compiler-rt bad.zig
zig build-lib -dynamic -O ReleaseSmall -fno-compiler-rt good.zig

# Dump the full contents (-s) of only the .rodata section (-j) of each library.
objdump libbad.so -s -j .rodata > bad.dump
objdump libgood.so -s -j .rodata > good.dump

If we check the sizes of our shared objects, libgood.so is roughly 4 kB smaller than libbad.so, so it seems like we've made it work.

Checking both dumps, we can verify that it worked:

libbad.so:     file format elf64-x86-64
Contents of section .rodata:
 0c90 4c6f7265 6d206970 73756d20 646f6c6f  Lorem ipsum dolo
 0ca0 72207369 7420616d 65742c20 636f6e73  r sit amet, cons
 0cb0 65637465 74756572 20616469 70697363  ectetuer adipisc
 0cc0 696e6720 656c6974 2e204165 6e65616e  ing elit. Aenean
 0cd0 20636f6d 6d6f646f 206c6967 756c6120   commodo ligula 
 0ce0 65676574 20646f6c 6f722e20 41656e65  eget dolor. Aene
 0cf0 616e206d 61737361 2e204375 6d20736f  an massa. Cum so
 0d00 63696973 206e6174 6f717565 2070656e  ciis natoque pen
 0d10 61746962 75732065 74206d61 676e6973  atibus et magnis
 0d20 20646973 20706172 74757269 656e7420   dis parturient 
 0d30 6d6f6e74 65732c20 6e617363 65747572  montes, nascetur
 0d40 20726964 6963756c 7573206d 75732e20   ridiculus mus. 
 0d50 446f6e65 63207175 616d2066 656c6973  Donec quam felis
 0d60 2c20756c 74726963 69657320 6e65632c  , ultricies nec,
 0d70 2070656c 6c656e74 65737175 65206575   pellentesque eu
 0d80 2c207072 65746975 6d207175 69732c20  , pretium quis, 
 0d90 73656d2e 204e756c 6c612063 6f6e7365  sem. Nulla conse
 0da0 71756174 206d6173 73612071 75697320  quat massa quis 
 0db0 656e696d 2e20446f 6e656320 70656465  enim. Donec pede
 0dc0 206a7573 746f2c20 6672696e 67696c6c   justo, fringill
 0dd0 61207665 6c2c2061 6c697175 6574206e  a vel, aliquet n

 <snip 280 lines>

 1f50 20536564 206d6167 6e6100              Sed magna.

Okay, so this looks the way we expected: the whole file is put into the final executable. Not good.

Let's check our improved version:

libgood.so:     file format elf64-x86-64
Contents of section .rodata:
 0c90 4c6f7265 6d41656e 65616e43 756d446f  LoremAeneanCumDo
 0ca0 6e65634e 756c6c61 496e4e75 6c6c616d  necNullaInNullam
 0cb0 496e7465 67657256 6976616d 7573416c  IntegerVivamusAl
 0cc0 69717561 6d506861 73656c6c 75735175  iquamPhasellusQu
 0cd0 69737175 65457469 616d4375 72616269  isqueEtiamCurabi
 0ce0 7475724e 616d5365 64467573 63655665  turNamSedFusceVe
 0cf0 73746962 756c756d 43757261 653b5065  stibulumCurae;Pe
 0d00 6c6c656e 74657371 75655375 7370656e  llentesqueSuspen
 0d10 64697373 65557450 726f696e 4d6f7262  disseUtProinMorb
 0d20 69447569 73437261 734e756e 634d6165  iDuisCrasNuncMae
 0d30 63656e61 73507261 6573656e 74        cenasPraesent

Yes. That's the whole .rodata section. Looks like everything got cleanly deduplicated and there's no trace of the full Lorem Ipsum. Nice!

Now go, and use this pattern to make your Zig programs smaller, faster and better! Hush hush!

Zig NEWS

Cool Zig Patterns - Comptime String Interning

Latest comments (0)

Read next

[neovim plugin]Improving the Experience of Writing Zig in Neovim by Combining Zig and Neovim

Running sklearn models in Zig

Zig Cross Compiling

Easy web requests in Zig with Client.fetch