fix multithreaded shader reloading and add work queue - vtgl - terminal emulator implemented in OpenGL

Commit: 7799e7695bfe6f863c15a872c4549f7c73741d94
Parent: 26e708a0acd69e6e7c9ecfe1ed0a0feb27251105
Author: Randy Palamar
Date:   Sun,  8 Dec 2024 16:25:13 -0700

fix multithreaded shader reloading and add work queue

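The main thread does not hold the GL context, so it cannot reload
shaders itself. To solve this we add a work queue that is drained by
the render thread: the shader file watch callback now only queues a
reload request, and the render thread performs the actual reload.
The queue can also be used for other kinds of work.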

Diffstat:
M debug.h  | 10 +++++-----
M platform_linux_common.c  | 2 --
M util.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
M util.h  | 25 ++++++++++++++++---------
M vtgl.c  | 140 ++++++++++++++++++++++++++++++++++++++++++-------------------------------------
M vtgl.h  | 5 +++++

6 files changed, 151 insertions(+), 81 deletions(-)
diff --git a/debug.h b/debug.h
@@ -105,11 +105,11 @@ typedef struct {
 static DebugTable g_debug_table;
 
 #define RECORD_DEBUG_EVENT_COMMON(counter, event_type) \
-	u64 event_index = __atomic_fetch_add(&g_debug_table.event_array_event_index, 1, __ATOMIC_RELAXED); \
-	ASSERT((event_index & 0xFFFFFFFF) < MAX_DEBUG_EVENT_COUNT);                                  \
-	DebugEvent *event = g_debug_table.events[event_index >> 32] + (event_index & 0xFFFFFFFF);    \
-	event->clock          = __rdtsc();                                                           \
-	event->metadata_index = counter;                                                             \
+	u64 event_index = atomic_fetch_add(&g_debug_table.event_array_event_index, 1);            \
+	ASSERT((event_index & 0xFFFFFFFF) < MAX_DEBUG_EVENT_COUNT);                               \
+	DebugEvent *event = g_debug_table.events[event_index >> 32] + (event_index & 0xFFFFFFFF); \
+	event->clock          = __rdtsc();                                                        \
+	event->metadata_index = counter;                                                          \
 	event->type           = event_type
 
 #define RECORD_DEBUG_EVENT(counter, event_type)         \
diff --git a/platform_linux_common.c b/platform_linux_common.c
@@ -55,8 +55,6 @@
 #define OS_MAP_READ    PROT_READ
 #define OS_MAP_PRIVATE MAP_PRIVATE
 
-#define atomic_exchange_n(ptr, val) __atomic_exchange_n(ptr, val, __ATOMIC_SEQ_CST)
-
 struct __attribute__((aligned(16))) stack_base {
 	void (*entry)(struct stack_base *stack);
 	Arena                 thread_arena;
diff --git a/util.c b/util.c
@@ -90,6 +90,56 @@ normalize_range(Range r)
 	return result;
 }
 
+/* NOTE(rnp): based on nullprogram's lock-free, concurrent,
+ * generic queue in 32 bits */
+static i32
+work_queue_push(u32 *q, u32 capacity)
+{
+	ASSERT(ISPOWEROFTWO(capacity));
+	u32 r    = atomic_load(q);
+	i32 mask = capacity - 1;
+	i32 head =  r         & mask;
+	i32 tail = (r >> 16)  & mask;
+	i32 next = (head + 1) & mask;
+	/* NOTE(rnp): prevent an overflow into the tail on commit */
+	if (r & 0x8000) atomic_and(q, ~0x8000u);
+	return next == tail ? -1 : head;
+}
+
+static void
+work_queue_push_commit(u32 *q)
+{
+	atomic_fetch_add(q, 1);
+}
+
+static i32
+work_queue_pop(u32 *q, u32 capacity)
+{
+	ASSERT(ISPOWEROFTWO(capacity));
+	u32 r    = atomic_load(q);
+	i32 mask = capacity - 1;
+	i32 head =  r        & mask;
+	i32 tail = (r >> 16) & mask;
+	return head == tail ? -1 : tail;
+}
+
+static void
+work_queue_pop_commit(u32 *q)
+{
+	atomic_fetch_add(q, 0x10000u);
+}
+
+static b32
+work_queue_empty(u32 *q, u32 capacity)
+{
+	ASSERT(ISPOWEROFTWO(capacity));
+	u32 r    = atomic_load(q);
+	i32 mask = capacity - 1;
+	i32 head =  r        & mask;
+	i32 tail = (r >> 16) & mask;
+	return head == tail;
+}
+
 static void
 mem_copy(void *src, void *dest, size len)
 {
diff --git a/util.h b/util.h
@@ -59,7 +59,6 @@ typedef struct {
 } while(0)
 
 typedef struct Variable {
-	enum variable_type type;
 	union {
 		b32 b32;
 		u32 u32;
@@ -73,6 +72,7 @@ typedef struct Variable {
 		VariableLink      group;
 		SLLVariableVector vector;
 	};
+	enum variable_type type;
 } Variable;
 
 enum cell_attribute {
@@ -148,14 +148,6 @@ typedef struct {
 	RingBuf     log;
 	LineBuf     lines;
 	Framebuffer fb;
-	/* NOTE: the position of the cursor the last time a new line was blitted
-	 * and the index of the line. This is needed because we blit whole lines
-	 * at a time unlike traditional terminal emulators which just operate as
-	 * a state machine. Any time a line hasn't played to completion we must
-	 * restart it from the original location lest it unintentionally cause a
-	 * screen scroll. */
-	iv2         last_cursor_pos;
-	size        last_line_idx;
 } TermView;
 
 enum terminal_mode {
@@ -433,6 +425,17 @@ typedef struct RenderCtx {
 	Arena             a;
 } RenderCtx;
 
+enum work_queue_entry_type {
+	WQ_FILL_RENDERBUFFER,
+	WQ_SHADER_RELOAD,
+	WQ_WINDOW_RESIZE,
+};
+
+typedef struct {
+	void *ctx;
+	enum work_queue_entry_type type;
+} work_queue_entry;
+
 typedef struct Term {
 	GLCtx     gl;
 	FontAtlas fa;
@@ -440,6 +443,10 @@ typedef struct Term {
 	Arena     arena_for_frame;
 	TempArena temp_arena;
 
+	work_queue_entry *work_queue_items;
+	u32               work_queue_capacity;
+	u32               work_queue;
+
 	InteractionState interaction;
 
 	Selection selection;
diff --git a/vtgl.c b/vtgl.c
@@ -34,11 +34,21 @@
 "    gl_Position = u_Pmat * vec4(vertex_position, 0.0, 1.0);\n"  \
 "}\n"
 
-typedef struct {
-	TerminalMemory *memory;
-	s8  info;
-	u32 stage;
-} shader_reload_ctx;
+static void
+set_projection_matrix(GLCtx *gl, u32 stage)
+{
+	f32 w = gl->window_size.w;
+	f32 h = gl->window_size.h;
+
+	f32 pmat[4 * 4] = {
+		2.0/w,   0.0,      0.0,    -1.0,
+		0.0,     2.0/h,    0.0,    -1.0,
+		0.0,     0.0,     -1.0,     0.0,
+		0.0,     0.0,      0.0,     1.0,
+	};
+
+	glProgramUniformMatrix4fv(gl->programs[stage], SHADER_PMAT_LOC, 1, GL_TRUE, pmat);
+}
 
 static u32
 compile_shader(Arena a, u32 type, s8 shader)
@@ -93,6 +103,13 @@ program_from_shader_text(s8 vertex, s8 fragment, Arena a)
 	return pid;
 }
 
+typedef struct {
+	Term *t;
+	u8   *path;
+	s8    info;
+	u32   stage;
+} queue_shader_reload_ctx;
+
 static void
 update_uniforms(GLCtx *gl, enum shader_stages stage)
 {
@@ -109,29 +126,20 @@ update_uniforms(GLCtx *gl, enum shader_stages stage)
 	}
 }
 
-static PLATFORM_FILE_WATCH_CALLBACK_FN(reload_shader)
+static void
+reload_shader(GLCtx *gl, PlatformAPI *platform, u8 *path, u32 stage, s8 info, Arena a)
 {
-	shader_reload_ctx *ctx = user_ctx;
-	PlatformAPI *platform  = &ctx->memory->platform_api;
-	Term *t     = ctx->memory->memory;
-	Arena a     = t->arena_for_frame;
-	Stream *err = &t->error_stream;
-
 	s8 fs_text = platform->read_file(path, &a);
 	if (fs_text.len) {
 		u32 program = program_from_shader_text(s8(VERTEX_SHADER_TEXT), fs_text, a);
 		if (program) {
-			glDeleteProgram(t->gl.programs[ctx->stage]);
-			t->gl.programs[ctx->stage] = program;
-			update_uniforms(&t->gl, ctx->stage);
-			stream_push_s8(err, ctx->info);
+			glDeleteProgram(gl->programs[stage]);
+			gl->programs[stage] = program;
+			update_uniforms(gl, stage);
+			set_projection_matrix(gl, stage);
 		}
 	}
-
-	if (err->widx) {
-		os_write_err_msg(stream_to_s8(err));
-		err->widx = 0;
-	}
+	if (info.len) os_write_err_msg(info);
 }
 
 static s8 fs_name[SHADER_COUNT] = {
@@ -141,32 +149,35 @@ static s8 fs_name[SHADER_COUNT] = {
 };
 
 static void
-reload_all_shaders(TerminalMemory *memory)
+reload_all_shaders(GLCtx *gl, PlatformAPI *platform, Arena a)
 {
-	PlatformAPI *platform = &memory->platform_api;
-	Term *t = memory->memory;
-
-	TempArena temp_memory = begin_temp_arena(&t->arena_for_frame);
-
-	Stream fs_path = stream_alloc(&t->arena_for_frame, KB(4));
+	Stream fs_path = stream_alloc(&a, KB(4));
 	stream_push_s8(&fs_path, g_shader_path_prefix);
 	if (fs_path.widx && fs_path.buf[fs_path.widx - 1] != platform->path_separator)
 		stream_push_byte(&fs_path, platform->path_separator);
 
-	shader_reload_ctx ctx = {0};
-	ctx.memory = memory;
-
 	i32 sidx = fs_path.widx;
 	for (u32 i = 0; i < SHADER_COUNT; i++) {
 		stream_push_s8(&fs_path, fs_name[i]);
 		stream_push_byte(&fs_path, 0);
-		ctx.stage = i;
-		reload_shader(fs_path.buf, &ctx);
+		reload_shader(gl, platform, fs_path.buf, i, (s8){0}, a);
 		fs_path.widx = sidx;
 	}
 
 	os_write_err_msg(s8("Reloaded Shaders\n"));
-	end_temp_arena(temp_memory);
+}
+
+static PLATFORM_FILE_WATCH_CALLBACK_FN(queue_shader_reload)
+{
+	queue_shader_reload_ctx *ctx = user_ctx;
+	i32 index = work_queue_push(&ctx->t->work_queue, ctx->t->work_queue_capacity);
+	/* NOTE(rnp): if we ever fill this up we need to resize the queue */
+	ASSERT(index != -1);
+	work_queue_push_commit(&ctx->t->work_queue);
+
+	ctx->path = path;
+	ctx->t->work_queue_items[index].type = WQ_SHADER_RELOAD;
+	ctx->t->work_queue_items[index].ctx  = ctx;
 }
 
 static v4
@@ -192,24 +203,6 @@ pressed_last_frame(ButtonState *button)
 	return result;
 }
 
-static void
-set_projection_matrix(GLCtx *gl)
-{
-	f32 w = gl->window_size.w;
-	f32 h = gl->window_size.h;
-
-	f32 pmat[4 * 4] = {
-		2.0/w,   0.0,      0.0,    -1.0,
-		0.0,     2.0/h,    0.0,    -1.0,
-		0.0,     0.0,     -1.0,     0.0,
-		0.0,     0.0,      0.0,     1.0,
-	};
-
-	glProgramUniformMatrix4fv(gl->programs[SHADER_RENDER], SHADER_PMAT_LOC, 1, GL_TRUE, pmat);
-	glProgramUniformMatrix4fv(gl->programs[SHADER_RECTS],  SHADER_PMAT_LOC, 1, GL_TRUE, pmat);
-	glProgramUniformMatrix4fv(gl->programs[SHADER_POST],   SHADER_PMAT_LOC, 1, GL_TRUE, pmat);
-}
-
 static v2
 get_cell_size(FontAtlas *fa)
 {
@@ -307,7 +300,8 @@ resize(Term *t, PlatformAPI *platform, iv2 window_size)
 	sp->term_size_in_pixels = gl->window_size;
 	sp->term_size_in_cells  = t->size;
 
-	set_projection_matrix(gl);
+	for (u32 i = 0; i < SHADER_COUNT; i++)
+		set_projection_matrix(gl, i);
 
 	gl->flags &= ~RESIZE_RENDERER;
 }
@@ -1107,7 +1101,10 @@ DEBUG_EXPORT VTGL_INITIALIZE_FN(vtgl_initialize)
 	initialize_framebuffer(&t->views[0].fb, t->size);
 	initialize_framebuffer(&t->views[1].fb, t->size);
 
-	shader_reload_ctx *shader_ctxs = alloc(&a, shader_reload_ctx, SHADER_COUNT);
+	t->work_queue_items    = alloc(&a, typeof(*t->work_queue_items), 1 << 6);
+	t->work_queue_capacity = 1 << 6;
+
+	queue_shader_reload_ctx *reload_ctxs = alloc(&a, typeof(*reload_ctxs), SHADER_COUNT);
 
 	s8 shader_infos[SHADER_COUNT] = {
 		[SHADER_POST]   = s8("Post Processing Shader Reloaded!\n"),
@@ -1121,13 +1118,13 @@ DEBUG_EXPORT VTGL_INITIALIZE_FN(vtgl_initialize)
 		if (path.widx && path.buf[path.widx - 1] != memory->platform_api.path_separator)
 			stream_push_byte(&path, memory->platform_api.path_separator);
 
-		shader_reload_ctx *src = shader_ctxs + i;
-		src->info   = shader_infos[i];
-		src->stage  = i;
-		src->memory = memory;
+		queue_shader_reload_ctx *src = reload_ctxs + i;
+		src->info  = shader_infos[i];
+		src->stage = i;
+		src->t     = t;
 		stream_push_s8(&path, fs_name[i]);
 		stream_push_byte(&path, 0);
-		memory->platform_api.add_file_watch(path.buf, reload_shader, src);
+		memory->platform_api.add_file_watch(path.buf, queue_shader_reload, src);
 		a.beg = path.buf + path.widx;
 	}
 
@@ -1212,7 +1209,7 @@ DEBUG_EXPORT VTGL_INITIALIZE_FN(vtgl_initialize)
 
 	glActiveTexture(GL_TEXTURE0);
 
-	reload_all_shaders(memory);
+	reload_all_shaders(&t->gl, &memory->platform_api, a);
 
 	return requested_size;
 }
@@ -1221,9 +1218,7 @@ DEBUG_EXPORT VTGL_ACTIVE_SELECTION_FN(vtgl_active_selection)
 {
 	Term *t      = memory->memory;
 	Range result = t->selection.range;
-	if (out)
-		stream_push_selection(out, t->views[t->view_idx].fb.rows,
-		                      t->selection.range, t->size.w);
+	if (out) stream_push_selection(out, t->views[t->view_idx].fb.rows, result, t->size.w);
 	return result;
 }
 
@@ -1237,8 +1232,22 @@ DEBUG_EXPORT VTGL_RENDER_FRAME_FN(vtgl_render_frame)
 
 	TempArena temp_arena = begin_temp_arena(&arena);
 
+	i32 queue_item;
+	while ((queue_item = work_queue_pop(&t->work_queue, t->work_queue_capacity)) != -1) {
+		work_queue_pop_commit(&t->work_queue);
+		work_queue_entry *entry = t->work_queue_items + queue_item;
+		switch (entry->type) {
+		case WQ_SHADER_RELOAD: {
+			queue_shader_reload_ctx *ctx = entry->ctx;
+			reload_shader(&t->gl, &memory->platform_api, ctx->path, ctx->stage,
+			              ctx->info, arena);
+		} break;
+		default: INVALID_CODE_PATH;
+		}
+	}
+
 	if (input->executable_reloaded) {
-		reload_all_shaders(memory);
+		reload_all_shaders(&t->gl, &memory->platform_api, arena);
 	}
 
 	/* NOTE: default state which can be overwritten later in the frame */
@@ -1380,7 +1389,8 @@ DEBUG_EXPORT VTGL_FRAME_STEP_FN(vtgl_frame_step)
 
 	END_TIMED_BLOCK();
 
-	return t->gl.queued_render || input->window_refreshed || t->gl.flags & DRAW_DEBUG_OVERLAY;
+	return t->gl.queued_render || input->window_refreshed || t->gl.flags & DRAW_DEBUG_OVERLAY
+	       || !work_queue_empty(&t->work_queue, t->work_queue_capacity);
 }
 
 #ifdef _DEBUG
diff --git a/vtgl.h b/vtgl.h
@@ -27,6 +27,11 @@
 #define DEBUG_EXPORT static
 #endif
 
+#define atomic_and(ptr, n)          __atomic_and_fetch(ptr, n, __ATOMIC_RELEASE);
+#define atomic_fetch_add(ptr, n)    __atomic_fetch_add(ptr, n, __ATOMIC_RELEASE);
+#define atomic_load(ptr)            __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
+#define atomic_exchange_n(ptr, val) __atomic_exchange_n(ptr, val, __ATOMIC_SEQ_CST)
+
 #define PI       3.1415926535897932384f
 
 #define KB(a)    ((a) << 10ULL)

M debug.h  | 10 +++++-----
M platform_linux_common.c  | 2 --
M util.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
M util.h  | 25 ++++++++++++++++---------
M vtgl.c  | 140 ++++++++++++++++++++++++++++++++++++++++++-------------------------------------
M vtgl.h  | 5 +++++