ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

decode.glsl (6835B)


      1 /* See LICENSE for license details. */
      2 
      3 /* NOTE(rnp): invoked with samples x channels x transmits
      4  * Each instance extracts a single time sample from a single channel for all transmits
      5  * and does a dot product with the appropriate row of the bound hadamard matrix
      6  * (unless decode_mode == DECODE_MODE_NONE). The result of this dot product is stored in the
      7  * output. In bulk this has the effect of computing a matrix multiply of the
      8  * sample-transmit plane with the bound hadamard matrix.
      9  */
     10 
     11 #if   DataKind == DataKind_Float32
     12 	#define INPUT_DATA_TYPE      float
     13 	#define SAMPLE_DATA_TYPE     float
     14 	#define SAMPLE_TYPE_CAST(x)  (x)
     15 #elif DataKind == DataKind_Float32Complex
     16 	#define INPUT_DATA_TYPE      vec2
     17 	#define SAMPLE_DATA_TYPE     vec2
     18 	#define SAMPLE_TYPE_CAST(x)  (x)
     19 #elif DataKind == DataKind_Int16Complex
     20 	#define INPUT_DATA_TYPE      int
     21 	#define SAMPLE_DATA_TYPE     vec2
     22 	#define SAMPLE_TYPE_CAST(x)  vec2(((x) << 16) >> 16, (x) >> 16)
     23 #elif DataKind == DataKind_Int16
     24 	#define INPUT_DATA_TYPE      int
     25 	#define RF_SAMPLES_PER_INDEX 2
     26 	#if DilateOutput
     27 		#define SAMPLE_DATA_TYPE    vec4
     28 		#define SAMPLE_TYPE_CAST(x) vec4(((x) << 16) >> 16, 0, (x) >> 16, 0)
     29 	#else
     30 		#define SAMPLE_DATA_TYPE    vec2
     31 		#define SAMPLE_TYPE_CAST(x) vec2(((x) << 16) >> 16, (x) >> 16)
     32 		#define OUTPUT_SAMPLES_PER_INDEX 2
     33 	#endif
     34 #else
     35 	#error unsupported data kind for Decode
     36 #endif
     37 
     38 #ifndef OUTPUT_SAMPLES_PER_INDEX
     39 	#define OUTPUT_SAMPLES_PER_INDEX 1
     40 #endif
     41 
     42 #ifndef RF_SAMPLES_PER_INDEX
     43 	#define RF_SAMPLES_PER_INDEX 1
     44 #endif
     45 
     46 layout(std430, binding = 1) readonly restrict buffer buffer_1 {
     47 	INPUT_DATA_TYPE rf_data[];
     48 };
     49 
     50 layout(std430, binding = 2) writeonly restrict buffer buffer_2 {
     51 	INPUT_DATA_TYPE out_rf_data[];
     52 };
     53 
     54 layout(std430, binding = 3) writeonly restrict buffer buffer_3 {
     55 	SAMPLE_DATA_TYPE out_data[];
     56 };
     57 
     58 layout(r32f, binding = 0) readonly restrict uniform image2D  hadamard;
     59 
     60 SAMPLE_DATA_TYPE sample_rf_data(uint index)
     61 {
     62 	SAMPLE_DATA_TYPE result = SAMPLE_TYPE_CAST(rf_data[index]);
     63 	return result;
     64 }
     65 
     66 #if UseSharedMemory
     67 shared INPUT_DATA_TYPE rf[gl_WorkGroupSize.x * TransmitCount];
     68 void run_decode_large(void)
     69 {
     70 	uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
     71 	uint channel     = gl_GlobalInvocationID.y;
     72 	uint transmit    = gl_GlobalInvocationID.z * ToProcess;
     73 
     74 	uint thread_count = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
     75 	uint thread_index = gl_LocalInvocationIndex;
     76 
     77 	uint samples_per_thread  = rf.length() / thread_count;
     78 	uint leftover_samples    = rf.length() % thread_count;
     79 	uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_samples);
     80 
     81 	uint rf_offset = (InputChannelStride * channel / RF_SAMPLES_PER_INDEX +
     82 	                  TransmitCount * gl_WorkGroupID.x * gl_WorkGroupSize.x);
     83 
     84 	for (uint i = 0; i < samples_this_thread; i++) {
     85 		uint index = i * thread_count + thread_index;
     86 		rf[index] = rf_data[rf_offset + index];
     87 	}
     88 
     89 	barrier();
     90 
     91 	SAMPLE_DATA_TYPE result[ToProcess];
     92 	if (time_sample < OutputTransmitStride) {
     93 		for (uint i = 0; i < ToProcess; i++)
     94 			result[i] = SAMPLE_DATA_TYPE(0);
     95 
     96 		for (int j = 0; j < TransmitCount; j++) {
     97 			SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[gl_LocalInvocationID.x * TransmitCount + j]);
     98 			for (uint i = 0; i < ToProcess; i++)
     99 				result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s;
    100 		}
    101 
    102 		for (uint i = 0; i < ToProcess; i++)
    103 			result[i] /= float(TransmitCount);
    104 	}
    105 
    106 	/* NOTE(rnp): DO NOT combine with above; compiler shits the bed on TransmitCount == 80
    107 	 * and it kills performance. reinvestigate when we further optimize */
    108 	if (time_sample < OutputTransmitStride) {
    109 		uint out_off = OutputChannelStride  * channel +
    110 		               OutputTransmitStride * transmit +
    111 		               OutputSampleStride   * time_sample;
    112 
    113 		for (uint i = 0; i < ToProcess; i++, out_off += OutputTransmitStride)
    114 			if (TransmitCount % (gl_WorkGroupSize.z * ToProcess) == 0 || transmit + i < TransmitCount)
    115 				out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i];
    116 	}
    117 }
    118 #endif
    119 
    120 void run_decode_small(void)
    121 {
    122 	uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
    123 	uint channel     = gl_GlobalInvocationID.y;
    124 	uint rf_offset   = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
    125 
    126 	if (time_sample < OutputTransmitStride) {
    127 		INPUT_DATA_TYPE rf[TransmitCount];
    128 		for (int j = 0; j < TransmitCount; j++)
    129 			rf[j] = rf_data[rf_offset + j];
    130 
    131 		SAMPLE_DATA_TYPE result[TransmitCount];
    132 		for (int j = 0; j < TransmitCount; j++)
    133 			result[j] = SAMPLE_DATA_TYPE(0);
    134 
    135 		for (int i = 0; i < TransmitCount; i++) {
    136 			SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[i]);
    137 			for (int j = 0; j < TransmitCount; j++) {
    138 				result[j] += imageLoad(hadamard, ivec2(i, j)).x * s;
    139 			}
    140 		}
    141 
    142 		for (int i = 0; i < TransmitCount; i++)
    143 			result[i] /= float(TransmitCount);
    144 
    145 		uint out_off = OutputChannelStride  * channel +
    146 		               OutputSampleStride   * time_sample;
    147 		for (int i = 0; i < TransmitCount; i++, out_off += OutputTransmitStride)
    148 			out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i];
    149 	}
    150 }
    151 
    152 void main()
    153 {
    154 	switch (DecodeMode) {
    155 	case DecodeMode_None:{
    156 		uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
    157 		uint channel     = gl_GlobalInvocationID.y;
    158 		uint transmit    = gl_GlobalInvocationID.z;
    159 
    160 		if (time_sample < OutputTransmitStride) {
    161 			uint in_off = (InputChannelStride  * channel +
    162 			               InputTransmitStride * transmit +
    163 			               InputSampleStride   * time_sample) / RF_SAMPLES_PER_INDEX;
    164 
    165 			uint out_off = (OutputChannelStride  * channel +
    166 			                OutputTransmitStride * transmit +
    167 			                OutputSampleStride   * time_sample) / OUTPUT_SAMPLES_PER_INDEX;
    168 
    169 			out_data[out_off] = sample_rf_data(in_off);
    170 		}
    171 	}break;
    172 	case DecodeMode_Hadamard:{
    173 		if (u_first_pass) {
    174 			uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
    175 			uint channel     = gl_GlobalInvocationID.y;
    176 			uint transmit    = gl_GlobalInvocationID.z * ToProcess;
    177 			if (time_sample < InputTransmitStride) {
    178 				uint out_off = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
    179 				uint in_off  = (InputChannelStride * channel + InputSampleStride  * time_sample);
    180 				#if UseSharedMemory
    181 					in_off  += InputTransmitStride * transmit;
    182 					out_off += transmit;
    183 					for (uint i = 0; i < ToProcess; i++, in_off += InputTransmitStride) {
    184 						if (transmit + i < TransmitCount)
    185 							out_rf_data[out_off + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
    186 					}
    187 				#else
    188 					for (uint i = 0; i < TransmitCount; i++, in_off += InputTransmitStride)
    189 						out_rf_data[out_off + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
    190 				#endif
    191 			}
    192 		} else {
    193 			#if UseSharedMemory
    194 				run_decode_large();
    195 			#else
    196 				run_decode_small();
    197 			#endif
    198 		}
    199 	}break;
    200 	}
    201 }