decode.glsl (6835B)
1 /* See LICENSE for license details. */ 2 3 /* NOTE(rnp): invoked with samples x channels x transmits 4 * Each instance extracts a single time sample from a single channel for all transmits 5 * and does a dot product with the appropriate row of the bound hadamard matrix 6 * (unless decode_mode == DECODE_MODE_NONE). The result of this dot product is stored in the 7 * output. In bulk this has the effect of computing a matrix multiply of the 8 * sample-transmit plane with the bound hadamard matrix. 9 */ 10 11 #if DataKind == DataKind_Float32 12 #define INPUT_DATA_TYPE float 13 #define SAMPLE_DATA_TYPE float 14 #define SAMPLE_TYPE_CAST(x) (x) 15 #elif DataKind == DataKind_Float32Complex 16 #define INPUT_DATA_TYPE vec2 17 #define SAMPLE_DATA_TYPE vec2 18 #define SAMPLE_TYPE_CAST(x) (x) 19 #elif DataKind == DataKind_Int16Complex 20 #define INPUT_DATA_TYPE int 21 #define SAMPLE_DATA_TYPE vec2 22 #define SAMPLE_TYPE_CAST(x) vec2(((x) << 16) >> 16, (x) >> 16) 23 #elif DataKind == DataKind_Int16 24 #define INPUT_DATA_TYPE int 25 #define RF_SAMPLES_PER_INDEX 2 26 #if DilateOutput 27 #define SAMPLE_DATA_TYPE vec4 28 #define SAMPLE_TYPE_CAST(x) vec4(((x) << 16) >> 16, 0, (x) >> 16, 0) 29 #else 30 #define SAMPLE_DATA_TYPE vec2 31 #define SAMPLE_TYPE_CAST(x) vec2(((x) << 16) >> 16, (x) >> 16) 32 #define OUTPUT_SAMPLES_PER_INDEX 2 33 #endif 34 #else 35 #error unsupported data kind for Decode 36 #endif 37 38 #ifndef OUTPUT_SAMPLES_PER_INDEX 39 #define OUTPUT_SAMPLES_PER_INDEX 1 40 #endif 41 42 #ifndef RF_SAMPLES_PER_INDEX 43 #define RF_SAMPLES_PER_INDEX 1 44 #endif 45 46 layout(std430, binding = 1) readonly restrict buffer buffer_1 { 47 INPUT_DATA_TYPE rf_data[]; 48 }; 49 50 layout(std430, binding = 2) writeonly restrict buffer buffer_2 { 51 INPUT_DATA_TYPE out_rf_data[]; 52 }; 53 54 layout(std430, binding = 3) writeonly restrict buffer buffer_3 { 55 SAMPLE_DATA_TYPE out_data[]; 56 }; 57 58 layout(r32f, binding = 0) readonly restrict uniform image2D hadamard; 59 60 SAMPLE_DATA_TYPE sample_rf_data(uint index) 61 { 62 SAMPLE_DATA_TYPE result = SAMPLE_TYPE_CAST(rf_data[index]); 63 return result; 64 } 65 66 #if UseSharedMemory 67 shared INPUT_DATA_TYPE rf[gl_WorkGroupSize.x * TransmitCount]; 68 void run_decode_large(void) 69 { 70 uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX; 71 uint channel = gl_GlobalInvocationID.y; 72 uint transmit = gl_GlobalInvocationID.z * ToProcess; 73 74 uint thread_count = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z; 75 uint thread_index = gl_LocalInvocationIndex; 76 77 uint samples_per_thread = rf.length() / thread_count; 78 uint leftover_samples = rf.length() % thread_count; 79 uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_samples); 80 81 uint rf_offset = (InputChannelStride * channel / RF_SAMPLES_PER_INDEX + 82 TransmitCount * gl_WorkGroupID.x * gl_WorkGroupSize.x); 83 84 for (uint i = 0; i < samples_this_thread; i++) { 85 uint index = i * thread_count + thread_index; 86 rf[index] = rf_data[rf_offset + index]; 87 } 88 89 barrier(); 90 91 SAMPLE_DATA_TYPE result[ToProcess]; 92 if (time_sample < OutputTransmitStride) { 93 for (uint i = 0; i < ToProcess; i++) 94 result[i] = SAMPLE_DATA_TYPE(0); 95 96 for (int j = 0; j < TransmitCount; j++) { 97 SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[gl_LocalInvocationID.x * TransmitCount + j]); 98 for (uint i = 0; i < ToProcess; i++) 99 result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s; 100 } 101 102 for (uint i = 0; i < ToProcess; i++) 103 result[i] /= float(TransmitCount); 104 } 105 106 /* NOTE(rnp): DO NOT combine with above; compiler shits the bed on TransmitCount == 80 107 * and it kills performance. reinvestigate when we further optimize */ 108 if (time_sample < OutputTransmitStride) { 109 uint out_off = OutputChannelStride * channel + 110 OutputTransmitStride * transmit + 111 OutputSampleStride * time_sample; 112 113 for (uint i = 0; i < ToProcess; i++, out_off += OutputTransmitStride) 114 if (TransmitCount % (gl_WorkGroupSize.z * ToProcess) == 0 || transmit + i < TransmitCount) 115 out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i]; 116 } 117 } 118 #endif 119 120 void run_decode_small(void) 121 { 122 uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX; 123 uint channel = gl_GlobalInvocationID.y; 124 uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX; 125 126 if (time_sample < OutputTransmitStride) { 127 INPUT_DATA_TYPE rf[TransmitCount]; 128 for (int j = 0; j < TransmitCount; j++) 129 rf[j] = rf_data[rf_offset + j]; 130 131 SAMPLE_DATA_TYPE result[TransmitCount]; 132 for (int j = 0; j < TransmitCount; j++) 133 result[j] = SAMPLE_DATA_TYPE(0); 134 135 for (int i = 0; i < TransmitCount; i++) { 136 SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[i]); 137 for (int j = 0; j < TransmitCount; j++) { 138 result[j] += imageLoad(hadamard, ivec2(i, j)).x * s; 139 } 140 } 141 142 for (int i = 0; i < TransmitCount; i++) 143 result[i] /= float(TransmitCount); 144 145 uint out_off = OutputChannelStride * channel + 146 OutputSampleStride * time_sample; 147 for (int i = 0; i < TransmitCount; i++, out_off += OutputTransmitStride) 148 out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i]; 149 } 150 } 151 152 void main() 153 { 154 switch (DecodeMode) { 155 case DecodeMode_None:{ 156 uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX; 157 uint channel = gl_GlobalInvocationID.y; 158 uint transmit = gl_GlobalInvocationID.z; 159 160 if (time_sample < OutputTransmitStride) { 161 uint in_off = (InputChannelStride * channel + 162 InputTransmitStride * transmit + 163 InputSampleStride * time_sample) / RF_SAMPLES_PER_INDEX; 164 165 uint out_off = (OutputChannelStride * channel + 166 OutputTransmitStride * transmit + 167 OutputSampleStride * time_sample) / OUTPUT_SAMPLES_PER_INDEX; 168 169 out_data[out_off] = sample_rf_data(in_off); 170 } 171 }break; 172 case DecodeMode_Hadamard:{ 173 if (u_first_pass) { 174 uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX; 175 uint channel = gl_GlobalInvocationID.y; 176 uint transmit = gl_GlobalInvocationID.z * ToProcess; 177 if (time_sample < InputTransmitStride) { 178 uint out_off = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX; 179 uint in_off = (InputChannelStride * channel + InputSampleStride * time_sample); 180 #if UseSharedMemory 181 in_off += InputTransmitStride * transmit; 182 out_off += transmit; 183 for (uint i = 0; i < ToProcess; i++, in_off += InputTransmitStride) { 184 if (transmit + i < TransmitCount) 185 out_rf_data[out_off + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; 186 } 187 #else 188 for (uint i = 0; i < TransmitCount; i++, in_off += InputTransmitStride) 189 out_rf_data[out_off + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; 190 #endif 191 } 192 } else { 193 #if UseSharedMemory 194 run_decode_large(); 195 #else 196 run_decode_small(); 197 #endif 198 } 199 }break; 200 } 201 }