Refactor code structure and improve performance by optimizing OpenCL kernel and adding timing macros; update Cargo.toml for release profile settings; enhance main.rs and sinks.rs for better readability and organization.

2026-04-28 23:00:26 +02:00 · 2025-04-11 16:04:10 +02:00
parent c554b1f164
commit b04c01e737
11 changed files with 395 additions and 184 deletions
--- a/schemsearch-ocl-matcher/src/kernel.cl
+++ b/schemsearch-ocl-matcher/src/kernel.cl
@@ -1,51 +1,35 @@
-__kernel void add(__global int* result,
-                 __global uint* schem,
-                 __global uint* pattern,
-                 const int p_width,
-                 const int p_height,
-                 const int p_depth,
-                 const uint air_id,
-                 const int ignore_air,
-                 const int air_as_any,
-                 const int skipamount) {
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-    int z = get_global_id(2);
-    
-    int width = get_global_size(0);
-    int height = get_global_size(1);
-    int depth = get_global_size(2);
-    
-    if (x > width - p_width || y > height - p_height || z > depth - p_depth) {
-        return;
-    }
-    
-    int wrong_blocks = 0;
-    for (int py = 0; py < p_height; py++) {
-        for (int pz = 0; pz < p_depth; pz++) {
-            for (int px = 0; px < p_width; px++) {
-                int s_idx = (x + px) + width * ((z + pz) + (y + py) * depth);
-                int p_idx = px + p_width * (pz + py * p_depth);
-                
-                uint schem_block = schem[s_idx];
-                uint pattern_block = pattern[p_idx];
+// Use 3d_img
+// Fewer buffer allocations
+// Parallelize the pattern with local workers?
+// To Match on GPU
+// Fewer workers, more parameters!
+// Pattern as a kernel constant

-                if ((ignore_air && schem_block != air_id) || (air_as_any && pattern_block != air_id)) {
-                    continue;
-                }
+__kernel void add(__global int *result, __global uint *schem,
+                  __constant uint *pattern, const int width, const int height,
+                  const int depth, const int p_width, const int p_height,
+                  const int p_depth, const uint air_id, const int ignore_air,
+                  const int air_as_any, const int skipamount) {
+  int x = get_global_id(0);
+  int y = get_global_id(2);
+  int z = get_global_id(1);

-                if (schem_block != pattern_block) {
-                    wrong_blocks++;
-                    if (wrong_blocks > skipamount) {
-                        int idx = x + z * width + y * width * depth;
-                        result[idx] = wrong_blocks;
-                        return;
-                    }
-                }
-            }
-        }
+  int wrong_blocks = 0;
+  for (int py = 0; py < p_height; py++) {
+    for (int pz = 0; pz < p_depth; pz++) {
+      for (int px = 0; px < p_width; px++) {
+        // if ((ignore_air && schem_block != air_id) || (air_as_any &&
+        // pattern_block != air_id)) {
+        //     continue; // TODO: PROBLEM!
+        // }
+
+        wrong_blocks +=
+            schem[(x + px) + width * ((z + pz) + (y + py) * depth)] !=
+            pattern[px + p_width * (pz + py * p_depth)];
+      }
    }
-    
-    int idx = x + z * width + y * width * depth;
-    result[idx] = wrong_blocks;
+  }
+
+  int idx = x + z * width + y * width * depth;
+  result[idx] = wrong_blocks;
 }
--- a/schemsearch-ocl-matcher/src/lib.rs
+++ b/schemsearch-ocl-matcher/src/lib.rs
@@ -1,8 +1,9 @@
-use std::sync::OnceLock;
-use ocl::{Buffer, MemFlags, ProQue, core};
-use ocl::SpatialDims::Three;
-use schemsearch_common::{Match, SearchBehavior};
 use math::round::ceil;
+use ocl::SpatialDims::Three;
+use ocl::{core, Buffer, Image, MemFlags, ProQue};
+use schemsearch_common::{time, Match, SearchBehavior};
+use std::sync::OnceLock;
+use std::time;

 const KERNEL: &str = include_str!("kernel.cl");

@@ -20,7 +21,15 @@ pub fn ocl_search(
    air_id: i32,
    search_behavior: SearchBehavior,
 ) -> Result<Vec<Match>, String> {
-    search_ocl(schem, schem_size, pattern, pattern_size, air_id, search_behavior).map_err(|e| e.to_string())
+    search_ocl(
+        schem,
+        schem_size,
+        pattern,
+        pattern_size,
+        air_id,
+        search_behavior,
+    )
+    .map_err(|e| e.to_string())
 }

 fn search_ocl(
@@ -38,59 +47,79 @@ fn search_ocl(
    let schem_width = schem_size[0];
    let schem_height = schem_size[1];
    let schem_length = schem_size[2];
-    
+
    let pattern_blocks = (pattern_width * pattern_height * pattern_length) as f32;
-    
-    let skip_amount = ceil((pattern_blocks * (1.0 - search_behavior.threshold)) as f64, 0) as i32;
+
+    let skip_amount = ceil(
+        (pattern_blocks * (1.0 - search_behavior.threshold)) as f64,
+        0,
+    ) as i32;

    let cell = &PRO_QUEU_CELL;
-    let mut pro_que = cell.get_or_init(|| {
-        ProQue::builder()
-            .src(KERNEL)
-            .build().unwrap()
-    }).clone();
+    let mut pro_que = time!(get_pro_que, {
+        cell.get_or_init(|| ProQue::builder().src(KERNEL).build().unwrap())
+            .clone()
+    });

+    pro_que.set_dims(Three(schem_width, schem_length, schem_height));

-    pro_que.set_dims(Three(schem_width, schem_height, schem_length));
+    let buffer = time!(create_result_buffer, {
+        Buffer::builder()
+            .queue(pro_que.queue().clone())
+            .flags(MemFlags::new().read_write())
+            .fill_val(-1)
+            .len(schem.len())
+            .build()
+    })?;

-    let buffer = Buffer::builder()
-        .queue(pro_que.queue().clone())
-        .flags(MemFlags::new().read_write())
-        .fill_val(-1)
-        .len(schem.len())
-        .build()?;
+    let schem_buffer = time!(create_schen_buffer, {
+        create_schem_buffer(schem, &pro_que)
+    })?;

-    let schem_buffer = create_schem_buffer(schem, &pro_que)?;
+    let pattern_buffer = time!(create_pattern_buffer, {
+        create_schem_buffer(pattern, &pro_que)
+    })?;

-    let pattern_buffer = create_schem_buffer(pattern, &pro_que)?;
+    let kernel = time!(create_kernel, {
+        pro_que
+            .kernel_builder("add")
+            .arg(&buffer)
+            .arg(&schem_buffer)
+            .arg(&pattern_buffer)
+            .arg(schem_width as i32)
+            .arg(schem_height as i32)
+            .arg(schem_length as i32)
+            .arg(pattern_width as i32)
+            .arg(pattern_height as i32)
+            .arg(pattern_length as i32)
+            .arg(air_id)
+            .arg(search_behavior.ignore_air as u32)
+            .arg(search_behavior.air_as_any as u32)
+            .arg(skip_amount)
+            .build()
+    })?;

-    let kernel = pro_que.kernel_builder("add")
-        .arg(&buffer)
-        .arg(&schem_buffer)
-        .arg(&pattern_buffer)
-        .arg(pattern_width as i32)
-        .arg(pattern_height as i32)
-        .arg(pattern_length as i32)
-        .arg(air_id) // air_id
-        .arg(search_behavior.ignore_air as u32) // ignore_air
-        .arg(search_behavior.air_as_any as u32) // air_as_any
-        .arg(skip_amount)
-        .build()?;
+    unsafe {
+        time!(run_kernel, { kernel.enq() })?;
+    }

-    unsafe { kernel.enq()?; }
+    let mut vec = vec![0; buffer.len()];
+    time!(read_buffer, {
+        buffer.read(&mut vec).enq()?;
+    });

-    let mut vec = vec![0i32; buffer.len()];
-    buffer.read(&mut vec).enq()?;
-    
-    Ok(vec.into_iter().enumerate().filter(|(_, v)| *v < skip_amount && *v != -1).map(|(i, v)| {
-        Match {
+    Ok(vec
+        .into_iter()
+        .enumerate()
+        .filter(|(_, v)| *v < skip_amount && *v != -1)
+        .map(|(i, v)| Match {
            x: (i % schem_width) as u16,
            y: ((i / (schem_width * schem_length)) % schem_height) as u16,
            z: ((i / schem_width) % schem_length) as u16,

            percent: (pattern_blocks - v as f32) / pattern_blocks,
-        }
-    }).collect())
+        })
+        .collect())
 }

 fn create_schem_buffer(pattern: &[i32], pro_que: &ProQue) -> ocl::Result<Buffer<i32>> {
@@ -98,6 +127,7 @@ fn create_schem_buffer(pattern: &[i32], pro_que: &ProQue) -> ocl::Result<Buffer<
        .queue(pro_que.queue().clone())
        .flags(MemFlags::new().read_only())
        .len(pattern.len())
+        // Host Memory Map?
        .copy_host_slice(pattern)
        .build()
-}
+}