Refactor the code structure and improve performance by optimizing the OpenCL kernel and adding timing macros. Update the release profile settings in Cargo.toml, and reorganize main.rs and sinks.rs for better readability.

This commit is contained in:
2025-04-11 16:04:10 +02:00
parent c554b1f164
commit b04c01e737
11 changed files with 395 additions and 184 deletions

View File

@@ -1,51 +1,35 @@
__kernel void add(__global int* result,
__global uint* schem,
__global uint* pattern,
const int p_width,
const int p_height,
const int p_depth,
const uint air_id,
const int ignore_air,
const int air_as_any,
const int skipamount) {
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int width = get_global_size(0);
int height = get_global_size(1);
int depth = get_global_size(2);
if (x > width - p_width || y > height - p_height || z > depth - p_depth) {
return;
}
int wrong_blocks = 0;
for (int py = 0; py < p_height; py++) {
for (int pz = 0; pz < p_depth; pz++) {
for (int px = 0; px < p_width; px++) {
int s_idx = (x + px) + width * ((z + pz) + (y + py) * depth);
int p_idx = px + p_width * (pz + py * p_depth);
uint schem_block = schem[s_idx];
uint pattern_block = pattern[p_idx];
// Use 3d_img
// Fewer buffer allocations
// Parallelize the pattern with local workers?
// To Match on GPU
// Fewer workers, more parameters!
// Pattern as a kernel constant
if ((ignore_air && schem_block != air_id) || (air_as_any && pattern_block != air_id)) {
continue;
}
__kernel void add(__global int *result, __global uint *schem,
__constant uint *pattern, const int width, const int height,
const int depth, const int p_width, const int p_height,
const int p_depth, const uint air_id, const int ignore_air,
const int air_as_any, const int skipamount) {
int x = get_global_id(0);
int y = get_global_id(2);
int z = get_global_id(1);
if (schem_block != pattern_block) {
wrong_blocks++;
if (wrong_blocks > skipamount) {
int idx = x + z * width + y * width * depth;
result[idx] = wrong_blocks;
return;
}
}
}
}
int wrong_blocks = 0;
for (int py = 0; py < p_height; py++) {
for (int pz = 0; pz < p_depth; pz++) {
for (int px = 0; px < p_width; px++) {
// if ((ignore_air && schem_block != air_id) || (air_as_any &&
// pattern_block != air_id)) {
// continue; // TODO: PROBLEM!
// }
wrong_blocks +=
schem[(x + px) + width * ((z + pz) + (y + py) * depth)] !=
pattern[px + p_width * (pz + py * p_depth)];
}
}
int idx = x + z * width + y * width * depth;
result[idx] = wrong_blocks;
}
int idx = x + z * width + y * width * depth;
result[idx] = wrong_blocks;
}

View File

@@ -1,8 +1,9 @@
use std::sync::OnceLock;
use ocl::{Buffer, MemFlags, ProQue, core};
use ocl::SpatialDims::Three;
use schemsearch_common::{Match, SearchBehavior};
use math::round::ceil;
use ocl::SpatialDims::Three;
use ocl::{core, Buffer, Image, MemFlags, ProQue};
use schemsearch_common::{time, Match, SearchBehavior};
use std::sync::OnceLock;
use std::time;
const KERNEL: &str = include_str!("kernel.cl");
@@ -20,7 +21,15 @@ pub fn ocl_search(
air_id: i32,
search_behavior: SearchBehavior,
) -> Result<Vec<Match>, String> {
search_ocl(schem, schem_size, pattern, pattern_size, air_id, search_behavior).map_err(|e| e.to_string())
search_ocl(
schem,
schem_size,
pattern,
pattern_size,
air_id,
search_behavior,
)
.map_err(|e| e.to_string())
}
fn search_ocl(
@@ -38,59 +47,79 @@ fn search_ocl(
let schem_width = schem_size[0];
let schem_height = schem_size[1];
let schem_length = schem_size[2];
let pattern_blocks = (pattern_width * pattern_height * pattern_length) as f32;
let skip_amount = ceil((pattern_blocks * (1.0 - search_behavior.threshold)) as f64, 0) as i32;
let skip_amount = ceil(
(pattern_blocks * (1.0 - search_behavior.threshold)) as f64,
0,
) as i32;
let cell = &PRO_QUEU_CELL;
let mut pro_que = cell.get_or_init(|| {
ProQue::builder()
.src(KERNEL)
.build().unwrap()
}).clone();
let mut pro_que = time!(get_pro_que, {
cell.get_or_init(|| ProQue::builder().src(KERNEL).build().unwrap())
.clone()
});
pro_que.set_dims(Three(schem_width, schem_length, schem_height));
pro_que.set_dims(Three(schem_width, schem_height, schem_length));
let buffer = time!(create_result_buffer, {
Buffer::builder()
.queue(pro_que.queue().clone())
.flags(MemFlags::new().read_write())
.fill_val(-1)
.len(schem.len())
.build()
})?;
let buffer = Buffer::builder()
.queue(pro_que.queue().clone())
.flags(MemFlags::new().read_write())
.fill_val(-1)
.len(schem.len())
.build()?;
let schem_buffer = time!(create_schen_buffer, {
create_schem_buffer(schem, &pro_que)
})?;
let schem_buffer = create_schem_buffer(schem, &pro_que)?;
let pattern_buffer = time!(create_pattern_buffer, {
create_schem_buffer(pattern, &pro_que)
})?;
let pattern_buffer = create_schem_buffer(pattern, &pro_que)?;
let kernel = time!(create_kernel, {
pro_que
.kernel_builder("add")
.arg(&buffer)
.arg(&schem_buffer)
.arg(&pattern_buffer)
.arg(schem_width as i32)
.arg(schem_height as i32)
.arg(schem_length as i32)
.arg(pattern_width as i32)
.arg(pattern_height as i32)
.arg(pattern_length as i32)
.arg(air_id)
.arg(search_behavior.ignore_air as u32)
.arg(search_behavior.air_as_any as u32)
.arg(skip_amount)
.build()
})?;
let kernel = pro_que.kernel_builder("add")
.arg(&buffer)
.arg(&schem_buffer)
.arg(&pattern_buffer)
.arg(pattern_width as i32)
.arg(pattern_height as i32)
.arg(pattern_length as i32)
.arg(air_id) // air_id
.arg(search_behavior.ignore_air as u32) // ignore_air
.arg(search_behavior.air_as_any as u32) // air_as_any
.arg(skip_amount)
.build()?;
unsafe {
time!(run_kernel, { kernel.enq() })?;
}
unsafe { kernel.enq()?; }
let mut vec = vec![0; buffer.len()];
time!(read_buffer, {
buffer.read(&mut vec).enq()?;
});
let mut vec = vec![0i32; buffer.len()];
buffer.read(&mut vec).enq()?;
Ok(vec.into_iter().enumerate().filter(|(_, v)| *v < skip_amount && *v != -1).map(|(i, v)| {
Match {
Ok(vec
.into_iter()
.enumerate()
.filter(|(_, v)| *v < skip_amount && *v != -1)
.map(|(i, v)| Match {
x: (i % schem_width) as u16,
y: ((i / (schem_width * schem_length)) % schem_height) as u16,
z: ((i / schem_width) % schem_length) as u16,
percent: (pattern_blocks - v as f32) / pattern_blocks,
}
}).collect())
})
.collect())
}
fn create_schem_buffer(pattern: &[i32], pro_que: &ProQue) -> ocl::Result<Buffer<i32>> {
@@ -98,6 +127,7 @@ fn create_schem_buffer(pattern: &[i32], pro_que: &ProQue) -> ocl::Result<Buffer<
.queue(pro_que.queue().clone())
.flags(MemFlags::new().read_only())
.len(pattern.len())
// Host Memory Map?
.copy_host_slice(pattern)
.build()
}
}