/* Thin wrapper around the _Pragma operator; going through a function-like
 * macro lets the argument be macro-expanded before _Pragma sees it. */
56#define xt_Pragma(args) _Pragma(args)
/* Emits an "acc" pragma built from args. STR is defined outside this view;
 * presumably it stringifies its argument -- TODO confirm. */
57#define XtPragmaACC(args) xt_Pragma(STR(acc args))
/* Alternative definition that expands to nothing. NOTE(review): presumably
 * selected by a preprocessor conditional that is not visible here. */
59#define XtPragmaACC(args)
72 size_t count, ssize_t *restrict displs,
const uint8_t *restrict src,
73 uint8_t *restrict dst,
enum xt_memtype memtype);
75 size_t count, ssize_t *restrict displs,
const uint16_t *restrict src,
76 uint16_t *restrict dst,
enum xt_memtype memtype);
78 size_t count, ssize_t *restrict displs,
const uint32_t *restrict src,
79 uint32_t *restrict dst,
enum xt_memtype memtype);
81 size_t count, ssize_t *restrict displs,
const uint32_t *restrict src,
82 uint32_t *restrict dst,
enum xt_memtype memtype);
84 size_t count, ssize_t *restrict displs,
const uint32_t *restrict src,
85 uint32_t *restrict dst,
enum xt_memtype memtype);
87 size_t count, ssize_t *restrict displs,
const uint64_t *restrict src,
88 uint64_t *restrict dst,
enum xt_memtype memtype);
90 size_t count, ssize_t *restrict displs,
const uint64_t *restrict src,
91 uint64_t *restrict dst,
enum xt_memtype memtype);
93 size_t count, ssize_t *restrict displs,
const uint32_t *restrict src,
94 uint32_t *restrict dst,
enum xt_memtype memtype);
96 size_t count, ssize_t *restrict displs,
const uint64_t *restrict src,
97 uint64_t (*restrict dst)[4],
enum xt_memtype memtype);
100 size_t count, ssize_t *restrict displs,
const uint8_t *restrict src,
101 uint8_t *restrict dst,
enum xt_memtype memtype);
103 size_t count, ssize_t *restrict displs,
const uint16_t *restrict src,
104 uint16_t *restrict dst,
enum xt_memtype memtype);
106 size_t count, ssize_t *restrict displs,
const uint32_t *restrict src,
107 uint32_t *restrict dst,
enum xt_memtype memtype);
109 size_t count, ssize_t *restrict displs,
const uint32_t (*restrict src)[2],
110 uint32_t *restrict dst,
enum xt_memtype memtype);
112 size_t count, ssize_t *restrict displs,
const uint32_t (*restrict src)[3],
113 uint32_t *restrict dst,
enum xt_memtype memtype);
115 size_t count, ssize_t *restrict displs,
const uint64_t *restrict src,
116 uint64_t *restrict dst,
enum xt_memtype memtype);
118 size_t count, ssize_t *restrict displs,
const uint64_t (*restrict src)[2],
119 uint64_t *restrict dst,
enum xt_memtype memtype);
121 size_t count, ssize_t *restrict displs,
const uint32_t (*restrict src)[5],
122 uint32_t *restrict dst,
enum xt_memtype memtype);
124 size_t count, ssize_t *restrict displs,
const uint64_t (*restrict src)[4],
125 uint64_t *restrict dst,
enum xt_memtype memtype);
132 {.base_pack_size = 2,
136 {.base_pack_size = 4,
140 {.base_pack_size = 4,
144 {.base_pack_size = 8,
148 {.base_pack_size = 4,
152 {.base_pack_size = 8,
156 {.base_pack_size = 4,
160 {.base_pack_size = 8,
179 size_t total_displs_size = 0, count = ddt->
count;
180 for (
size_t i = 0; i < count; ++i)
185 size_t buffer_size = total_displs_size *
sizeof(*displs);
194 for (
size_t i = 0, offset = 0; i < count; ++i) {
195 ddt->
data[i].
displs[memtype] = displs + offset;
/* add_rhs_byte_displ(rtype, ptr, disp):
 *   Offsets ptr by disp BYTES (the arithmetic is done on a
 *   const unsigned char * view of ptr) and yields the resulting address as a
 *   const rtype *. The result is read-only, i.e. meant for the right-hand
 *   side of an assignment.
 *   NOTE(review): the intermediate (const void *) cast presumably silences
 *   cast-alignment warnings; the caller remains responsible for the address
 *   being suitably aligned for rtype -- confirm. */
202#define add_rhs_byte_displ(rtype,ptr,disp) \
203 ((const rtype *)(const void *)((const unsigned char *)(ptr) + (disp)))
206 size_t count, ssize_t *restrict displs,
const uint8_t *restrict src,
207 uint8_t *restrict dst,
enum xt_memtype memtype) {
212 parallel loop independent deviceptr(src, dst, displs)
214 for (
size_t i = 0; i < count; ++i)
219 size_t count, ssize_t *restrict displs,
const uint16_t *restrict src,
220 uint16_t *restrict dst,
enum xt_memtype memtype) {
225 parallel loop independent deviceptr(src, dst, displs)
227 for (
size_t i = 0; i < count; ++i)
232 size_t count, ssize_t *restrict displs,
const uint32_t *restrict src,
233 uint32_t *restrict dst,
enum xt_memtype memtype) {
238 parallel loop independent deviceptr(src, dst, displs)
240 for (
size_t i = 0; i < count; ++i)
245 size_t count, ssize_t *restrict displs,
const uint32_t *restrict src,
246 uint32_t *restrict dst,
enum xt_memtype memtype) {
247 uint32_t (*restrict dst_)[2] = (uint32_t(*)[2])dst;
252 parallel loop independent deviceptr(src, dst_, displs)
254 for (
size_t i = 0; i < count; ++i) {
257 for (
int j = 0; j < 2; ++j) dst_[i][j] = src_32[j];
262 size_t count, ssize_t *restrict displs,
const uint32_t *restrict src,
263 uint32_t *restrict dst,
enum xt_memtype memtype) {
264 uint32_t (*restrict dst_)[3] = (uint32_t(*)[3])dst;
269 parallel loop independent deviceptr(src, dst_, displs)
271 for (
size_t i = 0; i < count; ++i) {
274 for (
int j = 0; j < 3; ++j) dst_[i][j] = src_32[j];
279 size_t count, ssize_t *restrict displs,
const uint64_t *restrict src,
280 uint64_t *restrict dst,
enum xt_memtype memtype) {
285 parallel loop independent deviceptr(src, dst, displs)
287 for (
size_t i = 0; i < count; ++i)
292 size_t count, ssize_t *restrict displs,
const uint64_t *restrict src,
293 uint64_t *restrict dst,
enum xt_memtype memtype) {
294 uint64_t (*restrict dst_)[2] = (uint64_t(*)[2])dst;
299 parallel loop independent deviceptr(src, dst_, displs)
301 for (
size_t i = 0; i < count; ++i) {
304 for (
int j = 0; j < 2; ++j) dst_[i][j] = src_64[j];
309 size_t count, ssize_t *restrict displs,
const uint32_t *restrict src,
310 uint32_t *restrict dst,
enum xt_memtype memtype) {
311 uint32_t (*restrict dst_)[5] = (uint32_t(*)[5])dst;
316 parallel loop independent deviceptr(src, dst_, displs)
318 for (
size_t i = 0; i < count; ++i) {
321 for (
int j = 0; j < 5; ++j) dst_[i][j] = src_32[j];
326 size_t count, ssize_t *restrict displs,
const uint64_t *restrict src,
327 uint64_t (*restrict dst)[4],
enum xt_memtype memtype) {
332 parallel loop independent deviceptr(src, dst, displs)
334 for (
size_t i = 0; i < count; ++i) {
337 for (
int j = 0; j < 4; ++j) dst[i][j] = src_64[j];
346 size_t dst_offset = 0;
351 size_t count = ddt->
count;
354 for (
size_t i = 0; i < count; ++i) {
360 displ_count, ddt->
data[i].
displs[memtype], src,
361 (
unsigned char *)dst + dst_offset, memtype);
368void xt_ddt_pack(MPI_Datatype mpi_ddt,
const void *src,
void *dst) {
379#if defined __GNUC__ && __GNUC__ <= 11
385 if (src_memtype != dst_memtype) {
399 if (src_memtype != dst_memtype) {
400 xt_gpu_memcpy(orig_dst, dst, pack_size, dst_memtype, src_memtype);
409 size_t count, ssize_t *restrict displs,
const uint8_t *restrict src,
410 uint8_t *restrict dst,
enum xt_memtype memtype) {
415 parallel loop independent deviceptr(src, dst, displs)
417 for (
size_t i = 0; i < count; ++i)
418 dst[displs[i]] = src[i];
423 size_t count, ssize_t *restrict displs,
const uint16_t *restrict src,
424 uint16_t *restrict dst,
enum xt_memtype memtype) {
429 parallel loop independent deviceptr(src, dst, displs)
431 for (
size_t i = 0; i < count; ++i) {
432 uint16_t *dst_ = (
void *)((
unsigned char *)dst + displs[i]);
438 size_t count, ssize_t *restrict displs,
const uint32_t *restrict src,
439 uint32_t *restrict dst,
enum xt_memtype memtype) {
444 parallel loop independent deviceptr(src, dst, displs)
446 for (
size_t i = 0; i < count; ++i) {
447 uint32_t *dst_ = (
void *)((
unsigned char *)dst + displs[i]);
453 size_t count, ssize_t *restrict displs,
const uint32_t (*restrict src)[2],
454 uint32_t *restrict dst,
enum xt_memtype memtype) {
459 parallel loop independent deviceptr(src, dst, displs)
461 for (
size_t i = 0; i < count; ++i) {
462 uint32_t *dst_32 = (
void *)((
unsigned char *)dst + displs[i]);
463 dst_32[0] = src[i][0];
464 dst_32[1] = src[i][1];
469 size_t count, ssize_t *restrict displs,
const uint32_t (*restrict src)[3],
470 uint32_t *restrict dst,
enum xt_memtype memtype) {
475 parallel loop independent deviceptr(src, dst, displs)
477 for (
size_t i = 0; i < count; ++i) {
478 uint32_t *dst_32 = (
void *)((
unsigned char *)dst + displs[i]);
479 dst_32[0] = src[i][0];
480 dst_32[1] = src[i][1];
481 dst_32[2] = src[i][2];
486 size_t count, ssize_t *restrict displs,
const uint64_t *restrict src,
487 uint64_t *restrict dst,
enum xt_memtype memtype) {
492 parallel loop independent deviceptr(src, dst, displs)
494 for (
size_t i = 0; i < count; ++i) {
495 uint64_t *dst_ = (
void *)((
unsigned char *)dst + displs[i]);
501 size_t count, ssize_t *restrict displs,
const uint64_t (*restrict src)[2],
502 uint64_t *restrict dst,
enum xt_memtype memtype) {
507 parallel loop independent deviceptr(src, dst, displs)
509 for (
size_t i = 0; i < count; ++i) {
510 uint64_t *dst_64 = (
void *)((
unsigned char *)dst + displs[i]);
511 dst_64[0] = src[i][0];
512 dst_64[1] = src[i][1];
517 size_t count, ssize_t *restrict displs,
const uint32_t (*restrict src)[5],
518 uint32_t *restrict dst,
enum xt_memtype memtype) {
523 parallel loop independent deviceptr(src, dst, displs)
525 for (
size_t i = 0; i < count; ++i) {
526 uint32_t *dst_32 = (
void *)((
unsigned char *)dst + displs[i]);
527 dst_32[0] = src[i][0];
528 dst_32[1] = src[i][1];
529 dst_32[2] = src[i][2];
530 dst_32[3] = src[i][3];
531 dst_32[4] = src[i][4];
536 size_t count, ssize_t *restrict displs,
const uint64_t (*restrict src)[4],
537 uint64_t *restrict dst,
enum xt_memtype memtype) {
542 parallel loop independent deviceptr(src, dst, displs)
544 for (
size_t i = 0; i < count; ++i) {
545 uint64_t *dst_64 = (
void *)((
unsigned char *)dst + displs[i]);
546 dst_64[0] = src[i][0];
547 dst_64[1] = src[i][1];
548 dst_64[2] = src[i][2];
549 dst_64[3] = src[i][3];
558 size_t src_offset = 0;
563 size_t count = ddt->
count;
566 for (
size_t i = 0; i < count; ++i) {
591 if (src_memtype != dst_memtype) {
594 xt_gpu_memcpy(src__, src, pack_size, dst_memtype, src_memtype);
607 if (src_memtype != dst_memtype)
xt_gpu_free(src__, dst_memtype);
add versions of standard API functions not returning on error
int displs_available[XT_MEMTYPE_COUNT]
struct xt_ddt_data data[]
ssize_t * displs[XT_MEMTYPE_COUNT]
xt_ddt_kernel_func unpack
static void xt_ddt_unpack_96(size_t count, ssize_t *restrict displs, const uint32_t(*restrict src)[3], uint32_t *restrict dst, enum xt_memtype memtype)
static void xt_ddt_unpack_64(size_t count, ssize_t *restrict displs, const uint64_t *restrict src, uint64_t *restrict dst, enum xt_memtype memtype)
static void xt_ddt_pack_16(size_t count, ssize_t *restrict displs, const uint16_t *restrict src, uint16_t *restrict dst, enum xt_memtype memtype)
static void xt_ddt_pack_256(size_t count, ssize_t *restrict displs, const uint64_t *restrict src, uint64_t(*restrict dst)[4], enum xt_memtype memtype)
static void xt_ddt_pack_32_2(size_t count, ssize_t *restrict displs, const uint32_t *restrict src, uint32_t *restrict dst, enum xt_memtype memtype)
static void xt_ddt_pack_160(size_t count, ssize_t *restrict displs, const uint32_t *restrict src, uint32_t *restrict dst, enum xt_memtype memtype)
static void xt_ddt_pack_96(size_t count, ssize_t *restrict displs, const uint32_t *restrict src, uint32_t *restrict dst, enum xt_memtype memtype)
void xt_ddt_unpack_internal(Xt_ddt ddt, const void *src, void *dst, enum xt_memtype memtype)
void xt_ddt_unpack(MPI_Datatype mpi_ddt, const void *src, void *dst)
static void xt_ddt_pack_128(size_t count, ssize_t *restrict displs, const uint64_t *restrict src, uint64_t *restrict dst, enum xt_memtype memtype)
size_t xt_ddt_get_pack_size_internal(Xt_ddt ddt)
void xt_ddt_pack_internal(Xt_ddt ddt, const void *src, void *dst, enum xt_memtype memtype)
static void xt_ddt_unpack_32(size_t count, ssize_t *restrict displs, const uint32_t *restrict src, uint32_t *restrict dst, enum xt_memtype memtype)
#define XtPragmaACC(args)
size_t xt_ddt_get_pack_size(MPI_Datatype mpi_ddt)
#define add_rhs_byte_displ(rtype, ptr, disp)
static void xt_ddt_unpack_256(size_t count, ssize_t *restrict displs, const uint64_t(*restrict src)[4], uint64_t *restrict dst, enum xt_memtype memtype)
static void xt_ddt_pack_32(size_t count, ssize_t *restrict displs, const uint32_t *restrict src, uint32_t *restrict dst, enum xt_memtype memtype)
struct xt_ddt_kernels xt_ddt_valid_kernels[]
static void xt_ddt_unpack_32_2(size_t count, ssize_t *restrict displs, const uint32_t(*restrict src)[2], uint32_t *restrict dst, enum xt_memtype memtype)
void xt_ddt_pack(MPI_Datatype mpi_ddt, const void *src, void *dst)
static void xt_ddt_pack_64(size_t count, ssize_t *restrict displs, const uint64_t *restrict src, uint64_t *restrict dst, enum xt_memtype memtype)
static void xt_ddt_copy_displs(Xt_ddt ddt, enum xt_memtype memtype)
static void xt_ddt_unpack_160(size_t count, ssize_t *restrict displs, const uint32_t(*restrict src)[5], uint32_t *restrict dst, enum xt_memtype memtype)
static void xt_ddt_pack_8(size_t count, ssize_t *restrict displs, const uint8_t *restrict src, uint8_t *restrict dst, enum xt_memtype memtype)
static void xt_ddt_unpack_8(size_t count, ssize_t *restrict displs, const uint8_t *restrict src, uint8_t *restrict dst, enum xt_memtype memtype)
static void xt_ddt_unpack_16(size_t count, ssize_t *restrict displs, const uint16_t *restrict src, uint16_t *restrict dst, enum xt_memtype memtype)
static void xt_ddt_unpack_128(size_t count, ssize_t *restrict displs, const uint64_t(*restrict src)[2], uint64_t *restrict dst, enum xt_memtype memtype)
utility routines for manual handling of MPI DDTs
Xt_ddt xt_ddt_from_mpi_ddt(MPI_Datatype mpi_ddt)
internal utility routines for manual handling of MPI DDTs
void(* xt_ddt_kernel_func)(size_t, ssize_t *, const void *, void *, enum xt_memtype)
void * xt_gpu_malloc(size_t alloc_size, enum xt_memtype memtype)
enum xt_memtype xt_gpu_get_memtype(const void *ptr)
void xt_gpu_memcpy(void *dst, void const *src, size_t buffer_size, enum xt_memtype dst_memtype, enum xt_memtype src_memtype)
void xt_gpu_free(void *ptr, enum xt_memtype memtype)
#define XT_GPU_INSTR_PUSH(arg)
static const idxlist_unpack unpack[]