Highly Efficient FFT for Exascale: HeFFTe v2.4
Loading...
Searching...
No Matches
heffte_reshape3d.h
1/*
2 -- heFFTe --
3 Univ. of Tennessee, Knoxville
4 @date
5*/
6
7#ifndef HEFFTE_RESHAPE3D_H
8#define HEFFTE_RESHAPE3D_H
9
10#include "heffte_plan_logic.h"
11#include "heffte_backends.h"
12
29namespace heffte {
30
31#ifdef Heffte_ENABLE_CUDA
32namespace gpu { using namespace cuda; }
33#else
34#ifdef Heffte_ENABLE_ROCM
35namespace gpu { using namespace rocm; }
36#endif
37#ifdef Heffte_ENABLE_ONEAPI
38namespace gpu { using namespace oneapi; }
39#endif
40#endif
41
// Forward declaration only; the definition is not part of this listing.
// Generates an unpack plan for the case where the boxes and the destination do not have the same order.
// NOTE(review): proc, offset, sizes and plans are non-const references, presumably the outputs - confirm against the definition.
51template<typename index>
52void compute_overlap_map_transpose_pack(int me, int nprocs, box3d<index> const destination, std::vector<box3d<index>> const &boxes,
53 std::vector<int> &proc, std::vector<int> &offset, std::vector<int> &sizes, std::vector<pack_plan_3d<index>> &plans);
54
/*!
 * \brief Base reshape interface.
 *
 * Stores the input and output sizes of a reshape and declares one pure-virtual apply()
 * overload per supported scalar type (float, double and their std::complex variants).
 * It also owns two CPU scratch vectors that derived classes can use when GPU-aware
 * communication has been disabled.
 */
template<typename index>
class reshape3d_base{
public:
    //! \brief Constructor that sets the input and output sizes.
    reshape3d_base(index cinput_size, index coutput_size) : input_size(cinput_size), output_size(coutput_size){}
    //! \brief Default virtual destructor.
    virtual ~reshape3d_base() = default;
    //! \brief Apply the reshape, single precision.
    virtual void apply(int batch_size, float const source[], float destination[], float workspace[]) const = 0;
    //! \brief Apply the reshape, double precision.
    virtual void apply(int batch_size, double const source[], double destination[], double workspace[]) const = 0;
    //! \brief Apply the reshape, single precision complex.
    virtual void apply(int batch_size, std::complex<float> const source[], std::complex<float> destination[], std::complex<float> workspace[]) const = 0;
    //! \brief Apply the reshape, double precision complex.
    virtual void apply(int batch_size, std::complex<double> const source[], std::complex<double> destination[], std::complex<double> workspace[]) const = 0;

    //! \brief Returns the input size (the method is spelled size_intput in this interface).
    index size_intput() const{ return input_size; }
    //! \brief Returns the output size.
    index size_output() const{ return output_size; }
    //! \brief Returns the workspace size, by default the sum of the input and output sizes.
    virtual size_t size_workspace() const{
        // add in size_t so the sum cannot overflow a narrow index type
        return static_cast<size_t>(input_size) + static_cast<size_t>(output_size);
    }

protected:
    //! \brief Stores the size of the input.
    index const input_size;
    //! \brief Stores the size of the output.
    index const output_size;

    // buffers to be used in the no-gpu-aware algorithm for the temporary cpu storage
    // the no-gpu-aware version alleviates the latency when working with small FFTs
    // hence the cpu buffers will be small and will not cause issues
    // note that the main API accepts a GPU buffer for scratch work and cannot be used here

    /*!
     * \brief Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
     *
     * The request for \b num_entries values of scalar_type is converted to the equivalent
     * number of floats, the send vector is resized to that count and its storage is
     * returned reinterpreted as scalar_type.
     */
    template<typename scalar_type> scalar_type* cpu_send_buffer(size_t num_entries) const{
        size_t float_entries = num_entries * sizeof(scalar_type) / sizeof(float);
        send_unaware.resize(float_entries);
        return reinterpret_cast<scalar_type*>(send_unaware.data());
    }
    /*!
     * \brief Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
     *
     * Same as cpu_send_buffer() but backed by the receive vector.
     */
    template<typename scalar_type> scalar_type* cpu_recv_buffer(size_t num_entries) const{
        size_t float_entries = num_entries * sizeof(scalar_type) / sizeof(float);
        recv_unaware.resize(float_entries);
        return reinterpret_cast<scalar_type*>(recv_unaware.data());
    }
    //! \brief Temp buffers for the gpu-unaware algorithms (mutable so const methods can resize them).
    mutable std::vector<float> send_unaware;
    //! \brief Temp buffers for the gpu-unaware algorithms (mutable so const methods can resize them).
    mutable std::vector<float> recv_unaware;
};
109
// Returns the maximum workspace size used by the shapers.
// Entries holding a null pointer are skipped, so the result is 0 when none of the four shapers is set.
114template<typename index>
115inline size_t get_workspace_size(std::array<std::unique_ptr<reshape3d_base<index>>, 4> const &shapers){
116 size_t max_size = 0;
117 for(auto const &s : shapers) if (s) max_size = std::max(max_size, s->size_workspace());
118 return max_size;
119}
120
132template<typename location_tag, template<typename device> class packer, typename index>
133class reshape3d_alltoall : public reshape3d_base<index>, public backend::device_instance<location_tag>{
134public:
138 template<typename b, template<typename d> class p, typename i> friend std::unique_ptr<reshape3d_alltoall<b, p, i>>
139 make_reshape3d_alltoall(typename backend::device_instance<b>::stream_type, std::vector<box3d<i>> const&, std::vector<box3d<i>> const&, bool, MPI_Comm const);
140
142 void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final{
143 apply_base(batch_size, source, destination, workspace);
144 }
146 void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final{
147 apply_base(batch_size, source, destination, workspace);
148 }
150 void apply(int batch_size, std::complex<float> const source[], std::complex<float> destination[], std::complex<float> workspace[]) const override final{
151 apply_base(batch_size, source, destination, workspace);
152 }
154 void apply(int batch_size, std::complex<double> const source[], std::complex<double> destination[], std::complex<double> workspace[]) const override final{
155 apply_base(batch_size, source, destination, workspace);
156 }
157
159 template<typename scalar_type>
160 void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const;
161
163 size_t size_workspace() const override { return 2 * num_entries * packplan.size(); }
164
165private:
170 int input_size, int output_size, bool gpu_aware, MPI_Comm ccomm,
171 std::vector<pack_plan_3d<index>>&&, std::vector<pack_plan_3d<index>>&&,
172 std::vector<int>&&, std::vector<int>&&, int);
173
174 MPI_Comm const comm;
175 int const me, nprocs;
176 bool const use_gpu_aware;
177
178 std::vector<pack_plan_3d<index>> packplan, unpackplan;
179 std::vector<int> send_offset, recv_offset;
180 int const num_entries;
181};
182
205template<typename location_tag, template<typename device> class packer = direct_packer, typename index>
206std::unique_ptr<reshape3d_alltoall<location_tag, packer, index>>
208 std::vector<box3d<index>> const &input_boxes, std::vector<box3d<index>> const &output_boxes,
209 bool uses_gpu_aware, MPI_Comm const comm);
210
225template<typename location_tag, template<typename device> class packer, typename index>
226class reshape3d_alltoallv : public reshape3d_base<index>, public backend::device_instance<location_tag>{
227public:
231 template<typename b, template<typename d> class p, typename i> friend std::unique_ptr<reshape3d_alltoallv<b, p, i>>
232 make_reshape3d_alltoallv(typename backend::device_instance<b>::stream_type, std::vector<box3d<i>> const&, std::vector<box3d<i>> const&, bool, MPI_Comm const);
233
235 void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final{
236 apply_base(batch_size, source, destination, workspace);
237 }
239 void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final{
240 apply_base(batch_size, source, destination, workspace);
241 }
243 void apply(int batch_size, std::complex<float> const source[], std::complex<float> destination[], std::complex<float> workspace[]) const override final{
244 apply_base(batch_size, source, destination, workspace);
245 }
247 void apply(int batch_size, std::complex<double> const source[], std::complex<double> destination[], std::complex<double> workspace[]) const override final{
248 apply_base(batch_size, source, destination, workspace);
249 }
250
252 template<typename scalar_type>
253 void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const;
254
255private:
260 int input_size, int output_size,
261 bool gpu_aware, MPI_Comm new_comm, std::vector<int> const &pgroup,
262 std::vector<int> &&send_offset, std::vector<int> &&send_size, std::vector<int> const &send_proc,
263 std::vector<int> &&recv_offset, std::vector<int> &&recv_size, std::vector<int> const &recv_proc,
264 std::vector<pack_plan_3d<index>> &&packplan, std::vector<pack_plan_3d<index>> &&unpackplan);
265
266 MPI_Comm const comm;
267 int const me, nprocs;
268 bool const use_gpu_aware;
269
270 std::vector<int> const send_offset; // extraction loc for each send
271 std::vector<int> const send_size; // size of each send message
272 std::vector<int> const recv_offset; // insertion loc for each recv
273 std::vector<int> const recv_size; // size of each recv message
274 int const send_total, recv_total;
275
276 std::vector<pack_plan_3d<index>> const packplan, unpackplan;
277
// Per-rank bookkeeping with one entry for every rank listed in pgroup:
//   counts[src]        - message size taken from sizes[] for that rank (0 when the rank is not in proc)
//   displacements[src] - running sum of the sizes assigned so far, in pgroup order (0 when not in proc)
//   map[src]           - index into proc/sizes of the matching entry (-1 when not in proc)
// NOTE(review): the counts/displacements layout looks like the argument format of MPI_Alltoallv - confirm at the call site.
278 struct iotripple{
279 std::vector<int> counts, displacements, map;
// Builds the three arrays by searching proc for every rank of pgroup.
280 iotripple(std::vector<int> const &pgroup, std::vector<int> const &proc, std::vector<int> const &sizes) :
281 counts(pgroup.size(), 0), displacements(pgroup.size(), 0), map(pgroup.size(), -1)
282 {
// offset accumulates the sizes of all matches found so far
283 int offset = 0;
284 for(size_t src = 0; src < pgroup.size(); src++){
285 for(size_t i=0; i<proc.size(); i++){
// skip entries of proc that refer to a different rank
286 if (proc[i] != pgroup[src]) continue;
287 counts[src] = sizes[i];
288 displacements[src] = offset;
289 offset += sizes[i];
290 map[src] = i;
291 }
292 }
293 }
294
295 };
296
297 iotripple const send, recv;
298};
299
322template<typename location_tag, template<typename device> class packer = direct_packer, typename index>
323std::unique_ptr<reshape3d_alltoallv<location_tag, packer, index>>
325 std::vector<box3d<index>> const &input_boxes,
326 std::vector<box3d<index>> const &output_boxes,
327 bool use_gpu_aware,
328 MPI_Comm const comm);
329
340template<typename location_tag, template<typename device> class packer, typename index>
341class reshape3d_pointtopoint : public reshape3d_base<index>, public backend::device_instance<location_tag>{
342public:
346 template<typename b, template<typename d> class p, typename i> friend std::unique_ptr<reshape3d_pointtopoint<b, p, i>>
347 make_reshape3d_pointtopoint(typename backend::device_instance<b>::stream_type, std::vector<box3d<i>> const&, std::vector<box3d<i>> const&, reshape_algorithm, bool, MPI_Comm const);
348
350 void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final{
351 apply_base(batch_size, source, destination, workspace);
352 }
354 void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final{
355 apply_base(batch_size, source, destination, workspace);
356 }
358 void apply(int batch_size, std::complex<float> const source[], std::complex<float> destination[], std::complex<float> workspace[]) const override final{
359 apply_base(batch_size, source, destination, workspace);
360 }
362 void apply(int batch_size, std::complex<double> const source[], std::complex<double> destination[], std::complex<double> workspace[]) const override final{
363 apply_base(batch_size, source, destination, workspace);
364 }
365
367 template<typename scalar_type>
368 void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const;
369
371 template<typename scalar_type>
372 void no_gpuaware_send_recv(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const;
373
374private:
379 int input_size, int output_size, reshape_algorithm alg, bool gpu_aware, MPI_Comm ccomm,
380 std::vector<int> &&send_offset, std::vector<int> &&send_size, std::vector<int> &&send_proc,
381 std::vector<int> &&recv_offset, std::vector<int> &&recv_size, std::vector<int> &&recv_proc,
382 std::vector<int> &&recv_loc,
383 std::vector<pack_plan_3d<index>> &&packplan, std::vector<pack_plan_3d<index>> &&unpackplan);
384
385 MPI_Comm const comm;
386 int const me, nprocs;
387 bool const self_to_self;
388 reshape_algorithm const algorithm;
389 bool const use_gpu_aware;
390 mutable std::vector<MPI_Request> requests; // recv_proc.size() requests, but remove one if using self_to_self communication
391 mutable std::vector<MPI_Request> isends;
392
393 std::vector<int> const send_proc; // processor to send towards
394 std::vector<int> const send_offset; // extraction loc for each send
395 std::vector<int> const send_size; // size of each send message
396 std::vector<int> const recv_proc; // processor to receive from
397 std::vector<int> const recv_offset; // insertion loc for each recv
398 std::vector<int> const recv_size; // size of each recv message
399 std::vector<int> const recv_loc; // offset in the receive buffer (recv_offset refers to the destination buffer)
400 int const send_total, recv_total;
401
402 std::vector<pack_plan_3d<index>> const packplan, unpackplan;
403 int max_send_size;
404};
405
429template<typename location_tag, template<typename device> class packer = direct_packer, typename index>
430std::unique_ptr<reshape3d_pointtopoint<location_tag, packer, index>>
432 std::vector<box3d<index>> const &input_boxes,
433 std::vector<box3d<index>> const &output_boxes,
434 reshape_algorithm algorithm, bool use_gpu_aware,
435 MPI_Comm const comm);
436
443template<typename location_tag, typename index>
444class reshape3d_transpose : public reshape3d_base<index>, public backend::device_instance<location_tag>{
445public:
448 pack_plan_3d<index> const cplan) :
449 reshape3d_base<index>(cplan.size[0] * cplan.size[1] * cplan.size[2], cplan.size[0] * cplan.size[1] * cplan.size[2]),
450 backend::device_instance<location_tag>(q),
451 plan(cplan)
452 {}
453
455 void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final{
456 transpose(batch_size, source, destination, workspace);
457 }
459 void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final{
460 transpose(batch_size, source, destination, workspace);
461 }
463 void apply(int batch_size, std::complex<float> const source[], std::complex<float> destination[], std::complex<float> workspace[]) const override final{
464 transpose(batch_size, source, destination, workspace);
465 }
467 void apply(int batch_size, std::complex<double> const source[], std::complex<double> destination[], std::complex<double> workspace[]) const override final{
468 transpose(batch_size, source, destination, workspace);
469 }
470
471private:
472 template<typename scalar_type>
473 void transpose(int batch_size, scalar_type const *source, scalar_type *destination, scalar_type *workspace) const{
474 if (source == destination){ // in-place transpose will need workspace
475 backend::data_manipulator<location_tag>::copy_n(this->stream(), source, batch_size * this->input_size, workspace);
476 for(int j=0; j<batch_size; j++)
477 transpose_packer<location_tag>().unpack(this->stream(), plan, workspace + j * this->input_size,
478 destination + j * this->input_size);
479 }else{
480 for(int j=0; j<batch_size; j++)
481 transpose_packer<location_tag>().unpack(this->stream(), plan, source + j * this->input_size,
482 destination + j * this->input_size);
483 }
484 }
485
486 pack_plan_3d<index> const plan;
487};
488
504template<typename backend_tag, typename index>
505std::unique_ptr<reshape3d_base<index>> make_reshape3d(typename backend::device_instance<typename backend::buffer_traits<backend_tag>::location>::stream_type stream,
506 std::vector<box3d<index>> const &input_boxes,
507 std::vector<box3d<index>> const &output_boxes,
508 MPI_Comm const comm,
509 plan_options const options){
510 using location_tag = typename backend::buffer_traits<backend_tag>::location;
511
512 if (match(input_boxes, output_boxes)){
513 if (input_boxes[0].ordered_same_as(output_boxes[0])){
514 return std::unique_ptr<reshape3d_base<index>>();
515 }else{
516 int const me = mpi::comm_rank(comm);
517 std::vector<int> proc, offset, sizes;
518 std::vector<pack_plan_3d<index>> plans;
519
520 compute_overlap_map_transpose_pack(0, 1, output_boxes[me], {input_boxes[me]}, proc, offset, sizes, plans);
521
522 if (not plans.empty()){
523 return std::unique_ptr<reshape3d_base<index>>(new reshape3d_transpose<location_tag, index >(stream, plans[0]));
524 }else{
525 // when the number of indexes is very small, the current box can be empty
526 return std::unique_ptr<reshape3d_base<index>>();
527 }
528 }
529 }else{
531 if (input_boxes[0].ordered_same_as(output_boxes[0])){
532 return make_reshape3d_alltoallv<location_tag, direct_packer, index>(stream, input_boxes, output_boxes,
533 options.use_gpu_aware, comm);
534 }else{
535 return make_reshape3d_alltoallv<location_tag, transpose_packer, index>(stream, input_boxes, output_boxes,
536 options.use_gpu_aware, comm);
537 }
538 }else if (options.algorithm == reshape_algorithm::alltoall){
539 if (input_boxes[0].ordered_same_as(output_boxes[0])){
540 return make_reshape3d_alltoall<location_tag, direct_packer, index>(stream, input_boxes, output_boxes,
541 options.use_gpu_aware, comm);
542 }else{
543 return make_reshape3d_alltoall<location_tag, transpose_packer, index>(stream, input_boxes, output_boxes,
544 options.use_gpu_aware, comm);
545 }
546 }else{
547 if (input_boxes[0].ordered_same_as(output_boxes[0])){
548 return make_reshape3d_pointtopoint<location_tag, direct_packer, index>(stream, input_boxes, output_boxes,
549 options.algorithm, options.use_gpu_aware, comm);
550 }else{
551 return make_reshape3d_pointtopoint<location_tag, transpose_packer, index>(stream, input_boxes, output_boxes,
552 options.algorithm, options.use_gpu_aware, comm);
553 }
554 }
555 }
556}
557
558}
559
560#endif
Reshape algorithm based on the MPI_Alltoall() method.
Definition heffte_reshape3d.h:133
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition heffte_reshape3d.h:154
~reshape3d_alltoall()
Destructor, frees the comm generated by the constructor.
Definition heffte_reshape3d.h:136
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition heffte_reshape3d.h:150
size_t size_workspace() const override
The size of the workspace must include padding.
Definition heffte_reshape3d.h:163
friend std::unique_ptr< reshape3d_alltoall< b, p, i > > make_reshape3d_alltoall(typename backend::device_instance< b >::stream_type, std::vector< box3d< i > > const &, std::vector< box3d< i > > const &, bool, MPI_Comm const)
Factory method, use to construct instances of the class.
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition heffte_reshape3d.h:142
void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_alltoall::apply() algorithm for all scalar types.
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition heffte_reshape3d.h:146
Reshape algorithm based on the MPI_Alltoallv() method.
Definition heffte_reshape3d.h:226
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition heffte_reshape3d.h:247
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition heffte_reshape3d.h:243
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition heffte_reshape3d.h:239
~reshape3d_alltoallv()
Destructor, frees the comm generated by the constructor.
Definition heffte_reshape3d.h:229
friend std::unique_ptr< reshape3d_alltoallv< b, p, i > > make_reshape3d_alltoallv(typename backend::device_instance< b >::stream_type, std::vector< box3d< i > > const &, std::vector< box3d< i > > const &, bool, MPI_Comm const)
Factory method, use to construct instances of the class.
void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_alltoallv::apply() algorithm for all scalar types.
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition heffte_reshape3d.h:235
Base reshape interface.
Definition heffte_reshape3d.h:60
reshape3d_base(index cinput_size, index coutput_size)
Constructor that sets the input and output sizes.
Definition heffte_reshape3d.h:63
index const output_size
Stores the size of the output.
Definition heffte_reshape3d.h:86
virtual void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const =0
Apply the reshape, double precision complex.
virtual ~reshape3d_base()=default
Default virtual destructor.
index size_intput() const
Returns the input size.
Definition heffte_reshape3d.h:76
scalar_type * cpu_send_buffer(size_t num_entries) const
Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
Definition heffte_reshape3d.h:93
virtual void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const =0
Apply the reshape, single precision complex.
virtual void apply(int batch_size, double const source[], double destination[], double workspace[]) const =0
Apply the reshape, double precision.
index const input_size
Stores the size of the input.
Definition heffte_reshape3d.h:84
index size_output() const
Returns the output size.
Definition heffte_reshape3d.h:78
virtual void apply(int batch_size, float const source[], float destination[], float workspace[]) const =0
Apply the reshape, single precision.
scalar_type * cpu_recv_buffer(size_t num_entries) const
Allocates and returns a CPU buffer when GPU-Aware communication has been disabled.
Definition heffte_reshape3d.h:99
std::vector< float > send_unaware
Temp buffers for the gpu-unaware algorithms.
Definition heffte_reshape3d.h:105
virtual size_t size_workspace() const
Returns the workspace size.
Definition heffte_reshape3d.h:80
std::vector< float > recv_unaware
Temp buffers for the gpu-unaware algorithms.
Definition heffte_reshape3d.h:107
Reshape algorithm based on the MPI_Send() and MPI_Irecv() methods.
Definition heffte_reshape3d.h:341
~reshape3d_pointtopoint()=default
Default destructor (it is defaulted, so it does not itself free a comm).
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition heffte_reshape3d.h:350
void no_gpuaware_send_recv(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_pointtopoint::apply() algorithm that does not use GPU-Aware MPI.
void apply_base(int batch_size, scalar_type const source[], scalar_type destination[], scalar_type workspace[]) const
Templated reshape3d_pointtopoint::apply() algorithm for all scalar types.
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition heffte_reshape3d.h:358
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition heffte_reshape3d.h:362
friend std::unique_ptr< reshape3d_pointtopoint< b, p, i > > make_reshape3d_pointtopoint(typename backend::device_instance< b >::stream_type, std::vector< box3d< i > > const &, std::vector< box3d< i > > const &, reshape_algorithm, bool, MPI_Comm const)
Factory method, use to construct instances of the class.
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition heffte_reshape3d.h:354
Special case of the reshape that does not involve MPI communication but applies a transpose instead.
Definition heffte_reshape3d.h:444
reshape3d_transpose(typename backend::device_instance< location_tag >::stream_type q, pack_plan_3d< index > const cplan)
Constructor using the provided unpack plan.
Definition heffte_reshape3d.h:447
void apply(int batch_size, std::complex< double > const source[], std::complex< double > destination[], std::complex< double > workspace[]) const override final
Apply the reshape operations, double precision complex overload.
Definition heffte_reshape3d.h:467
void apply(int batch_size, float const source[], float destination[], float workspace[]) const override final
Apply the reshape operations, single precision overload.
Definition heffte_reshape3d.h:455
void apply(int batch_size, std::complex< float > const source[], std::complex< float > destination[], std::complex< float > workspace[]) const override final
Apply the reshape operations, single precision complex overload.
Definition heffte_reshape3d.h:463
void apply(int batch_size, double const source[], double destination[], double workspace[]) const override final
Apply the reshape operations, double precision overload.
Definition heffte_reshape3d.h:459
reshape_algorithm
Defines list of potential communication algorithms.
Definition heffte_plan_logic.h:48
@ alltoall
Using the MPI_Alltoall options, with padding on the data.
@ alltoallv
Using the MPI_Alltoallv options, no padding on the data (default option).
bool match(std::vector< box3d< index > > const &shape0, std::vector< box3d< index > > const &shape1)
Compares two vectors of boxes, returns true if all boxes match.
Definition heffte_geometry.h:246
int comm_rank(MPI_Comm const comm)
Returns the rank of this process within the specified comm.
Definition heffte_utils.h:79
void comm_free(MPI_Comm const comm)
Calls free on the MPI comm.
Definition heffte_utils.h:175
size_t get_workspace_size(std::array< std::unique_ptr< reshape3d_base< index > >, 4 > const &shapers)
Returns the maximum workspace size used by the shapers.
Definition heffte_reshape3d.h:115
std::unique_ptr< reshape3d_pointtopoint< location_tag, packer, index > > make_reshape3d_pointtopoint(typename backend::device_instance< location_tag >::stream_type q, std::vector< box3d< index > > const &input_boxes, std::vector< box3d< index > > const &output_boxes, reshape_algorithm algorithm, bool use_gpu_aware, MPI_Comm const comm)
Factory method that performs all the necessary work to establish the communication patterns.
std::unique_ptr< reshape3d_alltoall< location_tag, packer, index > > make_reshape3d_alltoall(typename backend::device_instance< location_tag >::stream_type q, std::vector< box3d< index > > const &input_boxes, std::vector< box3d< index > > const &output_boxes, bool uses_gpu_aware, MPI_Comm const comm)
Factory method that performs all the necessary work to establish the communication patterns.
std::unique_ptr< reshape3d_alltoallv< location_tag, packer, index > > make_reshape3d_alltoallv(typename backend::device_instance< location_tag >::stream_type q, std::vector< box3d< index > > const &input_boxes, std::vector< box3d< index > > const &output_boxes, bool use_gpu_aware, MPI_Comm const comm)
Factory method that performs all the necessary work to establish the communication patterns.
std::unique_ptr< reshape3d_base< index > > make_reshape3d(typename backend::device_instance< typename backend::buffer_traits< backend_tag >::location >::stream_type stream, std::vector< box3d< index > > const &input_boxes, std::vector< box3d< index > > const &output_boxes, MPI_Comm const comm, plan_options const options)
Factory method to create a reshape3d instance.
Definition heffte_reshape3d.h:505
void compute_overlap_map_transpose_pack(int me, int nprocs, box3d< index > const destination, std::vector< box3d< index > > const &boxes, std::vector< int > &proc, std::vector< int > &offset, std::vector< int > &sizes, std::vector< pack_plan_3d< index > > &plans)
Generates an unpack plan where the boxes and the destination do not have the same order.
Namespace containing all HeFFTe methods and classes.
Definition heffte_backend_cuda.h:38
Common data-transfer operations, must be specialized for each location (cpu/gpu).
Definition heffte_common.h:59
Holds the auxiliary variables needed by each backend.
Definition heffte_common.h:408
void * stream_type
The type for the internal stream, the cpu uses just a void pointer.
Definition heffte_common.h:420
device_instance(void *=nullptr)
Definition heffte_common.h:410
void * stream()
Definition heffte_common.h:414
A generic container that describes a 3d box of indexes.
Definition heffte_geometry.h:67
Defines the direct packer without implementation, use the specializations to get the CPU or GPU imple...
Definition heffte_pack3d.h:83
Holds the plan for a pack/unpack operation.
Definition heffte_pack3d.h:32
Defines a set of tweaks and options to use in the plan generation.
Definition heffte_plan_logic.h:131
reshape_algorithm algorithm
Defines the communication algorithm.
Definition heffte_plan_logic.h:148
bool use_gpu_aware
Defines whether to use MPI calls directly from the GPU or to move to the CPU first.
Definition heffte_plan_logic.h:152
Indicates the use of cpu backend and that all input/output data and arrays will be bound to the cpu.
Definition heffte_common.h:38
Defines the transpose packer without implementation, use the specializations to get the CPU implement...
Definition heffte_pack3d.h:116