changa/html/HostCUDA_8h_source.html

 #ifndef _HOST_CUDA_H_

 #define _HOST_CUDA_H_


 #include <cuda_runtime.h>

 #include "cuda_typedef.h"


 /* Threads-per-block constant shared by the macros below. */
 #define THREADS_PER_BLOCK 128

 #ifdef GPU_LOCAL_TREE_WALK
 /* Warp constants, only defined when GPU_LOCAL_TREE_WALK is enabled. */
 #define THREADS_PER_WARP 32
 #define WARPS_PER_BLOCK  (THREADS_PER_BLOCK / THREADS_PER_WARP)
 /* threadIdx.x divided by the warp size; same value as (threadIdx.x >> 5)
    while THREADS_PER_WARP is 32. Only the x component is considered. */
 #define WARP_INDEX       (threadIdx.x / THREADS_PER_WARP)
 #endif // GPU_LOCAL_TREE_WALK


 #ifdef CUDA_2D_TB_KERNEL
 /* Block decomposition constants, only defined when CUDA_2D_TB_KERNEL is set.
    NODES_PER_BLOCK is THREADS_PER_BLOCK divided by PARTS_PER_BLOCK. */
 #define PARTS_PER_BLOCK 16
 #define NODES_PER_BLOCK (THREADS_PER_BLOCK / PARTS_PER_BLOCK)

 /* Same decomposition with its own thread count (the *_PART set).
    NOTE(review): presumably used by the particle-interaction kernels - confirm. */
 #define THREADS_PER_BLOCK_PART 128
 #define PARTS_PER_BLOCK_PART   16
 #define NODES_PER_BLOCK_PART   (THREADS_PER_BLOCK_PART / PARTS_PER_BLOCK_PART)
 #endif // CUDA_2D_TB_KERNEL


 // FIXME - find appropriate values
 // Initial per-bucket interaction counts for moments and for particles.
 #define NUM_INIT_MOMENT_INTERACTIONS_PER_BUCKET   100
 #define NUM_INIT_PARTICLE_INTERACTIONS_PER_BUCKET 100


 /* defines for Hybrid API buffer indices */

 /* Data-manager buffers: local entries 0..2, remote entries 3..4. */
 #define LOCAL_MOMENTS             0
 #define LOCAL_PARTICLE_CORES      1
 #define LOCAL_PARTICLE_VARS       2
 #define REMOTE_MOMENTS            3
 #define REMOTE_PARTICLE_CORES     4

 /* *_IDX variants: same local numbering, but the remote entries restart at 0. */
 #define LOCAL_MOMENTS_IDX         0
 #define LOCAL_PARTICLE_CORES_IDX  1
 #define LOCAL_PARTICLE_VARS_IDX   2
 #define REMOTE_MOMENTS_IDX        0
 #define REMOTE_PARTICLE_CORES_IDX 1

 /* Particle-list request buffers. */
 #define ILPART                    0
 #define PART_BUCKET_MARKERS       1
 #define PART_BUCKET_START_MARKERS 2
 #define PART_BUCKET_SIZES         3

 /* Node(cell)-list request buffers; numbering mirrors the particle set. */
 #define ILCELL                    0
 #define NODE_BUCKET_MARKERS       1
 #define NODE_BUCKET_START_MARKERS 2
 #define NODE_BUCKET_SIZES         3

 /* *_IDX counterparts of the two sets above (identical values). */
 #define ILPART_IDX                    0
 #define PART_BUCKET_MARKERS_IDX       1
 #define PART_BUCKET_START_MARKERS_IDX 2
 #define PART_BUCKET_SIZES_IDX         3
 #define ILCELL_IDX                    0
 #define NODE_BUCKET_MARKERS_IDX       1
 #define NODE_BUCKET_START_MARKERS_IDX 2
 #define NODE_BUCKET_SIZES_IDX         3

 /* Slot 4: the buffer that follows the four list buffers above. */
 #define MISSED_MOMENTS     4
 #define MISSED_PARTS       4
 #define MISSED_MOMENTS_IDX 4
 #define MISSED_PARTS_IDX   4


 // Buffer counts per request type.

 // node moments, particle cores, particle vars
 #define DM_TRANSFER_LOCAL_NBUFFERS        3
 #define DM_TRANSFER_REMOTE_CHUNK_NBUFFERS 2

 // interaction list, list markers, bucket starts, bucket sizes
 #define TP_GRAVITY_LOCAL_NBUFFERS            4
 #define TP_GRAVITY_LOCAL_NBUFFERS_SMALLPHASE 5

 #define TP_NODE_GRAVITY_REMOTE_NBUFFERS 4
 #define TP_PART_GRAVITY_REMOTE_NBUFFERS 4

 #define TP_NODE_GRAVITY_REMOTE_RESUME_NBUFFERS 5
 #define TP_PART_GRAVITY_REMOTE_RESUME_NBUFFERS 5

 // Equal to the largest of the counts above.
 #define MAX_NBUFFERS 5


 // tp_gravity_local uses arrays of particles and nodes already allocated on the gpu
 // tp_gravity_remote uses arrays of nodes already on the gpu + particles from an array it supplies
 // tp_gravity_remote_resume uses an array each of nodes and particles it supplies
 //
 // Enumerator values are written out explicitly; they are the same consecutive
 // values (0..12) the implicit numbering produced.
 enum kernels {
   DM_TRANSFER_LOCAL                = 0,
   DM_TRANSFER_REMOTE_CHUNK         = 1,
   DM_TRANSFER_BACK                 = 2,
   DM_TRANSFER_FREE_LOCAL           = 3,
   DM_TRANSFER_FREE_REMOTE_CHUNK    = 4,
   TP_GRAVITY_LOCAL                 = 5,
   TP_GRAVITY_REMOTE                = 6,
   TP_GRAVITY_REMOTE_RESUME         = 7,
   TP_PART_GRAVITY_LOCAL            = 8,
   TP_PART_GRAVITY_LOCAL_SMALLPHASE = 9,
   TP_PART_GRAVITY_REMOTE           = 10,
   TP_PART_GRAVITY_REMOTE_RESUME    = 11,
   EWALD_KERNEL                     = 12
 };


 /** Data and parameters for requesting gravity calculations on the GPU. */
 typedef struct _CudaRequest {
   /** Can be either an ILCell* or an ILPart*. */
   void *list;
   int *bucketMarkers;
   int *bucketStarts;
   int *bucketSizes;
   int numInteractions;
   int numBucketsPlusOne;

   void *tp;
   /** Pointer to off-processor Node/Particle buffer. */
   void *missedNodes;
   void *missedParts;
   /** Size of the off-processor data buffer. */
   size_t sMissed;

   /** These buckets were finished in this work request. */
   int *affectedBuckets;
   void *cb;
   void *state;
   cudatype fperiod;

   // TODO: remove these later if we don't use COSMO_PRINT_BK.
   /** Is this a node or particle computation request? */
   bool node;
   /** Is this a remote or local computation? */
   bool remote;

 #ifdef HAPI_INSTRUMENT_WRS
   /* Extra fields present only in instrumented builds. */
   int tpIndex;
   char phase;
 #endif

 #ifdef GPU_LOCAL_TREE_WALK
   /* Extra fields present only when GPU_LOCAL_TREE_WALK is enabled. */
   int firstParticle;
   int lastParticle;
   int rootIdx;
   cosmoType theta;
   cosmoType thetaMono;
   int nReplicas;
   cudatype fperiodY;  // Support periodic boundary condition in more dimensions
   cudatype fperiodZ;  // Support periodic boundary condition in more dimensions
 #endif // GPU_LOCAL_TREE_WALK
 } CudaRequest;


 /** Parameters for the GPU gravity calculations. */
 typedef struct _ParameterStruct {
   int numInteractions;
   int numBucketsPlusOne;
   cudatype fperiod;

 #ifdef GPU_LOCAL_TREE_WALK
   /* Extra fields present only when GPU_LOCAL_TREE_WALK is enabled; they
      share their names with the GPU_LOCAL_TREE_WALK fields of CudaRequest,
      with theta/thetaMono typed as cudatype here. */
   int firstParticle;
   int lastParticle;
   int rootIdx;
   cudatype theta;
   cudatype thetaMono;
   int nReplicas;
   cudatype fperiodY;  // Support periodic boundary condition in more dimensions
   cudatype fperiodZ;  // Support periodic boundary condition in more dimensions
 #endif // GPU_LOCAL_TREE_WALK
 } ParameterStruct;


 /* Pinned host memory helpers (declarations only; defined elsewhere).
    NOTE(review): presumably the void** receives the allocated pointer and the
    size_t is the byte count - parameters are unnamed here, confirm against the
    definition. */
 void allocatePinnedHostMemory(void **, size_t);

 /* Releases memory; presumably the counterpart of allocatePinnedHostMemory - confirm. */
 void freePinnedHostMemory(void *);


 #ifdef HAPI_INSTRUMENT_WRS

 /* Instrumented (HAPI_INSTRUMENT_WRS) variant: takes three (buffer, size)
    pairs - moments, compact particles, variable particles - followed by
    mype, phase and a callback pointer. */
 void DataManagerTransferLocalTree(void *moments, size_t sMoments,
                                   void *compactParts, size_t sCompactParts,
                                   void *varParts, size_t sVarParts,
                                   int mype, char phase, void *wrCallback);

 /* Instrumented (HAPI_INSTRUMENT_WRS) variant for a remote chunk.
    `mype` previously had no type specifier, which is ill-formed in C++; it is
    declared `int` to match DataManagerTransferLocalTree.
    NOTE(review): the non-instrumented declaration of this function has no
    varParts/sVarParts pair - confirm which parameter list the definition uses. */
 void DataManagerTransferRemoteChunk(void *moments, size_t sMoments,
                                     void *compactParts, size_t sCompactParts,
                                     void *varParts, size_t sVarParts,
                                     int mype, char phase, void *wrCallback);

 /* Instrumented (HAPI_INSTRUMENT_WRS) variants: each takes the same leading
    parameters as its non-instrumented counterpart plus trailing `pe` and
    `phase` arguments. */
 void FreeDataManagerLocalTreeMemory(bool freemom, bool freepart, int pe, char phase);

 /* The first two parameters are unnamed in this declaration. */
 void FreeDataManagerRemoteChunkMemory(int , void *, bool freemom, bool freepart, int pe, char phase);

 /* The four unnamed bools occupy the positions named freemom, freepart,
    freeRemoteMom and freeRemotePart in the non-instrumented declaration. */
 void TransferParticleVarsBack(VariablePartData *hostBuffer, size_t size, void *cb, bool, bool, bool, bool, int pe, char phase);

 #else

 /* Non-instrumented variant: three (buffer, size) pairs - moments, compact
    particles, variable particles - then mype and a callback pointer. */
 void DataManagerTransferLocalTree(void *moments, size_t sMoments,

                                   void *compactParts, size_t sCompactParts,

                                   void *varParts, size_t sVarParts,

                                   int mype, void *wrCallback);

 /* Non-instrumented variant: moments and compact particles only, then a
    callback pointer (no mype, no variable-particle buffer). */
 void DataManagerTransferRemoteChunk(void *moments, size_t sMoments,

                                   void *compactParts, size_t sCompactParts,

                                   void *wrCallback);

 /* Free local-tree device memory; the two flags select moments and/or particles
    (NOTE(review): inferred from the parameter names - confirm). */
 void FreeDataManagerLocalTreeMemory(bool freemom, bool freepart);

 /* The first two parameters are unnamed in this declaration. */
 void FreeDataManagerRemoteChunkMemory(int , void *, bool freemom, bool freepart);

 /* Takes a host buffer of VariablePartData with its size, a callback pointer,
    and four free-flags (local/remote moments and particles). */
 void TransferParticleVarsBack(VariablePartData *hostBuffer, size_t size, void *cb,

     bool freemom, bool freepart, bool freeRemoteMom, bool freeRemotePart);

 #endif


 /* Cell (node) interaction-list entry points; each takes a CudaRequest. */
 void TreePieceCellListDataTransferLocal(CudaRequest *data);

 void TreePieceCellListDataTransferRemote(CudaRequest *data);

 void TreePieceCellListDataTransferRemoteResume(CudaRequest *data);


 /* Particle interaction-list entry points; each takes a CudaRequest. */
 void TreePiecePartListDataTransferLocal(CudaRequest *data);

 /* Small-phase variant: additionally receives a CompactPartData array and a
    length (NOTE(review): presumably `len` is the element count of `parts` - confirm). */
 void TreePiecePartListDataTransferLocalSmallPhase(CudaRequest *data, CompactPartData *parts, int len);

 void TreePiecePartListDataTransferRemote(CudaRequest *data);

 void TreePiecePartListDataTransferRemoteResume(CudaRequest *data);


 /* Takes only a callback pointer. */
 void DummyKernel(void *cb);


 #endif

_CudaRequest::node
bool node
is this a node or particle computation request?
Definition: HostCUDA.h:138

_ParameterStruct::fperiod
cudatype fperiod
Definition: HostCUDA.h:162

_CudaRequest::numBucketsPlusOne
int numBucketsPlusOne
Definition: HostCUDA.h:119

_ParameterStruct::numInteractions
int numInteractions
Definition: HostCUDA.h:159

_CudaRequest::bucketMarkers
int * bucketMarkers
Definition: HostCUDA.h:110

_CudaRequest::sMissed
size_t sMissed
Size of the off-processor data buffer.
Definition: HostCUDA.h:127

VariablePartData
Particle data that gets calculated by the GPU.
Definition: cuda_typedef.h:259

theta
cosmoType theta
BH-like opening criterion.
Definition: ParallelGravity.cpp:142

_CudaRequest::numInteractions
int numInteractions
Definition: HostCUDA.h:117

cudatype
float cudatype
floating point type on the GPU
Definition: cuda_typedef.h:12

_CudaRequest::fperiod
cudatype fperiod
Definition: HostCUDA.h:134

_ParameterStruct
Parameters for the GPU gravity calculations.
Definition: HostCUDA.h:158

_ParameterStruct::numBucketsPlusOne
int numBucketsPlusOne
Definition: HostCUDA.h:160

_CudaRequest::cb
void * cb
Definition: HostCUDA.h:131

_CudaRequest::missedNodes
void * missedNodes
pointer to off-processor Node/Particle buffer.
Definition: HostCUDA.h:124

CompactPartData
Particle data needed on the GPU to calculate gravity.
Definition: cuda_typedef.h:231

_CudaRequest::remote
bool remote
is this a remote or local computation?
Definition: HostCUDA.h:140

_CudaRequest::state
void * state
Definition: HostCUDA.h:132

_CudaRequest::bucketSizes
int * bucketSizes
Definition: HostCUDA.h:115

_CudaRequest
Data and parameters for requesting gravity calculations on the GPU.
Definition: HostCUDA.h:107

_CudaRequest::affectedBuckets
int * affectedBuckets
These buckets were finished in this work request.
Definition: HostCUDA.h:130

thetaMono
cosmoType thetaMono
Definition: ParallelGravity.cpp:143

cuda_typedef.h

_CudaRequest::bucketStarts
int * bucketStarts
Definition: HostCUDA.h:113

_CudaRequest::tp
void * tp
Definition: HostCUDA.h:121

_CudaRequest::list
void * list
Can be either an ILCell* or an ILPart*.
Definition: HostCUDA.h:109