5 #include <cuda_runtime.h>
8 #define THREADS_PER_BLOCK 128
10 #ifdef GPU_LOCAL_TREE_WALK
11 #define THREADS_PER_WARP 32
12 #define WARPS_PER_BLOCK (THREADS_PER_BLOCK / THREADS_PER_WARP)
13 #define WARP_INDEX (threadIdx.x >> 5)
14 #endif //GPU_LOCAL_TREE_WALK
16 #ifdef CUDA_2D_TB_KERNEL
17 #define PARTS_PER_BLOCK 16
18 #define NODES_PER_BLOCK (THREADS_PER_BLOCK/PARTS_PER_BLOCK)
20 #define THREADS_PER_BLOCK_PART 128
21 #define PARTS_PER_BLOCK_PART 16
22 #define NODES_PER_BLOCK_PART (THREADS_PER_BLOCK_PART/PARTS_PER_BLOCK_PART)
26 #define NUM_INIT_MOMENT_INTERACTIONS_PER_BUCKET 100
27 #define NUM_INIT_PARTICLE_INTERACTIONS_PER_BUCKET 100
30 #define LOCAL_MOMENTS 0
31 #define LOCAL_PARTICLE_CORES 1
32 #define LOCAL_PARTICLE_VARS 2
33 #define REMOTE_MOMENTS 3
34 #define REMOTE_PARTICLE_CORES 4
36 #define LOCAL_MOMENTS_IDX 0
37 #define LOCAL_PARTICLE_CORES_IDX 1
38 #define LOCAL_PARTICLE_VARS_IDX 2
39 #define REMOTE_MOMENTS_IDX 0
40 #define REMOTE_PARTICLE_CORES_IDX 1
43 #define PART_BUCKET_MARKERS 1
44 #define PART_BUCKET_START_MARKERS 2
45 #define PART_BUCKET_SIZES 3
47 #define NODE_BUCKET_MARKERS 1
48 #define NODE_BUCKET_START_MARKERS 2
49 #define NODE_BUCKET_SIZES 3
52 #define PART_BUCKET_MARKERS_IDX 1
53 #define PART_BUCKET_START_MARKERS_IDX 2
54 #define PART_BUCKET_SIZES_IDX 3
56 #define NODE_BUCKET_MARKERS_IDX 1
57 #define NODE_BUCKET_START_MARKERS_IDX 2
58 #define NODE_BUCKET_SIZES_IDX 3
60 #define MISSED_MOMENTS 4
61 #define MISSED_PARTS 4
63 #define MISSED_MOMENTS_IDX 4
64 #define MISSED_PARTS_IDX 4
67 #define DM_TRANSFER_LOCAL_NBUFFERS 3
68 #define DM_TRANSFER_REMOTE_CHUNK_NBUFFERS 2
74 #define TP_GRAVITY_LOCAL_NBUFFERS 4
75 #define TP_GRAVITY_LOCAL_NBUFFERS_SMALLPHASE 5
77 #define TP_NODE_GRAVITY_REMOTE_NBUFFERS 4
78 #define TP_PART_GRAVITY_REMOTE_NBUFFERS 4
80 #define TP_NODE_GRAVITY_REMOTE_RESUME_NBUFFERS 5
81 #define TP_PART_GRAVITY_REMOTE_RESUME_NBUFFERS 5
83 #define MAX_NBUFFERS 5
90 DM_TRANSFER_REMOTE_CHUNK,
92 DM_TRANSFER_FREE_LOCAL,
93 DM_TRANSFER_FREE_REMOTE_CHUNK,
96 TP_GRAVITY_REMOTE_RESUME,
97 TP_PART_GRAVITY_LOCAL,
98 TP_PART_GRAVITY_LOCAL_SMALLPHASE,
99 TP_PART_GRAVITY_REMOTE,
100 TP_PART_GRAVITY_REMOTE_RESUME,
141 #ifdef HAPI_INSTRUMENT_WRS
145 #ifdef GPU_LOCAL_TREE_WALK
154 #endif //GPU_LOCAL_TREE_WALK
163 #ifdef GPU_LOCAL_TREE_WALK
172 #endif //GPU_LOCAL_TREE_WALK
175 void allocatePinnedHostMemory(
void **,
size_t);
176 void freePinnedHostMemory(
void *);
178 #ifdef HAPI_INSTRUMENT_WRS
179 void DataManagerTransferLocalTree(
void *moments,
size_t sMoments,
180 void *compactParts,
size_t sCompactParts,
181 void *varParts,
size_t sVarParts,
182 int mype,
char phase,
void *wrCallback);
183 void DataManagerTransferRemoteChunk(
void *moments,
size_t sMoments,
184 void *compactParts,
size_t sCompactParts,
185 void *varParts,
size_t sVarParts,
186 mype,
char phase,
void *wrCallback);
187 void FreeDataManagerLocalTreeMemory(
bool freemom,
bool freepart,
int pe,
char phase);
188 void FreeDataManagerRemoteChunkMemory(
int ,
void *,
bool freemom,
bool freepart,
int pe,
char phase);
189 void TransferParticleVarsBack(
VariablePartData *hostBuffer,
size_t size,
void *cb,
bool,
bool,
bool,
bool,
int pe,
char phase);
191 void DataManagerTransferLocalTree(
void *moments,
size_t sMoments,
192 void *compactParts,
size_t sCompactParts,
193 void *varParts,
size_t sVarParts,
194 int mype,
void *wrCallback);
195 void DataManagerTransferRemoteChunk(
void *moments,
size_t sMoments,
196 void *compactParts,
size_t sCompactParts,
198 void FreeDataManagerLocalTreeMemory(
bool freemom,
bool freepart);
199 void FreeDataManagerRemoteChunkMemory(
int ,
void *,
bool freemom,
bool freepart);
211 void TransferParticleVarsBack(
VariablePartData *hostBuffer,
size_t size,
void *cb,
212 bool freemom,
bool freepart,
bool freeRemoteMom,
bool freeRemotePart);
215 void TreePieceCellListDataTransferLocal(
CudaRequest *data);
216 void TreePieceCellListDataTransferRemote(
CudaRequest *data);
217 void TreePieceCellListDataTransferRemoteResume(
CudaRequest *data);
220 void TreePiecePartListDataTransferLocal(
CudaRequest *data);
222 void TreePiecePartListDataTransferRemote(
CudaRequest *data);
223 void TreePiecePartListDataTransferRemoteResume(
CudaRequest *data);
225 void DummyKernel(
void *cb);
bool node
is this a node or particle computation request?
Definition: HostCUDA.h:138
cudatype fperiod
Definition: HostCUDA.h:162
int numBucketsPlusOne
Definition: HostCUDA.h:119
int numInteractions
Definition: HostCUDA.h:159
int * bucketMarkers
Definition: HostCUDA.h:110
size_t sMissed
Size of the off-processor data buffer.
Definition: HostCUDA.h:127
Particle data that gets calculated by the GPU.
Definition: cuda_typedef.h:259
cosmoType theta
BH-like opening criterion.
Definition: ParallelGravity.cpp:142
int numInteractions
Definition: HostCUDA.h:117
float cudatype
floating point type on the GPU
Definition: cuda_typedef.h:12
cudatype fperiod
Definition: HostCUDA.h:134
Parameters for the GPU gravity calculations.
Definition: HostCUDA.h:158
int numBucketsPlusOne
Definition: HostCUDA.h:160
void * cb
Definition: HostCUDA.h:131
void * missedNodes
pointer to off-processor Node/Particle buffer.
Definition: HostCUDA.h:124
Particle data needed on the GPU to calculate gravity.
Definition: cuda_typedef.h:231
bool remote
is this a remote or local computation?
Definition: HostCUDA.h:140
void * state
Definition: HostCUDA.h:132
int * bucketSizes
Definition: HostCUDA.h:115
Data and parameters for requesting gravity calculations on the GPU.
Definition: HostCUDA.h:107
int * affectedBuckets
these buckets were finished in this work request
Definition: HostCUDA.h:130
cosmoType thetaMono
Definition: ParallelGravity.cpp:143
int * bucketStarts
Definition: HostCUDA.h:113
void * tp
Definition: HostCUDA.h:121
void * list
can either be a ILCell* or an ILPart*
Definition: HostCUDA.h:109