changa  3.5
 All Classes Files Functions Variables Typedefs Enumerations Friends Macros Groups Pages
HostCUDA.h
1 #ifndef _HOST_CUDA_H_
2 #define _HOST_CUDA_H_
3 
4 
5 #include <cuda_runtime.h>
6 #include "cuda_typedef.h"
7 
8 #define THREADS_PER_BLOCK 128
9 
10 #ifdef GPU_LOCAL_TREE_WALK
11 #define THREADS_PER_WARP 32
12 #define WARPS_PER_BLOCK (THREADS_PER_BLOCK / THREADS_PER_WARP)
13 #define WARP_INDEX (threadIdx.x >> 5)
14 #endif //GPU_LOCAL_TREE_WALK
15 
16 #ifdef CUDA_2D_TB_KERNEL
17 #define PARTS_PER_BLOCK 16
18 #define NODES_PER_BLOCK (THREADS_PER_BLOCK/PARTS_PER_BLOCK)
19 
20 #define THREADS_PER_BLOCK_PART 128
21 #define PARTS_PER_BLOCK_PART 16
22 #define NODES_PER_BLOCK_PART (THREADS_PER_BLOCK_PART/PARTS_PER_BLOCK_PART)
23 #endif
24 
25 // FIXME - find appropriate values
26 #define NUM_INIT_MOMENT_INTERACTIONS_PER_BUCKET 100
27 #define NUM_INIT_PARTICLE_INTERACTIONS_PER_BUCKET 100
28 
29 /* defines for Hybrid API buffer indices */
30 #define LOCAL_MOMENTS 0
31 #define LOCAL_PARTICLE_CORES 1
32 #define LOCAL_PARTICLE_VARS 2
33 #define REMOTE_MOMENTS 3
34 #define REMOTE_PARTICLE_CORES 4
35 
36 #define LOCAL_MOMENTS_IDX 0
37 #define LOCAL_PARTICLE_CORES_IDX 1
38 #define LOCAL_PARTICLE_VARS_IDX 2
39 #define REMOTE_MOMENTS_IDX 0
40 #define REMOTE_PARTICLE_CORES_IDX 1
41 
42 #define ILPART 0
43 #define PART_BUCKET_MARKERS 1
44 #define PART_BUCKET_START_MARKERS 2
45 #define PART_BUCKET_SIZES 3
46 #define ILCELL 0
47 #define NODE_BUCKET_MARKERS 1
48 #define NODE_BUCKET_START_MARKERS 2
49 #define NODE_BUCKET_SIZES 3
50 
51 #define ILPART_IDX 0
52 #define PART_BUCKET_MARKERS_IDX 1
53 #define PART_BUCKET_START_MARKERS_IDX 2
54 #define PART_BUCKET_SIZES_IDX 3
55 #define ILCELL_IDX 0
56 #define NODE_BUCKET_MARKERS_IDX 1
57 #define NODE_BUCKET_START_MARKERS_IDX 2
58 #define NODE_BUCKET_SIZES_IDX 3
59 
60 #define MISSED_MOMENTS 4
61 #define MISSED_PARTS 4
62 
63 #define MISSED_MOMENTS_IDX 4
64 #define MISSED_PARTS_IDX 4
65 
66 // node moments, particle cores, particle vars
67 #define DM_TRANSFER_LOCAL_NBUFFERS 3
68 #define DM_TRANSFER_REMOTE_CHUNK_NBUFFERS 2
69 
70 // interaction list
71 // list markers
72 // bucket starts
73 // bucket sizes
74 #define TP_GRAVITY_LOCAL_NBUFFERS 4
75 #define TP_GRAVITY_LOCAL_NBUFFERS_SMALLPHASE 5
76 
77 #define TP_NODE_GRAVITY_REMOTE_NBUFFERS 4
78 #define TP_PART_GRAVITY_REMOTE_NBUFFERS 4
79 
80 #define TP_NODE_GRAVITY_REMOTE_RESUME_NBUFFERS 5
81 #define TP_PART_GRAVITY_REMOTE_RESUME_NBUFFERS 5
82 
83 #define MAX_NBUFFERS 5
84 
85 // tp_gravity_local uses arrays of particles and nodes already allocated on the gpu
86 // tp_gravity_remote uses arrays of nodes already on the gpu + particles from an array it supplies
87 // tp_gravity_remote_resume uses an array each of nodes and particles it supplies
88 enum kernels {
89  DM_TRANSFER_LOCAL=0,
90  DM_TRANSFER_REMOTE_CHUNK,
91  DM_TRANSFER_BACK,
92  DM_TRANSFER_FREE_LOCAL,
93  DM_TRANSFER_FREE_REMOTE_CHUNK,
94  TP_GRAVITY_LOCAL,
95  TP_GRAVITY_REMOTE,
96  TP_GRAVITY_REMOTE_RESUME,
97  TP_PART_GRAVITY_LOCAL,
98  TP_PART_GRAVITY_LOCAL_SMALLPHASE,
99  TP_PART_GRAVITY_REMOTE,
100  TP_PART_GRAVITY_REMOTE_RESUME,
101  EWALD_KERNEL
102 };
103 
104 
107 typedef struct _CudaRequest{
109  void *list;
115  int *bucketSizes;
121  void *tp;
123  void *missedNodes;
125  void *missedParts;
127  size_t sMissed;
128 
131  void *cb;
132  void *state;
136  // TODO: remove these later if we don't use COSMO_PRINT_BK.
138  bool node;
140  bool remote;
141 #ifdef HAPI_INSTRUMENT_WRS
142  int tpIndex;
143  char phase;
144 #endif
145 #ifdef GPU_LOCAL_TREE_WALK
146  int firstParticle;
147  int lastParticle;
148  int rootIdx;
149  cosmoType theta;
150  cosmoType thetaMono;
151  int nReplicas;
152  cudatype fperiodY; // Support periodic boundary condition in more dimensions
153  cudatype fperiodZ; // Support periodic boundary condition in more dimensions
154 #endif //GPU_LOCAL_TREE_WALK
155 }CudaRequest;
156 
158 typedef struct _ParameterStruct{
163 #ifdef GPU_LOCAL_TREE_WALK
164  int firstParticle;
165  int lastParticle;
166  int rootIdx;
167  cudatype theta;
169  int nReplicas;
170  cudatype fperiodY; // Support periodic boundary condition in more dimensions
171  cudatype fperiodZ; // Support periodic boundary condition in more dimensions
172 #endif //GPU_LOCAL_TREE_WALK
174 
175 void allocatePinnedHostMemory(void **, size_t);
176 void freePinnedHostMemory(void *);
177 
178 #ifdef HAPI_INSTRUMENT_WRS
179 void DataManagerTransferLocalTree(void *moments, size_t sMoments,
180  void *compactParts, size_t sCompactParts,
181  void *varParts, size_t sVarParts,
182  int mype, char phase, void *wrCallback);
183 void DataManagerTransferRemoteChunk(void *moments, size_t sMoments,
184  void *compactParts, size_t sCompactParts,
185  void *varParts, size_t sVarParts,
186  mype, char phase, void *wrCallback);
187 void FreeDataManagerLocalTreeMemory(bool freemom, bool freepart, int pe, char phase);
188 void FreeDataManagerRemoteChunkMemory(int , void *, bool freemom, bool freepart, int pe, char phase);
189 void TransferParticleVarsBack(VariablePartData *hostBuffer, size_t size, void *cb, bool, bool, bool, bool, int pe, char phase);
190 #else
191 void DataManagerTransferLocalTree(void *moments, size_t sMoments,
192  void *compactParts, size_t sCompactParts,
193  void *varParts, size_t sVarParts,
194  int mype, void *wrCallback);
195 void DataManagerTransferRemoteChunk(void *moments, size_t sMoments,
196  void *compactParts, size_t sCompactParts,
197  void *wrCallback);
198 void FreeDataManagerLocalTreeMemory(bool freemom, bool freepart);
199 void FreeDataManagerRemoteChunkMemory(int , void *, bool freemom, bool freepart);
211 void TransferParticleVarsBack(VariablePartData *hostBuffer, size_t size, void *cb,
212  bool freemom, bool freepart, bool freeRemoteMom, bool freeRemotePart);
213 #endif
214 
215 void TreePieceCellListDataTransferLocal(CudaRequest *data);
216 void TreePieceCellListDataTransferRemote(CudaRequest *data);
217 void TreePieceCellListDataTransferRemoteResume(CudaRequest *data);
218 
219 
220 void TreePiecePartListDataTransferLocal(CudaRequest *data);
221 void TreePiecePartListDataTransferLocalSmallPhase(CudaRequest *data, CompactPartData *parts, int len);
222 void TreePiecePartListDataTransferRemote(CudaRequest *data);
223 void TreePiecePartListDataTransferRemoteResume(CudaRequest *data);
224 
225 void DummyKernel(void *cb);
226 
227 #endif
bool node
is this a node or particle computation request?
Definition: HostCUDA.h:138
cudatype fperiod
Definition: HostCUDA.h:162
int numBucketsPlusOne
Definition: HostCUDA.h:119
int numInteractions
Definition: HostCUDA.h:159
int * bucketMarkers
Definition: HostCUDA.h:110
size_t sMissed
Size of the off-processor data buffer.
Definition: HostCUDA.h:127
Particle data that gets calculated by the GPU.
Definition: cuda_typedef.h:259
cosmoType theta
BH-like opening criterion.
Definition: ParallelGravity.cpp:142
int numInteractions
Definition: HostCUDA.h:117
float cudatype
floating point type on the GPU
Definition: cuda_typedef.h:12
cudatype fperiod
Definition: HostCUDA.h:134
Parameters for the GPU gravity calculations.
Definition: HostCUDA.h:158
int numBucketsPlusOne
Definition: HostCUDA.h:160
void * cb
Definition: HostCUDA.h:131
void * missedNodes
pointer to off-processor Node/Particle buffer.
Definition: HostCUDA.h:124
Particle data needed on the GPU to calculate gravity.
Definition: cuda_typedef.h:231
bool remote
is this a remote or local computation?
Definition: HostCUDA.h:140
void * state
Definition: HostCUDA.h:132
int * bucketSizes
Definition: HostCUDA.h:115
Data and parameters for requesting gravity calculations on the GPU.
Definition: HostCUDA.h:107
int * affectedBuckets
these buckets were finished in this work request
Definition: HostCUDA.h:130
cosmoType thetaMono
Definition: ParallelGravity.cpp:143
int * bucketStarts
Definition: HostCUDA.h:113
void * tp
Definition: HostCUDA.h:121
void * list
can either be a ILCell* or an ILPart*
Definition: HostCUDA.h:109