-
Notifications
You must be signed in to change notification settings - Fork 45
[CRL] Add unified reduction kernel implementation and uniRunner for demo 2-rank AllReduce #315
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
9fdadc7 to
027e885
Compare
|
|
||
| // (6) set completion flag | ||
| __syncthreads(); | ||
| FLAGCX_DEVICE_THREAD_FENCE(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
move FLAGCX_DEVICE_THREAD_FENCE() after t->setComplete()
70003ed to
642accd
Compare
…hile loop in RedKernel
b3d8a81 to
425bb29
Compare
flagcx/core/transport.cc
Outdated
| #define ENABLE_TIMER 0 | ||
| #include "timer.h" | ||
|
|
||
| FLAGCX_PARAM(RunUniRunnerAllReduce, "RUN_UNIRUNNER_ALLREDUCE", 0); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FLAGCX_P2P_DISABLE?
| } | ||
|
|
||
| FLAGCX_HOST_DECORATOR uint64_t flagcxReduceTrigger::pollState() { | ||
| uint64_t curr_val = __atomic_load_n(&this->value[3], __ATOMIC_ACQUIRE); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
curr_val -> currVal
|
|
||
| FLAGCX_HOST_DECORATOR void flagcxReduceTrigger::setState(int state) { | ||
| uint64_t curr_val = __atomic_load_n(&this->value[3], __ATOMIC_ACQUIRE); | ||
| curr_val &= ~(flagcxTriggerMask(flagcxReduceTriggerBitsState) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same as above
flagcx/core/transport.cc
Outdated
| FLAGCX_PARAM(P2pDisable, "P2P_DISABLE", 0); | ||
|
|
||
| static inline bool isSameNode(struct flagcxHeteroComm *comm, int peer) { | ||
| // force use network transport for unirunner allreduce |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// force use net transport
| FLAGCX_DEVICE_INLINE_DECORATOR flagcxResult_t dequeue(volatile uint64_t *buffer, | ||
| int *idx) { | ||
| while (true) { | ||
| unsigned long long int old_c = *(buffer + 1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
old_c -> oldConsumed
cur_p -> curProduced
| uint64_t nthreads, uint64_t datatype, uint64_t redOp) { | ||
| // to be implemented by vendors | ||
| int tid = threadIdx.x; | ||
| float *fst_ptr = (float *)fst; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fst_ptr -> fstPtr
| float *snd_ptr = (float *)snd; | ||
| float *out_ptr = (float *)out; | ||
| for (int i = tid; i < count; i += nthreads) { | ||
| out_ptr[i] = fst_ptr[i] + snd_ptr[i]; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same as above
|
|
||
| FLAGCX_GLOBAL_DECORATOR void flagcxCollectiveKernel(void *fifoBuffer) { | ||
| volatile uint64_t *vBuf = (volatile uint64_t *)fifoBuffer; | ||
| int empty_iter = 0; // backoff counter |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
empty_iter -> emptyIter
| uint64_t redop; | ||
| int slot = myIdx & (*vBuf - 1); | ||
| if (tid == 0) { | ||
| // printf("block %d get work idx %d, slot %d\n", blockIdx.x, myIdx, slot); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove this line
| t->setComplete(); | ||
| } | ||
| } | ||
| // FLAGCX_DEVICE_THREAD_FENCE(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove this line
MC952-arch
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
mikethegoblin
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
PR Category
CRL
PR Types
New Features
PR Description