-
Notifications
You must be signed in to change notification settings - Fork 31
Open
Description
I forgot about this code - I got claude.ai to generate it a few weeks ago in about 5 minutes.
I don't know whether it works or even compiles, so it may be garbage. I was captivated by the 10+ hours of YouTube videos and, frankly, I'm not sure this is exactly what you wanted.
Regardless, I beseech you to look at Claude Opus (not GPT-4) as a route to getting hacking results.
It's well abreast of the AMD firmware, drivers, and all GitHub projects (including tinygrad).
#include <unistd.h>

#include <cstdio>

#include "helpers.h"
#include "nouveau.h"
#include "ROCT-Thunk-Interface.h"
#include "PM4Queue.hpp"
#include "PM4Packet.hpp"
// Presumably the intended size of the PM4 command queue, in bytes.
// NOTE(review): nothing in the visible code references this macro — confirm it is still needed.
#define ROCHSA_PM4_QUEUE_SIZE (64*1024) // 64 KB
// Raw shader machine code, stored as ten 64-bit words.
// main() hands this array to an HsaMemoryBuffer and to gpu_memcpy(), then passes the
// destination address to gpu_compute() as the shader address.
// NOTE(review): the encoding has not been checked against the target GPU's ISA. This file
// includes both nouveau and ROCm thunk headers, so confirm which hardware these words are for.
uint64_t trivial[] = {
// Trivial compute shader, same as original
0x00005a00ff057624, 0x000fe200078e00ff,
0x0000580000027a02, 0x000fe20000000f00,
0x0000590000037a02, 0x000fca0000000f00,
0x0000000502007986, 0x000fe2000c101904,
0x000000000000794d, 0x000fea0003800000,
};
// Prepares the command queue for use by invoking its Init() method.
// pQueue is dereferenced unconditionally, so it must point to a live PM4Queue.
void gpu_setup(PM4Queue* pQueue) {
  PM4Queue& queue = *pQueue;
  queue.Init();
}
// Submits a single PM4DmaDataPacket built from (dst, src, len) on the given queue.
//
// pQueue: queue the packet is placed on and submitted through; must not be null.
// dst:    destination address handed to the packet.
// src:    source pointer handed to the packet.
// len:    transfer length, presumably in bytes (main() passes sizeof of the source
//         array); must be non-negative and a multiple of 4.
//
// The function returns as soon as the packet is submitted; it does not wait for the
// queue to consume it.
void gpu_memcpy(PM4Queue* pQueue, uint64_t dst, const uint32_t *src, int len) {
  // `len % 4 == 0` alone also holds for negative values such as -4, which would be
  // forwarded to the packet as a bogus length, so the sign is checked explicitly.
  assert(len >= 0);
  assert(len % 4 == 0);
  // Use PM4 DMA packet to do the memcpy
  pQueue->PlaceAndSubmitPacket(PM4DmaDataPacket(dst, src, len));
}
// Programs the compute registers, dispatches a 1x1x1 compute job, and blocks until the
// queue has consumed every submitted packet.
//
// pQueue:      queue all packets are placed on and submitted through; must not be null.
// shader_addr: 64-bit shader address, split into low/high halves for PGM_LO / PGM_HI.
// cb_addr:     64-bit constant-buffer address, split into low/high halves for user data.
// cb_len:      constant-buffer size written as the third user-data value; must be
//              non-negative.
//
// NOTE(review): the register layouts below (eight consecutive values starting at
// mmCOMPUTE_NUM_THREAD_X, four starting at mmCOMPUTE_USER_DATA_0) are taken as-is from
// the original code — confirm them against the hardware register reference.
void gpu_compute(PM4Queue* pQueue, uint64_t shader_addr, uint64_t cb_addr, int cb_len) {
  // A negative length would silently wrap to a huge unsigned size below.
  assert(cb_len >= 0);

  // Set up registers
  const unsigned int COMPUTE_PGM_VALUES[] = {
    static_cast<uint32_t>(shader_addr),       // PGM_LO
    static_cast<uint32_t>(shader_addr >> 32)  // PGM_HI
  };
  const unsigned int COMPUTE_PGM_RSRC1[] = { 0x000c0084 }; // Same as original
  const unsigned int COMPUTE_DISPATCH_DIMENSIONS[] = {
    1, 1, 1, // THREADS_X/Y/Z
    1, 1, 1, // GROUPS_X/Y/Z
    0, 0     // PIPELINESTAT/PERFCOUNT
  };
  const unsigned int COMPUTE_USER_DATA[] = {
    static_cast<uint32_t>(cb_addr),        // CB1_BASE_LO
    static_cast<uint32_t>(cb_addr >> 32),  // CB1_BASE_HI
    // Explicit cast: int -> unsigned int inside a braced initializer is a narrowing
    // conversion, which list-initialization rejects.
    static_cast<uint32_t>(cb_len),         // CB1_SIZE
    1                                      // CB1_VALID
  };

  // Configure shader registers
  pQueue->PlaceAndSubmitPacket(
      PM4SetShaderRegPacket(mmCOMPUTE_PGM_LO, COMPUTE_PGM_VALUES,
                            sizeof(COMPUTE_PGM_VALUES)/sizeof(COMPUTE_PGM_VALUES[0])));
  pQueue->PlaceAndSubmitPacket(
      PM4SetShaderRegPacket(mmCOMPUTE_PGM_RSRC1, COMPUTE_PGM_RSRC1,
                            sizeof(COMPUTE_PGM_RSRC1)/sizeof(COMPUTE_PGM_RSRC1[0])));
  pQueue->PlaceAndSubmitPacket(
      PM4SetShaderRegPacket(mmCOMPUTE_NUM_THREAD_X, COMPUTE_DISPATCH_DIMENSIONS,
                            sizeof(COMPUTE_DISPATCH_DIMENSIONS)/sizeof(COMPUTE_DISPATCH_DIMENSIONS[0])));
  pQueue->PlaceAndSubmitPacket(
      PM4SetShaderRegPacket(mmCOMPUTE_USER_DATA_0, COMPUTE_USER_DATA,
                            sizeof(COMPUTE_USER_DATA)/sizeof(COMPUTE_USER_DATA[0])));

  // Dispatch the compute shader
  pQueue->PlaceAndSubmitPacket(PM4DispatchDirectPacket(1, 1, 1));

  // Wait for shader completion: submit a release-memory packet carrying the marker
  // value 0xC0FFEE, then block until the queue has consumed everything submitted.
  // NOTE(review): the packet targets cb_addr, the same address passed in as the constant
  // buffer — confirm that overwriting it with the marker is intended.
  pQueue->PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(true, cb_addr, 0xC0FFEE));
  pQueue->Wait4PacketConsumption();
}
// Entry point: maps one page of /dev/mem, initializes the PM4 queue, copies the
// `trivial` shader words to the GPU address, launches the shader, and cleans up.
//
// Returns 0 on success and 1 if /dev/mem cannot be opened or mapped.
//
// NOTE(review): gpu_local_mem is still a placeholder (0). Until the allocation TODO
// below is implemented, the copy and dispatch target GPU address 0.
int main() {
  PM4Queue queue;
  // NOTE(review): isaBuf is constructed but never used afterwards — confirm whether the
  // shader should be launched from this buffer instead of gpu_local_mem.
  HsaMemoryBuffer isaBuf(trivial, sizeof(trivial), PAGE_SIZE, false);

  // Map and initialize GPU resources
  const int mem_fd = open("/dev/mem", O_RDWR);
  if (mem_fd < 0) {
    perror("open /dev/mem");
    return 1;
  }
  void* gpu_mmio_ptr = mmap(nullptr, PAGE_SIZE, PROT_READ | PROT_WRITE,
                            MAP_SHARED, mem_fd, 0);
  if (gpu_mmio_ptr == MAP_FAILED) {
    perror("mmap /dev/mem");
    close(mem_fd);
    return 1;
  }
  // The mapping stays valid after the descriptor is closed, so release it right away
  // instead of leaking it for the lifetime of the process.
  close(mem_fd);

  // Placeholder size for the GPU allocation that is freed during cleanup.
  const uint64_t sizeGpuMem = PAGE_SIZE;
  uint64_t gpu_local_mem = 0; // TODO: allocate with hsaKmtAllocMemory()
  // NOTE(review): the constant buffer aliases the shader code address — confirm intent.
  uint64_t cb_gpu_addr = gpu_local_mem;

  // Set up the queue
  gpu_setup(&queue);

  // Copy shader code to GPU memory. `trivial` is an array of 64-bit words while
  // gpu_memcpy() takes a 32-bit word pointer, so the pointer is reinterpreted explicitly;
  // sizeof(trivial) is a small compile-time constant that fits in int.
  gpu_memcpy(&queue, gpu_local_mem, reinterpret_cast<const uint32_t*>(trivial),
             static_cast<int>(sizeof(trivial)));

  // Run the shader
  gpu_compute(&queue, gpu_local_mem, cb_gpu_addr, 16);

  // Clean up
  munmap(gpu_mmio_ptr, PAGE_SIZE);
  // Only free memory that was actually allocated; the thunk API takes a pointer.
  if (gpu_local_mem != 0) {
    hsaKmtFreeMemory(reinterpret_cast<void*>(gpu_local_mem), sizeGpuMem);
  }
  return 0;
}
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels