Skip to content

Claude OPUS - POC - AMD driver + ROCT + PM4Queue / packets #13

@johndpope

Description

@johndpope

I forgot about this code - I got claude.ai to generate it a few weeks ago in about 5 minutes.
I don't know whether it works or even compiles, so it may be garbage. I was captivated by the 10+ hours of YouTube videos and, frankly, I'm not sure this is exactly what you wanted.

Regardless, I urge you to look at Claude Opus (not GPT-4) as a way to get hacking results.
It is well versed in the AMD firmware, the drivers, and all the relevant GitHub projects (including tinygrad).

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#include <cassert>
#include <cstdint>
#include <cstdio>

#include "helpers.h"
#include "nouveau.h"
#include "ROCT-Thunk-Interface.h"
#include "PM4Queue.hpp"
#include "PM4Packet.hpp"

// PM4 queue size constant: 64 * 1024 = 65536 (64 KB).
// NOTE(review): nothing in the code visible here references this macro --
// confirm whether PM4Queue::Init() is meant to receive it.
#define ROCHSA_PM4_QUEUE_SIZE (64*1024) // 64 KB

// Raw shader machine code, stored as ten 64-bit words (five pairs).
// main() hands this array to HsaMemoryBuffer and to gpu_memcpy() for upload.
// NOTE(review): per the comment below, the words were carried over unchanged
// from the original sample; nothing here establishes that the encoding is
// valid for the target GPU -- confirm before running.
uint64_t trivial[] = {
// Trivial compute shader, same as original
0x00005a00ff057624, 0x000fe200078e00ff,

0x0000580000027a02, 0x000fe20000000f00,
0x0000590000037a02, 0x000fca0000000f00,
0x0000000502007986, 0x000fe2000c101904,
0x000000000000794d, 0x000fea0003800000,
};

// Prepares the PM4 queue for use by calling its Init() method.
//
// pQueue must be non-null: it is dereferenced unconditionally.
void gpu_setup(PM4Queue* pQueue) {
    PM4Queue& queue = *pQueue;
    queue.Init();
}

// Submits one PM4 DMA-data packet asking for `len` units to be copied from
// `src` to the address `dst`.
//
// pQueue - queue that receives the packet; must be non-null.
// dst    - destination address placed in the packet.
// src    - source pointer placed in the packet.
// len    - transfer length; main() passes a sizeof() value, so presumably
//          bytes. Must be a multiple of 4 (asserted).
void gpu_memcpy(PM4Queue* pQueue, uint64_t dst, const uint32_t *src, int len) {
    // Only whole 4-unit multiples are accepted.
    assert((len % 4) == 0);

    // The whole copy is expressed as a single DMA packet on the queue.
    PM4Queue& queue = *pQueue;
    queue.PlaceAndSubmitPacket(PM4DmaDataPacket(dst, src, len));
}

// Programs the compute registers for a single 1x1x1 dispatch of the shader at
// `shader_addr`, submits the dispatch, submits a release-memory packet that
// targets `cb_addr` with the value 0xC0FFEE, and then calls
// Wait4PacketConsumption() on the queue.
//
// pQueue      - PM4 queue that receives every packet; must be non-null.
// shader_addr - 64-bit shader address, split into the PGM_LO / PGM_HI words.
// cb_addr     - 64-bit address split into CB1_BASE_LO / CB1_BASE_HI; also the
//               address given to the release-memory packet.
// cb_len      - value written as CB1_SIZE; must be non-negative (asserted).
void gpu_compute(PM4Queue* pQueue, uint64_t shader_addr, uint64_t cb_addr, int cb_len) {
    // A negative size would silently wrap to a huge unsigned register value.
    assert(cb_len >= 0);

    // Set up registers
    const unsigned int COMPUTE_PGM_VALUES[] = {
        static_cast<uint32_t>(shader_addr),       // PGM_LO
        static_cast<uint32_t>(shader_addr >> 32)  // PGM_HI
    };

    const unsigned int COMPUTE_PGM_RSRC1[] = { 0x000c0084 }; // Same as original

    const unsigned int COMPUTE_DISPATCH_DIMENSIONS[] = {
        1, 1, 1, // THREADS_X/Y/Z
        1, 1, 1, // GROUPS_X/Y/Z

        0, 0     // PIPELINESTAT/PERFCOUNT
    };

    const unsigned int COMPUTE_USER_DATA[] = {
        static_cast<uint32_t>(cb_addr),       // CB1_BASE_LO
        static_cast<uint32_t>(cb_addr >> 32), // CB1_BASE_HI

        // cb_len is a signed int; list-initialization forbids the implicit
        // (narrowing) conversion to unsigned, so convert explicitly.
        static_cast<uint32_t>(cb_len),        // CB1_SIZE
        1                                     // CB1_VALID
    };

    // Configure shader registers: each packet writes a run of consecutive
    // registers starting at the named one, one register per array element.
    pQueue->PlaceAndSubmitPacket(
        PM4SetShaderRegPacket(mmCOMPUTE_PGM_LO, COMPUTE_PGM_VALUES,
                              sizeof(COMPUTE_PGM_VALUES)/sizeof(COMPUTE_PGM_VALUES[0])));

    pQueue->PlaceAndSubmitPacket(
        PM4SetShaderRegPacket(mmCOMPUTE_PGM_RSRC1, COMPUTE_PGM_RSRC1,
                              sizeof(COMPUTE_PGM_RSRC1)/sizeof(COMPUTE_PGM_RSRC1[0])));

    // NOTE(review): eight words are written starting at COMPUTE_NUM_THREAD_X;
    // confirm the register layout really places the GROUPS_* and
    // PIPELINESTAT/PERFCOUNT registers directly after NUM_THREAD_Z.
    pQueue->PlaceAndSubmitPacket(
        PM4SetShaderRegPacket(mmCOMPUTE_NUM_THREAD_X, COMPUTE_DISPATCH_DIMENSIONS,
                              sizeof(COMPUTE_DISPATCH_DIMENSIONS)/sizeof(COMPUTE_DISPATCH_DIMENSIONS[0])));

    pQueue->PlaceAndSubmitPacket(
        PM4SetShaderRegPacket(mmCOMPUTE_USER_DATA_0, COMPUTE_USER_DATA,
                              sizeof(COMPUTE_USER_DATA)/sizeof(COMPUTE_USER_DATA[0])));

    // Dispatch the compute shader
    pQueue->PlaceAndSubmitPacket(PM4DispatchDirectPacket(1, 1, 1));

    // Wait for shader completion
    pQueue->PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(true, cb_addr, 0xC0FFEE));
    pQueue->Wait4PacketConsumption();
}

// Entry point of the proof of concept: maps one page of /dev/mem, initializes
// a PM4 queue, uploads the `trivial` shader words with gpu_memcpy(), runs the
// shader once with gpu_compute(), and releases what it acquired.
//
// Returns 0 on success and 1 if /dev/mem cannot be opened or mapped.
int main() {
    PM4Queue queue;
    HsaMemoryBuffer isaBuf(trivial, sizeof(trivial), PAGE_SIZE, false);

    // Map and initialize GPU resources.
    // The descriptor is held in a variable so that a failed open() is
    // detected and the descriptor is closed instead of leaked.
    const int mem_fd = open("/dev/mem", O_RDWR);
    if (mem_fd < 0) {
        std::perror("open(/dev/mem)");
        return 1;
    }

    void* gpu_mmio_ptr = mmap(nullptr, PAGE_SIZE, PROT_READ|PROT_WRITE,
                              MAP_SHARED, mem_fd, 0);
    // A mapping stays valid after its descriptor is closed.
    close(mem_fd);
    if (gpu_mmio_ptr == MAP_FAILED) {
        std::perror("mmap(/dev/mem)");
        return 1;
    }

    // Size associated with the GPU allocation when it is released.
    const uint64_t sizeGpuMem = PAGE_SIZE;

    // TODO: allocate with hsaKmtAllocMemory(); until then the address is 0
    // and no GPU memory is actually owned by this program.
    uint64_t gpu_local_mem = 0;
    // NOTE(review): the constant buffer shares the shader's address, so the
    // release-memory write in gpu_compute() targets the shader itself --
    // confirm this is intended.
    uint64_t cb_gpu_addr = gpu_local_mem;

    // Set up the queue
    gpu_setup(&queue);

    // Copy shader code to GPU memory.
    // `trivial` holds 64-bit words while gpu_memcpy() takes a pointer to
    // 32-bit words; C++ has no implicit conversion between the two pointer
    // types, so the reinterpretation is spelled out.
    gpu_memcpy(&queue, gpu_local_mem,
               reinterpret_cast<const uint32_t*>(trivial), sizeof(trivial));

    // Run the shader
    gpu_compute(&queue, gpu_local_mem, cb_gpu_addr, 16);

    // Clean up
    munmap(gpu_mmio_ptr, PAGE_SIZE);
    // Only release GPU memory that was really allocated; the address is
    // converted because hsaKmtFreeMemory() is assumed to take a pointer, as
    // in the public thunk API -- confirm against the project's header.
    if (gpu_local_mem != 0) {
        hsaKmtFreeMemory(
            reinterpret_cast<void*>(static_cast<uintptr_t>(gpu_local_mem)),
            sizeGpuMem);
    }

    return 0;
}

geohot/cuda_ioctl_sniffer#5

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions