diff --git a/.dockerignore b/.dockerignore index 814e90a9..3f83d039 100644 --- a/.dockerignore +++ b/.dockerignore @@ -12,4 +12,5 @@ helm-charts .editorconfig .idea coverage* -# adopted from https://bun.sh/guides/ecosystem/docker \ No newline at end of file +# adopted from https://bun.sh/guides/ecosystem/docker +.env.development diff --git a/.env.development.ignore b/.env.development.ignore new file mode 120000 index 00000000..76bd22af --- /dev/null +++ b/.env.development.ignore @@ -0,0 +1 @@ +/home/philip/repos/kth-jobb/devenv/.env.development \ No newline at end of file diff --git a/.github/workflows/ts-format-validate.yml b/.github/workflows/ts-format-validate.yml index 04303c0a..b616dacd 100644 --- a/.github/workflows/ts-format-validate.yml +++ b/.github/workflows/ts-format-validate.yml @@ -7,17 +7,17 @@ jobs: runs-on: ubuntu-latest steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Setup Bun - uses: oven-sh/setup-bun@v1 + - name: Setup Bun + uses: oven-sh/setup-bun@v1 - - name: Install Prettier - run: bun install + - name: Install Prettier + run: bun install - - name: Check for formatting errors - run: bun run format-check - - - name: Check for TS errors - run: bun run tsc + - name: Check for formatting errors + run: bun run format-check + + - name: Check for TS errors + run: bun run tsc --noEmit diff --git a/.gitignore b/.gitignore index 86c7bf2e..32a8605f 100644 --- a/.gitignore +++ b/.gitignore @@ -25,9 +25,10 @@ yarn-error.log* # .env.dev +.env.development # ide .vscode # npm package-lock.json (we use Bun) -package-lock.json \ No newline at end of file +package-lock.json diff --git a/bun.lockb b/bun.lockb index 29e9ecf8..33c2acc0 100755 Binary files a/bun.lockb and b/bun.lockb differ diff --git a/package.json b/package.json index b8ba10cd..1a5309f1 100644 --- a/package.json +++ b/package.json @@ -9,70 +9,72 @@ "dev": "vite --port 3000", "build": "vite build", "format": "prettier --write .", - "format-check": "prettier --check ." + "format-check": "prettier --check .", + "check": "prettier --check . && tsc --noEmit" }, "dependencies": { - "@emotion/react": "^11.11.4", - "@emotion/styled": "^11.11.5", + "@emotion/react": "^11.14.0", + "@emotion/styled": "^11.14.1", "@iconify/react": "^4.1.1", - "@kthcloud/go-deploy-types": "^1.0.24", - "@mui/icons-material": "^5.15.20", - "@mui/lab": "^5.0.0-alpha.170", - "@mui/material": "^5.15.20", + "@kthcloud/go-deploy-types": "^1.0.25", + "@mui/icons-material": "^5.18.0", + "@mui/lab": "^5.0.0-alpha.177", + "@mui/material": "^5.18.0", "@mui/material-next": "^6.0.0-alpha.126", - "@mui/x-tree-view": "^7.7.0", + "@mui/x-tree-view": "^7.29.10", "@react-keycloak/web": "^3.4.0", - "@react-three/fiber": "^8.16.8", + "@react-three/fiber": "^8.18.0", "@sanity/eventsource": "^5.0.2", "@types/crypto-js": "^4.2.2", + "@types/lodash": "^4.17.23", "@types/numeral": "^2.0.5", "@types/punycode": "^2.1.4", "@types/three": "^0.164.1", - "apexcharts": "^3.49.1", - "bun": "^1.2.22", + "apexcharts": "^3.54.1", + "bun": "^1.3.9", "change-case": "^5.4.4", "crypto-js": "^4.2.0", "http-status-codes": "^2.3.0", - "i18next": "^23.11.5", - "i18next-browser-languagedetector": "^7.2.1", - "js-base64": "^3.7.7", + "i18next": "^23.16.8", + "i18next-browser-languagedetector": "^7.2.2", + "js-base64": "^3.7.8", "keycloak-js": "^24.0.5", - "lodash": "^4.17.21", + "lodash": "^4.17.23", "million": "latest", - "notistack": "^3.0.1", + "notistack": "^3.0.2", "numeral": "^2.0.6", "punycode": "^2.3.1", "react": "^18.3.1", - "react-apexcharts": "^1.4.1", - "react-cookie": "^7.1.4", + "react-apexcharts": "^1.9.0", + "react-cookie": "^7.2.2", "react-copy-to-clipboard": "^5.1.0", "react-dom": "^18.3.1", "react-helmet-async": "^2.0.5", - "react-i18next": "^14.1.2", - "react-router-dom": "^6.23.1", - "simplebar": "^6.2.7", - "simplebar-react": "^3.2.6", + "react-i18next": "^14.1.3", + "react-router-dom": "^6.30.3", + "simplebar": "^6.3.3", + "simplebar-react": "^3.3.2", "three": "^0.164.1", - "three-stdlib": "^2.30.3", - "yaml": "^2.4.5" + "three-stdlib": "^2.36.1", + "yaml": "^2.8.2" }, "devDependencies": { "@faker-js/faker": "^8.4.1", - "@types/bun": "^1.1.17", - "@types/react": "^18.3.3", + "@types/bun": "^1.3.9", + "@types/react": "^18.3.28", "@types/react-copy-to-clipboard": "^5.0.7", - "@types/react-dom": "^18.3.0", - "@typescript-eslint/eslint-plugin": "^7.13.0", - "@typescript-eslint/parser": "^7.13.0", - "@vitejs/plugin-react": "^4.3.1", - "@vitejs/plugin-react-swc": "^3.7.0", - "eslint": "^9.35.0", - "eslint-plugin-react": "^7.34.2", + "@types/react-dom": "^18.3.7", + "@typescript-eslint/eslint-plugin": "^7.18.0", + "@typescript-eslint/parser": "^7.18.0", + "@vitejs/plugin-react": "^4.7.0", + "@vitejs/plugin-react-swc": "^3.11.0", + "eslint": "^9.39.2", + "eslint-plugin-react": "^7.37.5", "eslint-plugin-react-hooks": "^4.6.2", - "eslint-plugin-react-refresh": "^0.4.7", - "prettier": "^3.6.2", + "eslint-plugin-react-refresh": "^0.4.26", + "prettier": "^3.8.1", "prettier-plugin-nginx": "^1.0.3", - "typescript": "^5.4.5", - "vite": "^7.1.6" + "typescript": "^5.9.3", + "vite": "^7.3.1" } } diff --git a/src/api/deploy/gpuClaims.ts b/src/api/deploy/gpuClaims.ts new file mode 100644 index 00000000..e0c4c33d --- /dev/null +++ b/src/api/deploy/gpuClaims.ts @@ -0,0 +1,73 @@ +import { GpuClaimCreate } from "../../temporaryTypesRemoveMe"; +import { Jwt } from "../../types"; + +export const listGpuClaims = async (token: Jwt, detailed?: boolean) => { + const detailedQuery = detailed + ? `?detailed=${encodeURIComponent(detailed)}` + : ""; + const url = `${import.meta.env.VITE_DEPLOY_API_URL}/gpuClaims${detailedQuery}`; + const response = await fetch(url, { + method: "GET", + headers: { + Authorization: `Bearer ${token}`, + }, + }); + + const result = await response.json(); + + if (!Array.isArray(result)) { + throw new Error("Error listing GPU claims, response was not an array"); + } + + result.sort((a: any, b: any) => { + return a.id < b.id ? -1 : 1; + }); + + return result; +}; + +export const getGpuClaim = async (token: Jwt, gpuClaimId: string) => { + const url = `${import.meta.env.VITE_DEPLOY_API_URL}/gpuClaims/${gpuClaimId}`; + const response = await fetch(url, { + method: "GET", + headers: { + Authorization: `Bearer ${token}`, + }, + }); + const result = await response.json(); + if (typeof result !== "object") { + throw new Error("Error getting GPU claim, response was not an object"); + } + return result; +}; + +export const deleteGpuClaim = async (token: Jwt, gpuClaimId: string) => { + const url = `${import.meta.env.VITE_DEPLOY_API_URL}/gpuClaims/${gpuClaimId}`; + const response = await fetch(url, { + method: "DELETE", + headers: { + Authorization: `Bearer ${token}`, + }, + }); + const result = await response.json(); + if (typeof result !== "object") { + throw new Error("Error deleting GPU claim, response was not an object"); + } + return result; +}; + +export const createGpuClaim = async (token: Jwt, gpuClaim: GpuClaimCreate) => { + const url = `${import.meta.env.VITE_DEPLOY_API_URL}/gpuClaims`; + const response = await fetch(url, { + method: "POST", + headers: { + Authorization: `Bearer ${token}`, + }, + body: JSON.stringify(gpuClaim), + }); + const result = await response.json(); + if (typeof result !== "object") { + throw new Error("Error creating GPU claim, response was not an object"); + } + return result; +}; diff --git a/src/components/admin/ClusterOverviewTab.tsx b/src/components/admin/ClusterOverviewTab.tsx new file mode 100644 index 00000000..1011fda7 --- /dev/null +++ b/src/components/admin/ClusterOverviewTab.tsx @@ -0,0 +1,80 @@ +import { + Paper, + Stack, + Typography, + Card, + CardContent, + Chip, + Divider, +} from "@mui/material"; +import useResource from "../../hooks/useResource"; +import { useTranslation } from "react-i18next"; +import DRAConfigPanel from "./DRAConfigPanel"; +import { useEffect, useState } from "react"; +import { discover } from "../../api/deploy/discover"; + +export default function ClusterOverviewTab() { + const { t } = useTranslation(); + const { zones } = useResource(); + const [roles, setRoles] = useState([]); + + useEffect(() => { + discover().then((resp) => { + setRoles([...new Set([...resp.roles.map((r) => r.name), "admin"])]); + }); + }, []); + + return ( + + + {t("clusters-overview")} + + {zones?.map((zone) => { + const hasDRA = zone.capabilities?.includes("dra"); + + return ( + + + + {/* Header */} + + + {zone.name} + + + {zone.description} + + + + {/* Capabilities */} + + {zone.capabilities.map((cap) => ( + + ))} + + + {/* Optional DRA section */} + {hasDRA && ( + <> + + + + )} + + + + ); + })} + + + ); +} diff --git a/src/components/admin/DRAConfigPanel.tsx b/src/components/admin/DRAConfigPanel.tsx new file mode 100644 index 00000000..acf707d4 --- /dev/null +++ b/src/components/admin/DRAConfigPanel.tsx @@ -0,0 +1,110 @@ +import { useState } from "react"; +import { Stack, Typography, Button, Alert, Box } from "@mui/material"; +import GpuClaimModal from "./GPUClaimModal"; +import { GpuClaimCreate } from "../../temporaryTypesRemoveMe"; +import { createGpuClaim } from "../../api/deploy/gpuClaims"; +import { useKeycloak } from "@react-keycloak/web"; +import { enqueueSnackbar } from "notistack"; +import useAdmin from "../../hooks/useAdmin"; + +interface DRAConfigPanelProps { + zone: { + name: string; + // extend later with draResources, policies, etc + }; + roles: string[]; +} + +export default function DRAConfigPanel({ zone, roles }: DRAConfigPanelProps) { + const [gpuModalOpen, setGpuModalOpen] = useState(false); + const { gpuClaims } = useAdmin(); + const { keycloak } = useKeycloak(); + + const handleAddGpuClaim = async (claim: GpuClaimCreate) => { + try { + const response = await createGpuClaim(keycloak.token!, claim); + if (response["validationErrors"] != undefined) { + throw response["validationErrors"]; + } + setGpuModalOpen(false); + } catch (ex) { + enqueueSnackbar<"error">({ message: "Failed to create gpu claim" + ex }); + } + }; + + return ( + + + Dynamic Resource Allocation (DRA) + + + + This zone supports Dynamic Resource Allocation. Configure DRA resources + and policies below. + + + + + {gpuClaims?.length === 0 ? ( + + No DRA resources configured yet. + + ) : ( + + {gpuClaims?.map((claim) => ( + + {claim.name} + + Zone: {claim.zone} + + + Allowed roles: {claim.allowedRoles?.join(", ") || "All"} + + + Requested GPUs:{" "} + {Object.keys(claim.requested || {}).join(", ")} + + + ))} + + )} + + + + + + + + {/* GPU Claim Modal */} + setGpuModalOpen(false)} + onSubmit={handleAddGpuClaim} + /> + + ); +} diff --git a/src/components/admin/GPUClaimEditor.tsx b/src/components/admin/GPUClaimEditor.tsx new file mode 100644 index 00000000..a1b400ac --- /dev/null +++ b/src/components/admin/GPUClaimEditor.tsx @@ -0,0 +1,272 @@ +import { + Stack, + TextField, + Button, + MenuItem, + IconButton, + Paper, + Autocomplete, + Typography, + Chip, +} from "@mui/material"; +import DeleteIcon from "@mui/icons-material/Delete"; +import { + GenericDeviceConfiguration, + RequestedGpuCreate, +} from "../../temporaryTypesRemoveMe"; + +interface Props { + value: RequestedGpuCreate[]; + onChange: (value: RequestedGpuCreate[]) => void; +} + +export default function GpuClaimEditor({ value, onChange }: Props) { + const update = (idx: number, patch: Partial) => { + const next = [...value]; + next[idx] = { ...next[idx], ...patch }; + onChange(next); + }; + + const updateRequested = (idx: number, patch: Partial) => { + const next = [...value]; + next[idx] = { + ...next[idx], + ...patch, + }; + onChange(next); + }; + + return ( + + {value.map((req, idx) => { + const gpu = req; + + return ( + + + {/* Header */} + + update(idx, { name: e.target.value })} + required + fullWidth + /> + + onChange(value.filter((_, i) => i !== idx))} + > + + + + + + updateRequested(idx, { + allocationMode: e.target.value, + }) + } + > + All + Exact count + + + {gpu.allocationMode === "ExactCount" && ( + + updateRequested(idx, { + count: Number(e.target.value), + }) + } + /> + )} + + + updateRequested(idx, { + deviceClassName: e.target.value, + }) + } + placeholder="nvidia.com/gpu" + defaultValue={"nvidia.com/gpu"} + helperText="RFC1123 name" + required + /> + + {/* TODO: CEL expr for selectors*/} + {/* { + updateRequested(idx, { + selectors: sel, + }); + }} + /> */} + + + {"Driver configuration"} + + updateRequested(idx, { + config: { driver: e.target.value }, + }) + } + /> + {(gpu.config as GenericDeviceConfiguration)?.driver == + "gpu.nvidia.com" && ( + <> + + updateRequested(idx, { + config: { + driver: + (gpu.config as GenericDeviceConfiguration) + ?.driver || "gpu.nvidia.com", + parameters: { + ...(gpu.config as any)?.parameters, + sharing: { + ...(gpu.config as any)?.parameters?.sharing, + strategy: strat, + }, + }, + }, + }) + } + renderTags={(s, getTagProps) => + s.map((strategy, index) => ( + + )) + } + renderInput={(params) => ( + + )} + /> + + <> + {(gpu.config as any)?.parameters?.sharing?.strategy == + "MPS" ? ( + + + MPS Configuration + + + + updateRequested(idx, { + config: { + driver: "gpu.nvidia.com", + parameters: { + ...(gpu.config as any)?.parameters, + sharing: { + ...(gpu.config as any)?.parameters + ?.sharing, + mps: { + ...(gpu.config as any)?.parameters + ?.sharing?.mps, + defaultActiveThreadPercentage: Number( + e.target.value + ), + }, + }, + }, + }, + }) + } + /> + + + updateRequested(idx, { + config: { + driver: "gpu.nvidia.com", + parameters: { + ...(gpu.config as any)?.parameters, + sharing: { + ...(gpu.config as any)?.parameters + ?.sharing, + mps: { + ...(gpu.config as any)?.parameters + ?.sharing?.mps, + defaultPinnedDeviceMemoryLimit: + e.target.value, + }, + }, + }, + }, + }) + } + /> + + ) : ( + (gpu.config as any)?.parameters?.sharing?.strategy == + "TimeSlicing" && ( + timeslicing TBD + ) + )} + + + )} + + + + ); + })} + + + + ); +} diff --git a/src/components/admin/GPUClaimModal.tsx b/src/components/admin/GPUClaimModal.tsx new file mode 100644 index 00000000..dc92f0cd --- /dev/null +++ b/src/components/admin/GPUClaimModal.tsx @@ -0,0 +1,203 @@ +import { + Dialog, + DialogTitle, + DialogContent, + DialogActions, + Stack, + TextField, + Button, + Chip, + Autocomplete, + Tooltip, + Typography, + useTheme, +} from "@mui/material"; +import { useState } from "react"; +import GpuClaimEditor from "./GPUClaimEditor"; +import { GpuClaimCreate } from "../../temporaryTypesRemoveMe"; +import Iconify from "../Iconify"; +import { useTranslation } from "react-i18next"; + +interface Props { + open: boolean; + zone: string; + roles: string[]; + initialValue?: GpuClaimCreate; + onClose: () => void; + onSubmit: (value: GpuClaimCreate) => void; +} + +type Role = + | "default" + | "bronze" + | "silver" + | "gold" + | "platinum" + | "admin" + | string; + +function getChipColor(role: Role) { + const metalStyles: Record = { + bronze: { + background: + "linear-gradient(145deg, #cd7f32 0%, #b06a2f 50%, #d99c6c 100%)", + color: "#000", + boxShadow: "inset 0 1px 2px rgba(255,255,255,0.3)", + }, + silver: { + background: + "linear-gradient(145deg, #e6e8eb 0%, #c0c0c0 50%, #f5f5f5 100%)", + color: "#000", + boxShadow: "inset 0 1px 2px rgba(255,255,255,0.5)", + }, + gold: { + background: + "linear-gradient(145deg, #ffd700 0%, #e6c200 50%, #ffea70 100%)", + color: "#000", + boxShadow: "inset 0 1px 2px rgba(255,255,255,0.4)", + }, + platinum: { + background: + "linear-gradient(145deg, #e5e4e2 0%, #cfcfcf 50%, #ffffff 100%)", + color: "#000", + boxShadow: "inset 0 1px 2px rgba(255,255,255,0.5)", + }, + }; + + const muiColors: Record< + string, + | "default" + | "error" + | "primary" + | "secondary" + | "info" + | "success" + | "warning" + > = { + default: "default", + admin: "error", + }; + + if (metalStyles[role]) { + return { + sx: { + background: metalStyles[role].background, + color: metalStyles[role].color, + boxShadow: metalStyles[role].boxShadow, + }, + }; + } + + return { color: muiColors[role] ?? "default" }; +} + +export default function GpuClaimModal({ + open, + zone, + roles, + initialValue, + onClose, + onSubmit, +}: Props) { + const [value, setValue] = useState( + initialValue ?? { + name: "", + zone, + allowedRoles: [], + requested: [], + } + ); + const { t } = useTranslation(); + const theme = useTheme(); + + return ( + + + {initialValue ? t("edit-gpu-claim") : t("create-gpu-claim")} + + + + + setValue({ ...value, name: e.target.value })} + required + /> + + + + + setValue({ + ...value, + allowedRoles: Array.isArray(roles) ? roles : [roles], + }) + } + renderTags={(r, getTagProps) => + r.map((role, index) => ( + + )) + } + renderInput={(params) => ( + + + + {"These roles will be allowed to use the GPUClaim."} + +

+ + {"If left empty, anyone will be allowed to use it."} + +

+ + { + 'The admin "role" will make sure that only users that are admin can use it.' + } + + + } + > + + + +
+ +
+ )} + /> + + setValue({ ...value, requested })} + /> +
+
+ + + + + +
+ ); +} diff --git a/src/components/admin/TimeAgo.tsx b/src/components/admin/TimeAgo.tsx index 789e47aa..23ab85f3 100644 --- a/src/components/admin/TimeAgo.tsx +++ b/src/components/admin/TimeAgo.tsx @@ -1,10 +1,13 @@ import React, { useState, useEffect } from "react"; import { Typography } from "@mui/material"; +import { Variant } from "@mui/material/styles/createTypography"; -const TimeAgo: React.FC<{ createdAt: string | undefined }> = ({ - createdAt, -}) => { +const TimeAgo: React.FC<{ + createdAt: string | undefined; + variant?: Variant | undefined; +}> = ({ createdAt, variant }) => { const [timeAgo, setTimeAgo] = useState(""); + variant = variant != undefined ? variant : "body2"; const calculateTimeAgo = (createdAt: string) => { const now = new Date().getTime(); @@ -44,7 +47,7 @@ const TimeAgo: React.FC<{ createdAt: string | undefined }> = ({ } }, [createdAt]); - return {timeAgo}; + return {timeAgo}; }; export default TimeAgo; diff --git a/src/components/chart/BaseOptionChart.tsx b/src/components/chart/BaseOptionChart.tsx index 68e17d43..55c2ed69 100644 --- a/src/components/chart/BaseOptionChart.tsx +++ b/src/components/chart/BaseOptionChart.tsx @@ -172,6 +172,7 @@ export default function BaseOptionChart(): ApexOptions { position: "top", horizontalAlign: "right", markers: { + //@ts-ignore idk if this exists or not. radius: 12, }, fontWeight: 500, diff --git a/src/contexts/AdminResourceContext.tsx b/src/contexts/AdminResourceContext.tsx index 9062d55c..636c1446 100644 --- a/src/contexts/AdminResourceContext.tsx +++ b/src/contexts/AdminResourceContext.tsx @@ -35,6 +35,8 @@ import useResource from "../hooks/useResource"; import { TFunction } from "i18next"; import { getHostsVerbose } from "../api/deploy/hosts"; import { getSystemCapacities } from "../api/deploy/systemCapacities"; +import { GpuClaimRead } from "../temporaryTypesRemoveMe"; +import { listGpuClaims } from "../api/deploy/gpuClaims"; type AdminResourceContextType = { fetchingEnabled: boolean; @@ -118,6 +120,9 @@ type AdminResourceContextType = { // SystemCapacities systemCapacities: SystemCapacities | undefined; + + // GpuClaims + gpuClaims: GpuClaimRead[] | undefined; }; const initialState: AdminResourceContextType = { @@ -202,6 +207,9 @@ const initialState: AdminResourceContextType = { // SystemCapacities systemCapacities: undefined, + + // GpuClaims + gpuClaims: undefined, }; export const AdminResourceContext = createContext(initialState); @@ -295,6 +303,10 @@ export const AdminResourceContextProvider = ({ SystemCapacities | undefined >(undefined); + const [gpuClaims, setGpuClaims] = useState( + undefined + ); + const [lastRefreshRtt, setLastRefreshRtt] = useState(0); const [lastRefresh, setLastRefresh] = useState(0); const [loading, setLoading] = useState(false); @@ -336,7 +348,8 @@ export const AdminResourceContextProvider = ({ jobsPageSize, setJobs, setHosts, - setSystemCapacities + setSystemCapacities, + setGpuClaims ).finally(() => setLoading(false)); } }; @@ -458,6 +471,9 @@ export const AdminResourceContextProvider = ({ // SystemCapacities systemCapacities, + + // GpuClaims + gpuClaims, }} > {children} @@ -511,7 +527,10 @@ async function fetchResources( setHosts: Dispatch>, // SystemCapacities - setSystemCapacities: Dispatch> + setSystemCapacities: Dispatch>, + + // GpuClaims + setGpuClaims: Dispatch> ) { if (!(initialized && keycloak.authenticated && keycloak.token)) return; const rtts: Record = {}; @@ -656,6 +675,21 @@ async function fetchResources( ); } }, + + async () => { + try { + const start = performance.now(); + const response = await listGpuClaims(keycloak.token!, true); + rtts[9] = { start, end: performance.now() }; + if (response) setGpuClaims(response); + } catch (error: any) { + errorHandler(error).forEach((e) => + enqueueSnackbar(t("error-could-not-fetch-gpu-claims") + ": " + e, { + variant: "error", + }) + ); + } + }, ]; await Promise.all(promises.map((p) => p())); diff --git a/src/contexts/ResourceContext.tsx b/src/contexts/ResourceContext.tsx index 089494e0..73927f17 100644 --- a/src/contexts/ResourceContext.tsx +++ b/src/contexts/ResourceContext.tsx @@ -28,6 +28,8 @@ import { import { listGpuGroups } from "../api/deploy/gpuGroups"; import { listGpuLeases } from "../api/deploy/gpuLeases"; import { listMigrations } from "../api/deploy/resourceMigrations"; +import { GpuClaimRead } from "../temporaryTypesRemoveMe"; +import { listGpuClaims } from "../api/deploy/gpuClaims"; type ResourceContextType = { rows: Resource[]; @@ -52,6 +54,7 @@ type ResourceContextType = { setGpuGroups: (gpuGroups: GpuGroupRead[]) => void; gpuLeases: GpuLeaseRead[]; setGpuLeases: (gpuLeases: GpuLeaseRead[]) => void; + gpuClaims: GpuClaimRead[] | undefined; resourceMigrations: ResourceMigrationRead[]; setResourceMigrations: (resourceMigrations: ResourceMigrationRead[]) => void; queueJob: (job: Job) => void; @@ -87,6 +90,7 @@ const initialState: ResourceContextType = { setGpuGroups: () => {}, gpuLeases: new Array(), setGpuLeases: () => {}, + gpuClaims: new Array(), resourceMigrations: new Array(), setResourceMigrations: () => {}, queueJob: () => {}, @@ -124,6 +128,7 @@ export const ResourceContextProvider = ({ const [zones, setZones] = useState([]); const [gpuGroups, setGpuGroups] = useState([]); const [gpuLeases, setGpuLeases] = useState([]); + const [gpuClaims, setGpuClaims] = useState([]); const [resourceMigrations, setResourceMigrations] = useState< ResourceMigrationRead[] >([]); @@ -253,6 +258,20 @@ export const ResourceContextProvider = ({ } }; + const loadGpuClaims = async () => { + if (!(initialized && keycloak.authenticated && keycloak.token)) return; + try { + const gpuClaims = await listGpuClaims(keycloak.token); + setGpuClaims(gpuClaims); + } catch (error: any) { + errorHandler(error).forEach((e) => + enqueueSnackbar("Error fetching GPU claims: " + e, { + variant: "error", + }) + ); + } + }; + const loadResourceMigrations = async () => { if (!(initialized && keycloak.authenticated && keycloak.token)) return; try { @@ -391,6 +410,7 @@ export const ResourceContextProvider = ({ loadZones(); loadGpuGroups(); loadGpuLeases(); + loadGpuClaims(); loadResourceMigrations(); // eslint-disable-next-line @@ -440,6 +460,7 @@ export const ResourceContextProvider = ({ setGpuGroups, gpuLeases, setGpuLeases, + gpuClaims, resourceMigrations, setResourceMigrations, queueJob, diff --git a/src/locales/en.json b/src/locales/en.json index f53eabd4..f62c7c1c 100644 --- a/src/locales/en.json +++ b/src/locales/en.json @@ -559,6 +559,16 @@ "maia-intro-body": "Are you a researcher or student in biomedical engineering at KTH who needs state-of-the-art deep learning resources and compute? MAIA is the platform for developing, testing, and deploying medical AI, from early prototypes to real clinical workflows. You get straightforward access to compute via JupyterHub, SSH, or a virtual desktop. Our infrastructure, provided jointly with KTH Cloud, scales to different computational needs. Designed for collaboration and integration with hospital systems, MAIA lets you validate ideas and pilot your solutions in real-world clinical settings.", "maia-intro-footer": "Request a MAIA account today and register a project to start building your medical AI solution.", "maia-intro-header": "Discover ", - "button-get-started-maia": "Get started with MAIA" + "button-get-started-maia": "Get started with MAIA", + "use-vms": "Use VMs", + "landing-hero-gpus": "GPUs (deployment)", + "deployment-gpu": "GPU", + "deployment-gpu-configuration": "Configure GPUs for deployment", + "deployment-gpu-none": "No GPUs", + "deployment-gpu-add": "Add GPUs", + "deployment-gpu-subheader": "GPUs on deployments utilizes a new k8s feature called DRA combined with driver specific features to share GPUs between multiple containers.", + "deployment-gpu-quota": "GPU quota is user global, if your quota allows multiple GPUs you can allocate them how you like, multiple on a single deployment or single allocation for different deployments.", + "deployment-gpu-unstable": "This feature relies on BETA features from NVIDIA, and issues might occur. Feel free to reach out on discord if you have any questions.", + "clusters-overview": "Clusters" } } diff --git a/src/locales/se.json b/src/locales/se.json index c6a7c2f1..cf8fd437 100644 --- a/src/locales/se.json +++ b/src/locales/se.json @@ -560,6 +560,16 @@ "maia-intro-body": "Är du forskare eller student inom medicinsk teknik på KTH och behöver tillgång till datorresurser för deep learning och simuleringar? MAIA är en plattform för att utveckla, testa och implementera medicinsk AI, från tidiga prototyper till kliniska arbetsflöden. Du får åtkomst till beräkningsresurser via JupyterHub, SSH eller en virtual desktop. Vår infrastruktur, som tillhandahålls tillsammans med KTH Cloud, kan möta olika typer av behov för beräkningsresurser. MAIA är utformad för samarbete och integration med sjukhussystem och gör det möjligt för dig att validera idéer och testa dina lösningar även i kliniska miljöer.", "maia-intro-footer": "Skapa ett MAIA-konto redan idag och registrera ett projekt", "maia-intro-header": "Testa ", - "button-get-started-maia": "Kom igång med MAIA" + "button-get-started-maia": "Kom igång med MAIA", + "use-vms": "Använda VMar", + "landing-hero-gpus": "GPUs (deployment)", + "deployment-gpu": "GPU", + "deployment-gpu-configuration": "Konfigurera GPUer för deployment", + "deployment-gpu-none": "Inga GPUer", + "deployment-gpu-add": "Lägg till GPUer", + "deployment-gpu-subheader": "GPUs på deployments använder en ny k8s funktion som heter DRA kombinerat med drivrutin funktioner för att dela ett grafikkort mellan flera containrar.", + "deployment-gpu-quota": "GPU användnings kvoten är användarglobal, om din kvot tillåter flera GPUer kan du dela upp dom som du vill. Dvs antingen ett grafikkort per deployment, eller flera GPUer på en deployment.", + "deployment-gpu-unstable": "Denna funktion bygger på funktioner från NVIDIA som är i BETA, fel kan uppstå. Kontakta oss på discord om du har några frågor.", + "clusters-overview": "Kluster" } } diff --git a/src/pages/admin/AdminV2.tsx b/src/pages/admin/AdminV2.tsx index a4b85d88..706c048b 100644 --- a/src/pages/admin/AdminV2.tsx +++ b/src/pages/admin/AdminV2.tsx @@ -1,5 +1,7 @@ import { + Box, Card, + Chip, Container, LinearProgress, Link, @@ -43,6 +45,16 @@ import { deleteVM } from "../../api/deploy/vms"; import { deleteGpuLease } from "../../api/deploy/gpuLeases"; import { deleteTeam } from "../../api/deploy/teams"; import TimeLeft from "../../components/admin/TimeLeft"; +import { + GpuClaimConsumer, + GpuClaimRead, + GpuClaimStatus, +} from "../../temporaryTypesRemoveMe"; +import Iconify from "../../components/Iconify"; +import Label from "../../components/Label"; +import TimeAgo from "../../components/admin/TimeAgo"; +import CluseterOverviewTab from "../../components/admin/ClusterOverviewTab"; +import { deleteGpuClaim } from "../../api/deploy/gpuClaims"; export default function AdminV2() { const { tab: initialTab } = useParams(); @@ -113,6 +125,9 @@ export default function AdminV2() { setGpuGroupsPage, gpuGroupsPageSize, setGpuGroupsPageSize, + + // GpuClaims + gpuClaims, } = useAdmin(); const navigate = useNavigate(); @@ -310,6 +325,178 @@ export default function AdminV2() { }, ], }, + { + label: "GPU Claims", + columns: [ + { id: "id", label: "ID" }, + { id: "name", label: "Name" }, + { id: "zone", label: "Zone" }, + { + id: "*", + label: "Requested", + renderFunc: (claim: GpuClaimRead | undefined) => { + const requested = claim?.requested; + const allocated = claim?.allocated; + + if (!requested || Object.keys(requested).length === 0) { + return ( + + {t("gpuclaim-no-gpu-requested")} + + ); + } + + return ( + + {Object.entries(requested).map(([name, req]) => { + const allocs = allocated?.[name]; + const vendor = + (req.config as any)?.type || + (req.config as any)?.driver || + "unknown"; + const sharing = (req.config as any)?.parameters?.sharing + ?.strategy; + const sharingConfig = vendor?.toLowerCase().includes("nvidia") + ? sharing?.includes("MPS") + ? (req.config as any)?.parameters?.mpsConfig + : (req.config as any)?.parameters?.timeslicingConfig + : undefined; + const allocatedChip = allocs ? ( + + {(allocs as Array).map((alloc) => ( + + ))} + + ) : ( + + ); + + return ( + + + + {allocatedChip} + + + + {req.count} {vendor} {sharing && `• ${sharing}`}{" "} + {sharingConfig && `• ${sharingConfig}`} + + + ); + })} + + ); + }, + }, + { + id: "consumers", + label: "Consumers", + renderFunc: (consumers: GpuClaimConsumer[] | undefined) => { + if (consumers == undefined) return <>; + return ( + + {consumers.map((c) => ( + + ))} + + ); + }, + }, + { + id: "status", + label: "Status", + renderFunc: (status: GpuClaimStatus) => { + const phase = status?.phase?.toLowerCase(); + + let color = "default"; + if (phase === "bound") color = "success"; + else if (phase === "pending") color = "info"; + else if (phase === "failed") color = "error"; + + return ( + + + {status?.lastSynced != undefined && ( + + )} + + ); + }, + }, + ], + actions: [ + { + label: t("button-delete"), + onClick: (claim: GpuClaimRead) => { + if (keycloak.token) deleteGpuClaim(keycloak.token, claim.id); + }, + withConfirm: true, + }, + ], + }, { label: "Users", columns: [ @@ -327,7 +514,7 @@ export default function AdminV2() { const calculatePercentage = (used: number, total: number) => total ? ((used / total) * 100).toFixed(1) : "0.0"; - const { cpu, ram, disk } = { + const { cpu, ram, disk, gpu } = { cpu: calculatePercentage( user.usage.cpuCores, user.quota.cpuCores @@ -337,6 +524,14 @@ export default function AdminV2() { user.usage.diskSize, user.quota.diskSize ), + gpu: + (user.usage as any).gpus != undefined && + (user.quota as any).gpus != undefined + ? calculatePercentage( + (user.usage as any).gpus, + (user.quota as any).gpus + ) + : undefined, }; return ( @@ -361,6 +556,20 @@ export default function AdminV2() { > + + {gpu && ( + <> + GPUs + + + + + )} ); }, @@ -443,6 +652,7 @@ export default function AdminV2() { {} ); tabLookup["hosts"] = resourceConfig.length; + tabLookup["overview"] = resourceConfig.length + 1; useEffect(() => { if ( initialTab && @@ -494,6 +704,14 @@ export default function AdminV2() { pageSize: gpuGroupsPageSize, setPageSize: setGpuGroupsPageSize, }, + { + data: gpuClaims, + setFilter: () => {}, + page: 0, + setPage: () => {}, + pageSize: gpuClaims?.length || 0, + setPageSize: () => {}, + }, { data: users, filter: usersFilter, @@ -555,6 +773,7 @@ export default function AdminV2() { /> )), , + , ]; useEffect(() => { @@ -588,6 +807,7 @@ export default function AdminV2() { ))} + {tabs[activeTab]} diff --git a/src/pages/create/Create.tsx b/src/pages/create/Create.tsx index 6e9a5d5d..89e39c44 100644 --- a/src/pages/create/Create.tsx +++ b/src/pages/create/Create.tsx @@ -37,7 +37,7 @@ export const Create = () => { const { initialized } = useKeycloak(); const { t } = useTranslation(); const { enqueueSnackbar } = useSnackbar(); - const { queueJob } = useResource(); + const { queueJob, user } = useResource(); const [alignment, _setAlignment] = useState(""); const setAlignment = (newAlignment: string) => { _setAlignment(newAlignment); @@ -118,7 +118,10 @@ export const Create = () => { + + + + {(!gpus || gpus.length === 0) && ( + <> + {/*@ts-ignore legacy api */} + + + theme.palette.action.hover, + textAlign: "center", + }} + > + + {t("deployment-gpu-none")} + + + + + )} + + {gpus?.map((gpu, index) => { + const isValid = validateGPU(gpu); + + const handleChange = ( + index: number, + field: keyof DeploymentGPU, + value: string + ) => { + const unset = field === "name" ? "" : undefined; + const trimmed = value.trim(); + setGpus((prev) => { + const updated = [...(prev || [])]; + updated[index] = { + ...updated[index], + [field]: trimmed === "" ? unset : trimmed, + }; + return updated; + }); + }; + + return ( + <> + {/*@ts-ignore legacy api */} + + + {gpu.claimName != "" && ( + { + handleChange( + index, + "name", + newValue ? newValue : "" + ); + }} + options={Object.keys( + gpuClaims?.find( + (g) => g.name === gpu.claimName + )?.requested ?? {} + )} + getOptionLabel={(option) => option} + renderInput={(params) => ( + + )} + isOptionEqualToValue={(option, value) => + option === value + } + disableClearable + /> + )} + { + handleChange( + index, + "claimName", + newValue ? newValue : "" + ); + }} + options={ + gpuClaims + ?.filter((c) => c.zone == resource.zone) + .map((c) => c.name) || [] + } + getOptionLabel={(option) => option} + renderInput={(params) => ( + + )} + isOptionEqualToValue={(option, value) => + option === value + } + disableClearable + /> + + + setGpus((prev) => + prev?.filter((_, i) => i !== index) + ) + } + > + + + + + + + ); + })} + + + {/*@ts-ignore legacy api */} + theme.palette.action.hover, + borderRadius: "1rem", + }} + > + {/*@ts-ignore legacy api */} + + {gpus?.some((g) => !validateGPU(g)) && ( + + {t("deployment-gpu-invalid-config-warning")} + + )} + + + + )} )} @@ -602,3 +900,20 @@ export const Specs = ({ resource }: { resource: Resource }) => { ); }; + +const gpusEqual = (a?: DeploymentGPU[], b?: DeploymentGPU[]): boolean => { + if (!a && !b) return true; + if (!a || !b) return false; + if (a.length !== b.length) return false; + + return a.every((gpuA, index) => { + const gpuB = b[index]; + if (!gpuB) return false; + + const keys = new Set([...Object.keys(gpuA), ...Object.keys(gpuB)]); + for (const key of keys) { + if ((gpuA as any)[key] !== (gpuB as any)[key]) return false; + } + return true; + }); +}; diff --git a/src/pages/tiers/Tiers.tsx b/src/pages/tiers/Tiers.tsx index fea89b76..a9954a52 100644 --- a/src/pages/tiers/Tiers.tsx +++ b/src/pages/tiers/Tiers.tsx @@ -61,6 +61,14 @@ const TierCard = ({ tier }: { tier: Role }) => { alignItems="flex-start" justifyContent={"flex-start"} > + + {(tier.permissions.includes("useVms") ? "✅ " : "❌ ") + + t("use-vms")} + { > {`🧠 ${t("memory")}: ${tier.quota.ram} GB`} + {(tier.quota as any).gpus != undefined && ( + + {`🤖 ${t("landing-hero-gpus")}: ${(tier.quota as any).gpus}`} + + )} ; + /** + * Allocated contains the GPUs that have been successfully bound/allocated. + */ + allocated?: Record; + /** + * Consumers are the workloads currently using this claim. + */ + consumers?: GpuClaimConsumer[]; + /** + * Status reflects the reconciliation and/or lifecycle state. + */ + status?: GpuClaimStatus; + /** + * LastError holds the last reconciliation or provisioning error message. + */ + lastError?: string; + createdAt: string; + updatedAt?: string; +} +export interface GpuClaimCreate { + name: string; + zone?: string; + allowedRoles?: string[]; + /** + * Requested contains all requested GPU configurations by key (request.Name). + */ + requested?: RequestedGpuCreate[]; +} +export interface GpuClaimCreated { + id: string; + jobId: string; +} +export interface RequestedGpuCreate extends RequestedGpu { + name: string; +} +/** + * RequestedGpu describes the desired GPU configuration that was requested. + */ +export interface RequestedGpu { + allocationMode: string; + capacity?: { [key: string]: string }; + count?: number /* int64 */; + deviceClassName: string; + selectors?: string[]; + config?: GpuDeviceConfigurationWrapper; +} +export interface GpuDeviceConfigurationWrapper {} +/** + * GpuDeviceConfiguration represents a vendor-specific GPU configuration. + */ +export type GpuDeviceConfiguration = any /* json.Marshaler */; +/** + * GenericDeviceConfiguration is a catch-all configuration when no vendor-specific struct is used. + */ +export interface GenericDeviceConfiguration { + driver: string; +} +/** + * NvidiaDeviceConfiguration represents NVIDIA-specific configuration options. + */ +export interface NvidiaDeviceConfiguration { + driver: string; + parameters?: any /* nvidia.GpuConfig */; +} +/** + * AllocatedGpu represents a concrete allocated GPU or GPU share. + */ +export interface AllocatedGpu { + pool?: string; + device?: string; + shareID?: string; + adminAccess?: boolean; +} +/** + * GpuClaimConsumer describes a workload consuming this GPU claim. + */ +export interface GpuClaimConsumer { + apiGroup?: string; + resource?: string; + name?: string; + uid?: string; +} +/** + * GpuClaimStatus represents runtime state and metadata about allocation progress. + */ +export interface GpuClaimStatus { + phase?: string; + message?: string; + updatedAt?: string; + lastSynced?: string; +} diff --git a/src/theme/overrides/Backdrop.ts b/src/theme/overrides/Backdrop.ts index 708dd141..61b69a1a 100644 --- a/src/theme/overrides/Backdrop.ts +++ b/src/theme/overrides/Backdrop.ts @@ -9,6 +9,7 @@ export default function Backdrop(theme: CustomTheme) { MuiBackdrop: { styleOverrides: { root: { + backgroundColor: "rgb(22,28,36", background: [ `rgb(22,28,36)`, `-moz-linear-gradient(75deg, ${varLow} 0%, ${varHigh} 100%)`, diff --git a/src/types.ts b/src/types.ts index 3d952614..46236662 100644 --- a/src/types.ts +++ b/src/types.ts @@ -2,6 +2,7 @@ import { UserRead, DeploymentRead, JobRead, + DeploymentSpecs, } from "@kthcloud/go-deploy-types/types/v2/body/index"; import { VmRead as V2VmRead, @@ -17,9 +18,19 @@ export interface Vm extends V2VmRead { type: "vm"; } +export interface DeploymentGPU { + name: string; + claimName: string; +} + +export interface DeploymentSpecsGPU extends DeploymentSpecs { + gpus?: DeploymentGPU[]; +} + export interface Deployment extends DeploymentRead { type: "deployment"; deploymentType?: string; + specs: DeploymentSpecsGPU; } export type Resource = Vm | Deployment; diff --git a/tsconfig.json b/tsconfig.json index 12fac75c..9fe9c0c4 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -21,7 +21,10 @@ "noFallthroughCasesInSwitch": true, "noImplicitAny": true, "noImplicitThis": true, - "strictNullChecks": true + "strictNullChecks": true, + + /* vite */ + "types": ["vite/client"] }, "include": ["src"], "references": [{ "path": "./tsconfig.node.json" }]