From 121816e19c9c7601e93987c0a7b99b6d51e4d372 Mon Sep 17 00:00:00 2001 From: Michael Lance Date: Wed, 3 Jun 2026 11:56:14 -0500 Subject: [PATCH 1/7] update anatomy.cpp to use USM --- adoc/code/anatomy.cpp | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/adoc/code/anatomy.cpp b/adoc/code/anatomy.cpp index 702c9523e..b0b76dfde 100644 --- a/adoc/code/anatomy.cpp +++ b/adoc/code/anatomy.cpp @@ -1,39 +1,26 @@ -// Copyright (c) 2011-2026 The Khronos Group, Inc. -// SPDX-License-Identifier: Apache-2.0 - #include #include -using namespace sycl; // (optional) avoids need for "sycl::" before SYCL names +using namespace sycl; // (optional) avoids need for "sycl::" before SYCL names int main() { - int data[1024]; // Allocate data to be worked on - // Create a default queue to enqueue work to the default device - queue myQueue; + queue myQueue{}; + // Allocate shared memory to be worked on on the default device + int *A_shared = malloc_shared(1024, myQueue); - // By wrapping all the SYCL work in a {} block, we ensure - // all SYCL tasks must complete before exiting the block, - // because the destructor of resultBuf will wait - { - // Wrap our data variable in a buffer - buffer resultBuf{data, {1024}}; + // Enqueue a parallel_for task with 1024 work-items + myQueue.parallel_for(1024, [=](id<1> idx) { + // Initialize each buffer element with its own rank number starting at 0 + A_shared[idx] = idx; + }); // End of the kernel function - // Create a command group to issue commands to the queue - myQueue.submit([&](handler& cgh) { - // Request write access to the buffer without initialization - accessor writeResult{resultBuf, cgh, write_only, no_init}; + myQueue.wait(); // Wait for the queue to finish executing on the device - // Enqueue a parallel_for task with 1024 work-items - cgh.parallel_for(1024, [=](id<1> idx) { - // Initialize each buffer element with its own rank number starting at 0 - writeResult[idx] = idx; - 
}); // End of the kernel function - }); // End of our commands for this queue - } // End of scope, so we wait for work producing resultBuf to complete + // Reclaim memory on the host and the device + free(A_shared, myQueue); // Print result for (int i = 0; i < 1024; i++) - std::cout << "data[" << i << "] = " << data[i] << std::endl; - + std::cout << "A_shared[" << i << "] = " << A_shared[i] << std::endl; return 0; } From 33012a725ad084a80e3c2dd0f814a2ab5063aaae Mon Sep 17 00:00:00 2001 From: Michael Lance Date: Thu, 4 Jun 2026 15:25:53 -0500 Subject: [PATCH 2/7] Use in_order queue and malloc_device --- adoc/code/anatomy.cpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/adoc/code/anatomy.cpp b/adoc/code/anatomy.cpp index b0b76dfde..51df5c243 100644 --- a/adoc/code/anatomy.cpp +++ b/adoc/code/anatomy.cpp @@ -2,25 +2,32 @@ #include using namespace sycl; // (optional) avoids need for "sycl::" before SYCL names +std::vector A_host(1024); + int main() { // Create a default queue to enqueue work to the default device - queue myQueue{}; + queue myQueue{property::queue::in_order()}; + // Allocate shared memory to be worked on on the default device - int *A_shared = malloc_shared(1024, myQueue); + int *A_device = malloc_device(1024, myQueue); // Enqueue a parallel_for task with 1024 work-items myQueue.parallel_for(1024, [=](id<1> idx) { // Initialize each buffer element with its own rank number starting at 0 - A_shared[idx] = idx; + A_device[idx] = idx; }); // End of the kernel function - myQueue.wait(); // Wait for the queue to finish executing on the device + // Copy the results back to the device from the host + myQueue.copy(A_device, A_host.data(), 1024); - // Reclaim memory on the host and the device - free(A_shared, myQueue); + myQueue.wait(); // Wait for the queue to finish executing on the device // Print result for (int i = 0; i < 1024; i++) - std::cout << "A_shared[" << i << "] = " << A_shared[i] << std::endl; + std::cout 
<< "A_shared[" << i << "] = " << A_host[i] << std::endl; + + // Reclaim memory on the host and the device + free(A_device, myQueue); + return 0; } From 8e9e7908221d6fb81f1dc8ccb740901825439c51 Mon Sep 17 00:00:00 2001 From: Michael Lance Date: Mon, 8 Jun 2026 10:21:11 -0500 Subject: [PATCH 3/7] update anatomy.cpp with better comments --- adoc/code/anatomy.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/adoc/code/anatomy.cpp b/adoc/code/anatomy.cpp index 51df5c243..bea288dde 100644 --- a/adoc/code/anatomy.cpp +++ b/adoc/code/anatomy.cpp @@ -1,14 +1,16 @@ #include #include +#include using namespace sycl; // (optional) avoids need for "sycl::" before SYCL names -std::vector A_host(1024); - int main() { - // Create a default queue to enqueue work to the default device + // Allocate host memory to store the results + std::vector A_host(1024); + + // Create an in order queue to enqueue work to the default device queue myQueue{property::queue::in_order()}; - // Allocate shared memory to be worked on on the default device + // Allocate device memory to be worked on int *A_device = malloc_device(1024, myQueue); // Enqueue a parallel_for task with 1024 work-items @@ -17,16 +19,16 @@ int main() { A_device[idx] = idx; }); // End of the kernel function - // Copy the results back to the device from the host + // Copy the results back to the host from the device myQueue.copy(A_device, A_host.data(), 1024); - myQueue.wait(); // Wait for the queue to finish executing on the device + myQueue.wait(); // Wait for the queue to finish executing all the tasks // Print result for (int i = 0; i < 1024; i++) - std::cout << "A_shared[" << i << "] = " << A_host[i] << std::endl; + std::cout << "A_host[" << i << "] = " << A_host[i] << std::endl; - // Reclaim memory on the host and the device + // Free device memory free(A_device, myQueue); return 0; From 6379a369b01c09d299af50c714daa632cfc70394 Mon Sep 17 00:00:00 2001 From: Michael Lance Date: Mon, 8 
Jun 2026 14:45:45 -0500 Subject: [PATCH 4/7] update surrounding documentation around anatomy.cpp --- adoc/chapters/architecture.adoc | 43 +++++++++++++++++++++------------ adoc/code/anatomy.cpp | 3 +++ 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/adoc/chapters/architecture.adoc b/adoc/chapters/architecture.adoc index cb45c8852..389eecaf3 100644 --- a/adoc/chapters/architecture.adoc +++ b/adoc/chapters/architecture.adoc @@ -81,10 +81,13 @@ The application is structured in three scopes which specify the different sections; <>, <> and <>. The <> specifies a single kernel function that will be, or has been, compiled by a <> and executed on a <>. -In this example <> is defined by lines 25 to 26. +In this example <> is defined by lines 18 to 19, the body of the +lambda passed to [code]#parallel_for#. The <> specifies a unit of work which is comprised of a -<> and <>. -In this example <> is defined by lines 20 to 28. +<> and any associated requirements. +In this example the kernel is launched using the [code]#parallel_for# queue +shortcut on lines 17 to 20, which implicitly defines the <> +around the kernel invocation. The <> specifies all other code outside of a <>. These three scopes are used to control the application flow and the construction @@ -114,27 +117,35 @@ The different member functions to execute kernels can be found in <>. A <> is the syntactic scope wrapped by the construction of -a <> as seen on line 19. -The <> may invoke only a single -<>, and it takes a parameter of type command group -[code]#handler#, which is constructed by the runtime. +a <>. +In this example the [code]#parallel_for# queue shortcut on lines 17 to 20 is +used to implicitly define a command group containing a single +<>. +Alternatively, [code]#myQueue.submit# may be called with an explicit +<>, which takes a parameter of type command group +[code]#handler# constructed by the runtime, and may invoke only a single +<>. 
All the requirements for a kernel to execute are defined in this <>, as described in <>. -In this case the constructor used for [code]#myQueue# on line 9 is the default -constructor, which allows the queue to select the best underlying device to +In this case [code]#myQueue# is constructed on line 11 with the +[code]#property::queue::in_order# property, which causes commands submitted to +the queue to execute in the order they are enqueued. +No <> is specified, so the queue selects the best underlying device to execute on, leaving the decision up to the runtime. In SYCL, data that is required within a <> must be contained within a <>, <>, or <> allocation, as described in <>. -We construct a buffer on line 16. -Access to the <> is controlled via an <> which is constructed -on line 21. -The <> is used to keep track of access to the data and the <> -is used to request access to the data on a queue, as well as to track the -dependencies between <>. -In this example the <> is used to write to the data buffer on line 26. +In this example we use a USM device allocation, created with +[code]#malloc_device# on line 14, which returns a pointer to memory accessible +on the <> associated with [code]#myQueue#. +The kernel writes through this pointer on line 19. +Because device allocations are not accessible from the host, the results are +explicitly copied back into a host [code]##std::vector## using +[code]#myQueue.copy# on line 23, and the call to [code]#myQueue.wait# on line 25 +ensures all enqueued operations have completed before the host reads the data. +Finally, the device allocation is released with [code]#free# on line 32. [[sec:platformmodel]] diff --git a/adoc/code/anatomy.cpp b/adoc/code/anatomy.cpp index bea288dde..f55af7e1e 100644 --- a/adoc/code/anatomy.cpp +++ b/adoc/code/anatomy.cpp @@ -1,3 +1,6 @@ +// Copyright (c) 2011-2026 The Khronos Group, Inc. 
+// SPDX-License-Identifier: Apache-2.0 + #include #include #include From 35f9a1161a900e257b822f6d1019ecac1ab8612a Mon Sep 17 00:00:00 2001 From: Michael Lance Date: Tue, 9 Jun 2026 10:25:15 -0500 Subject: [PATCH 5/7] use consistent naming convention --- adoc/code/anatomy.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/adoc/code/anatomy.cpp b/adoc/code/anatomy.cpp index f55af7e1e..815844ec0 100644 --- a/adoc/code/anatomy.cpp +++ b/adoc/code/anatomy.cpp @@ -8,31 +8,31 @@ using namespace sycl; // (optional) avoids need for "sycl::" before SYCL names int main() { // Allocate host memory to store the results - std::vector A_host(1024); + std::vector dataHost(1024); // Create an in order queue to enqueue work to the default device queue myQueue{property::queue::in_order()}; // Allocate device memory to be worked on - int *A_device = malloc_device(1024, myQueue); + int *dataDevice = malloc_device(1024, myQueue); // Enqueue a parallel_for task with 1024 work-items myQueue.parallel_for(1024, [=](id<1> idx) { // Initialize each buffer element with its own rank number starting at 0 - A_device[idx] = idx; + dataDevice[idx] = idx; }); // End of the kernel function // Copy the results back to the host from the device - myQueue.copy(A_device, A_host.data(), 1024); + myQueue.copy(dataDevice, dataHost.data(), 1024); myQueue.wait(); // Wait for the queue to finish executing all the tasks // Print result for (int i = 0; i < 1024; i++) - std::cout << "A_host[" << i << "] = " << A_host[i] << std::endl; + std::cout << "dataHost[" << i << "] = " << dataHost[i] << std::endl; // Free device memory - free(A_device, myQueue); + free(dataDevice, myQueue); return 0; } From 9e7cc627b0c9125d91ae3de698ba6b29ec14b3f3 Mon Sep 17 00:00:00 2001 From: Michael Lance Date: Tue, 9 Jun 2026 16:22:55 -0500 Subject: [PATCH 6/7] remove references to command group --- adoc/chapters/architecture.adoc | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff 
--git a/adoc/chapters/architecture.adoc b/adoc/chapters/architecture.adoc index 389eecaf3..d5ebf8633 100644 --- a/adoc/chapters/architecture.adoc +++ b/adoc/chapters/architecture.adoc @@ -116,15 +116,9 @@ executed over the given range from 0 to 1023. The different member functions to execute kernels can be found in <>. -A <> is the syntactic scope wrapped by the construction of -a <>. In this example the [code]#parallel_for# queue shortcut on lines 17 to 20 is used to implicitly define a command group containing a single -<>. -Alternatively, [code]#myQueue.submit# may be called with an explicit -<>, which takes a parameter of type command group -[code]#handler# constructed by the runtime, and may invoke only a single -<>. +<>. All queue shortcuts can be found in <>. All the requirements for a kernel to execute are defined in this <>, as described in <>. @@ -135,14 +129,14 @@ No <> is specified, so the queue selects the best underlying device to execute on, leaving the decision up to the runtime. In SYCL, data that is required within a <> must be -contained within a <>, <>, or <> allocation, as described in +contained within a <>, <>, or <> allocation, as described in <>. In this example we use a USM device allocation, created with [code]#malloc_device# on line 14, which returns a pointer to memory accessible on the <> associated with [code]#myQueue#. The kernel writes through this pointer on line 19. Because device allocations are not accessible from the host, the results are -explicitly copied back into a host [code]##std::vector## using +explicitly copied back to the host [code]##std::vector## using [code]#myQueue.copy# on line 23, and the call to [code]#myQueue.wait# on line 25 ensures all enqueued operations have completed before the host reads the data. Finally, the device allocation is released with [code]#free# on line 32. 
From 2c72751475dba517bff7decf579bc36a79b47289 Mon Sep 17 00:00:00 2001 From: Michael Lance Date: Wed, 10 Jun 2026 10:59:32 -0500 Subject: [PATCH 7/7] final changes --- adoc/chapters/architecture.adoc | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/adoc/chapters/architecture.adoc b/adoc/chapters/architecture.adoc index d5ebf8633..5edf4d98f 100644 --- a/adoc/chapters/architecture.adoc +++ b/adoc/chapters/architecture.adoc @@ -79,7 +79,7 @@ the SYCL features that will be used. A SYCL application runs on a <>. The application is structured in three scopes which specify the different sections; <>, <> and <>. -The <> specifies a single kernel function that will be, or has +The <> specifies a <> that will be, or has been, compiled by a <> and executed on a <>. In this example <> is defined by lines 18 to 19, the body of the lambda passed to [code]#parallel_for#. @@ -87,7 +87,8 @@ The <> specifies a unit of work which is comprised of a <> and any associated requirements. In this example the kernel is launched using the [code]#parallel_for# queue shortcut on lines 17 to 20, which implicitly defines the <> -around the kernel invocation. +around the <>. +All queue shortcuts can be found in <>. The <> specifies all other code outside of a <>. These three scopes are used to control the application flow and the construction @@ -115,21 +116,17 @@ In the case of [code]#parallel_for# the <> will be executed over the given range from 0 to 1023. The different member functions to execute kernels can be found in <>. - -In this example the [code]#parallel_for# queue shortcut on lines 17 to 20 is -used to implicitly define a command group containing a single -<>. All queue shortcuts can be found in <>. - All the requirements for a kernel to execute are defined in this <>, as described in <>. 
-In this case [code]#myQueue# is constructed on line 11 with the -[code]#property::queue::in_order# property, which causes commands submitted to -the queue to execute in the order they are enqueued. -No <> is specified, so the queue selects the best underlying device to -execute on, leaving the decision up to the runtime. +Additionally, in this example, [code]#myQueue# is constructed on line 11 with +the [code]#property::queue::in_order# property, which causes commands submitted +to the queue to execute in the order they were enqueued. +No <> is specified in the queue constructor, so the constructor selects +the best underlying device to execute on, leaving the decision up to the +runtime. In SYCL, data that is required within a <> must be -contained within a <>, <>, or <> allocation, as described in +contained within a <> allocation, <>, or <>, as described in <>. In this example we use a USM device allocation, created with [code]#malloc_device# on line 14, which returns a pointer to memory accessible