From 121816e19c9c7601e93987c0a7b99b6d51e4d372 Mon Sep 17 00:00:00 2001
From: Michael Lance <mblance@proton.me>
Date: Wed, 3 Jun 2026 11:56:14 -0500
Subject: [PATCH 1/7] update anatomy.cpp to use USM

---
 adoc/code/anatomy.cpp | 39 +++++++++++++--------------------------
 1 file changed, 13 insertions(+), 26 deletions(-)
diff --git a/adoc/code/anatomy.cpp b/adoc/code/anatomy.cpp
index 702c9523e..b0b76dfde 100644
--- a/adoc/code/anatomy.cpp
+++ b/adoc/code/anatomy.cpp
@@ -1,39 +1,26 @@
-// Copyright (c) 2011-2026 The Khronos Group, Inc.
-// SPDX-License-Identifier: Apache-2.0
-
 #include <iostream>
 #include <sycl/sycl.hpp>
-using namespace sycl;  // (optional) avoids need for "sycl::" before SYCL names
+using namespace sycl; // (optional) avoids need for "sycl::" before SYCL names
 
 int main() {
-  int data[1024];  // Allocate data to be worked on
-
   // Create a default queue to enqueue work to the default device
-  queue myQueue;
+  queue myQueue{};
+  // Allocate shared memory to be worked on on the default device
+  int *A_shared = malloc_shared<int>(1024, myQueue);
 
-  // By wrapping all the SYCL work in a {} block, we ensure
-  // all SYCL tasks must complete before exiting the block,
-  // because the destructor of resultBuf will wait
-  {
-    // Wrap our data variable in a buffer
-    buffer resultBuf{data, {1024}};
+  // Enqueue a parallel_for task with 1024 work-items
+  myQueue.parallel_for(1024, [=](id<1> idx) {
+    // Initialize each buffer element with its own rank number starting at 0
+    A_shared[idx] = idx;
+  }); // End of the kernel function
 
-    // Create a command group to issue commands to the queue
-    myQueue.submit([&](handler& cgh) {
-      // Request write access to the buffer without initialization
-      accessor writeResult{resultBuf, cgh, write_only, no_init};
+  myQueue.wait(); // Wait for the queue to finish executing on the device
 
-      // Enqueue a parallel_for task with 1024 work-items
-      cgh.parallel_for(1024, [=](id<1> idx) {
-        // Initialize each buffer element with its own rank number starting at 0
-        writeResult[idx] = idx;
-      });  // End of the kernel function
-    });    // End of our commands for this queue
-  }  // End of scope, so we wait for work producing resultBuf to complete
+  // Reclaim memory on the host and the device
+  free(A_shared, myQueue);
 
   // Print result
   for (int i = 0; i < 1024; i++)
-    std::cout << "data[" << i << "] = " << data[i] << std::endl;
-
+    std::cout << "A_shared[" << i << "] = " << A_shared[i] << std::endl;
   return 0;
 }

From 33012a725ad084a80e3c2dd0f814a2ab5063aaae Mon Sep 17 00:00:00 2001
From: Michael Lance <mblance@proton.me>
Date: Thu, 4 Jun 2026 15:25:53 -0500
Subject: [PATCH 2/7] Use in_order queue and malloc_device

---
 adoc/code/anatomy.cpp | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/adoc/code/anatomy.cpp b/adoc/code/anatomy.cpp
index b0b76dfde..51df5c243 100644
--- a/adoc/code/anatomy.cpp
+++ b/adoc/code/anatomy.cpp
@@ -2,25 +2,32 @@
 #include <sycl/sycl.hpp>
 using namespace sycl; // (optional) avoids need for "sycl::" before SYCL names
 
+std::vector<int> A_host(1024);
+
 int main() {
   // Create a default queue to enqueue work to the default device
-  queue myQueue{};
+  queue myQueue{property::queue::in_order()};
+
   // Allocate shared memory to be worked on on the default device
-  int *A_shared = malloc_shared<int>(1024, myQueue);
+  int *A_device = malloc_device<int>(1024, myQueue);
 
   // Enqueue a parallel_for task with 1024 work-items
   myQueue.parallel_for(1024, [=](id<1> idx) {
     // Initialize each buffer element with its own rank number starting at 0
-    A_shared[idx] = idx;
+    A_device[idx] = idx;
   }); // End of the kernel function
 
-  myQueue.wait(); // Wait for the queue to finish executing on the device
+  // Copy the results back to the device from the host
+  myQueue.copy(A_device, A_host.data(), 1024);
 
-  // Reclaim memory on the host and the device
-  free(A_shared, myQueue);
+  myQueue.wait(); // Wait for the queue to finish executing on the device
 
   // Print result
   for (int i = 0; i < 1024; i++)
-    std::cout << "A_shared[" << i << "] = " << A_shared[i] << std::endl;
+    std::cout << "A_shared[" << i << "] = " << A_host[i] << std::endl;
+
+  // Reclaim memory on the host and the device
+  free(A_device, myQueue);
+
   return 0;
 }

From 8e9e7908221d6fb81f1dc8ccb740901825439c51 Mon Sep 17 00:00:00 2001
From: Michael Lance <mlance4@uwyo.edu>
Date: Mon, 8 Jun 2026 10:21:11 -0500
Subject: [PATCH 3/7] update anatomy.cpp with better comments

---
 adoc/code/anatomy.cpp | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/adoc/code/anatomy.cpp b/adoc/code/anatomy.cpp
index 51df5c243..bea288dde 100644
--- a/adoc/code/anatomy.cpp
+++ b/adoc/code/anatomy.cpp
@@ -1,14 +1,16 @@
 #include <iostream>
 #include <sycl/sycl.hpp>
+#include <vector>
 using namespace sycl; // (optional) avoids need for "sycl::" before SYCL names
 
-std::vector<int> A_host(1024);
-
 int main() {
-  // Create a default queue to enqueue work to the default device
+  // Allocate host memory to store the results
+  std::vector<int> A_host(1024);
+
+  // Create an in order queue to enqueue work to the default device
   queue myQueue{property::queue::in_order()};
 
-  // Allocate shared memory to be worked on on the default device
+  // Allocate device memory to be worked on
   int *A_device = malloc_device<int>(1024, myQueue);
 
   // Enqueue a parallel_for task with 1024 work-items
@@ -17,16 +19,16 @@ int main() {
     A_device[idx] = idx;
   }); // End of the kernel function
 
-  // Copy the results back to the device from the host
+  // Copy the results back to the host from the device
   myQueue.copy(A_device, A_host.data(), 1024);
 
-  myQueue.wait(); // Wait for the queue to finish executing on the device
+  myQueue.wait(); // Wait for the queue to finish executing all the tasks
 
   // Print result
   for (int i = 0; i < 1024; i++)
-    std::cout << "A_shared[" << i << "] = " << A_host[i] << std::endl;
+    std::cout << "A_host[" << i << "] = " << A_host[i] << std::endl;
 
-  // Reclaim memory on the host and the device
+  // Free device memory
   free(A_device, myQueue);
 
   return 0;

From 6379a369b01c09d299af50c714daa632cfc70394 Mon Sep 17 00:00:00 2001
From: Michael Lance <mlance4@uwyo.edu>
Date: Mon, 8 Jun 2026 14:45:45 -0500
Subject: [PATCH 4/7] update surrounding documentation around anatomy.cpp

---
 adoc/chapters/architecture.adoc | 43 +++++++++++++++++++++------------
 adoc/code/anatomy.cpp           |  3 +++
 2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/adoc/chapters/architecture.adoc b/adoc/chapters/architecture.adoc
index cb45c8852..389eecaf3 100644
--- a/adoc/chapters/architecture.adoc
+++ b/adoc/chapters/architecture.adoc
@@ -81,10 +81,13 @@ The application is structured in three scopes which specify the different
 sections; <<application-scope>>, <<command-group-scope>> and <<kernel-scope>>.
 The <<kernel-scope>> specifies a single kernel function that will be, or has
 been, compiled by a <<device-compiler>> and executed on a <<device>>.
-In this example <<kernel-scope>> is defined by lines 25 to 26.
+In this example <<kernel-scope>> is defined by lines 18 to 19, the body of the
+lambda passed to [code]#parallel_for#.
 The <<command-group-scope>> specifies a unit of work which is comprised of a
-<<sycl-kernel-function>> and <<accessor,accessors>>.
-In this example <<command-group-scope>> is defined by lines 20 to 28.
+<<sycl-kernel-function>> and any associated requirements.
+In this example the kernel is launched using the [code]#parallel_for# queue
+shortcut on lines 17 to 20, which implicitly defines the <<command-group-scope>>
+around the kernel invocation.
 The <<application-scope>> specifies all other code outside of a
 <<command-group-scope>>.
 These three scopes are used to control the application flow and the construction
@@ -114,27 +117,35 @@ The different member functions to execute kernels can be found in
 <<subsec:invokingkernels>>.
 
 A <<command-group-scope>> is the syntactic scope wrapped by the construction of
-a <<command-group-function-object>> as seen on line 19.
-The <<command-group-function-object>> may invoke only a single
-<<sycl-kernel-function>>, and it takes a parameter of type command group
-[code]#handler#, which is constructed by the runtime.
+a <<command-group-function-object>>.
+In this example the [code]#parallel_for# queue shortcut on lines 17 to 20 is
+used to implicitly define a command group containing a single
+<<sycl-kernel-function>>.
+Alternatively, [code]#myQueue.submit# may be called with an explicit
+<<command-group-function-object>>, which takes a parameter of type command group
+[code]#handler# constructed by the runtime, and may invoke only a single
+<<sycl-kernel-function>>.
 
 All the requirements for a kernel to execute are defined in this
 <<command-group-scope>>, as described in <<sec:executionmodel>>.
-In this case the constructor used for [code]#myQueue# on line 9 is the default
-constructor, which allows the queue to select the best underlying device to
+In this case [code]#myQueue# is constructed on line 11 with the
+[code]#property::queue::in_order# property, which causes commands submitted to
+the queue to execute in the order they are enqueued.
+No <<device>> is specified, so the queue selects the best underlying device to
 execute on, leaving the decision up to the runtime.
 
 In SYCL, data that is required within a <<sycl-kernel-function>> must be
 contained within a <<buffer>>, <<image>>, or <<usm>> allocation, as described in
 <<sec:memory.model>>.
-We construct a buffer on line 16.
-Access to the <<buffer>> is controlled via an <<accessor>> which is constructed
-on line 21.
-The <<buffer>> is used to keep track of access to the data and the <<accessor>>
-is used to request access to the data on a queue, as well as to track the
-dependencies between <<sycl-kernel-function>>.
-In this example the <<accessor>> is used to write to the data buffer on line 26.
+In this example we use a USM device allocation, created with
+[code]#malloc_device# on line 14, which returns a pointer to memory accessible
+on the <<device>> associated with [code]#myQueue#.
+The kernel writes through this pointer on line 19.
+Because device allocations are not accessible from the host, the results are
+explicitly copied back into a host [code]##std::vector## using
+[code]#myQueue.copy# on line 23, and the call to [code]#myQueue.wait# on line 25
+ensures all enqueued operations have completed before the host reads the data.
+Finally, the device allocation is released with [code]#free# on line 32.
 
 
 [[sec:platformmodel]]
diff --git a/adoc/code/anatomy.cpp b/adoc/code/anatomy.cpp
index bea288dde..f55af7e1e 100644
--- a/adoc/code/anatomy.cpp
+++ b/adoc/code/anatomy.cpp
@@ -1,3 +1,6 @@
+// Copyright (c) 2011-2026 The Khronos Group, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
 #include <iostream>
 #include <sycl/sycl.hpp>
 #include <vector>

From 35f9a1161a900e257b822f6d1019ecac1ab8612a Mon Sep 17 00:00:00 2001
From: Michael Lance <mlance4@uwyo.edu>
Date: Tue, 9 Jun 2026 10:25:15 -0500
Subject: [PATCH 5/7] use consistent naming convention

---
 adoc/code/anatomy.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/adoc/code/anatomy.cpp b/adoc/code/anatomy.cpp
index f55af7e1e..815844ec0 100644
--- a/adoc/code/anatomy.cpp
+++ b/adoc/code/anatomy.cpp
@@ -8,31 +8,31 @@ using namespace sycl; // (optional) avoids need for "sycl::" before SYCL names
 
 int main() {
   // Allocate host memory to store the results
-  std::vector<int> A_host(1024);
+  std::vector<int> dataHost(1024);
 
   // Create an in order queue to enqueue work to the default device
   queue myQueue{property::queue::in_order()};
 
   // Allocate device memory to be worked on
-  int *A_device = malloc_device<int>(1024, myQueue);
+  int *dataDevice = malloc_device<int>(1024, myQueue);
 
   // Enqueue a parallel_for task with 1024 work-items
   myQueue.parallel_for(1024, [=](id<1> idx) {
     // Initialize each buffer element with its own rank number starting at 0
-    A_device[idx] = idx;
+    dataDevice[idx] = idx;
   }); // End of the kernel function
 
   // Copy the results back to the host from the device
-  myQueue.copy(A_device, A_host.data(), 1024);
+  myQueue.copy(dataDevice, dataHost.data(), 1024);
 
   myQueue.wait(); // Wait for the queue to finish executing all the tasks
 
   // Print result
   for (int i = 0; i < 1024; i++)
-    std::cout << "A_host[" << i << "] = " << A_host[i] << std::endl;
+    std::cout << "dataHost[" << i << "] = " << dataHost[i] << std::endl;
 
   // Free device memory
-  free(A_device, myQueue);
+  free(dataDevice, myQueue);
 
   return 0;
 }

From 9e7cc627b0c9125d91ae3de698ba6b29ec14b3f3 Mon Sep 17 00:00:00 2001
From: Michael Lance <mlance4@uwyo.edu>
Date: Tue, 9 Jun 2026 16:22:55 -0500
Subject: [PATCH 6/7] remove references to command group

---
 adoc/chapters/architecture.adoc | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/adoc/chapters/architecture.adoc b/adoc/chapters/architecture.adoc
index 389eecaf3..d5ebf8633 100644
--- a/adoc/chapters/architecture.adoc
+++ b/adoc/chapters/architecture.adoc
@@ -116,15 +116,9 @@ executed over the given range from 0 to 1023.
 The different member functions to execute kernels can be found in
 <<subsec:invokingkernels>>.
 
-A <<command-group-scope>> is the syntactic scope wrapped by the construction of
-a <<command-group-function-object>>.
 In this example the [code]#parallel_for# queue shortcut on lines 17 to 20 is
 used to implicitly define a command group containing a single
-<<sycl-kernel-function>>.
-Alternatively, [code]#myQueue.submit# may be called with an explicit
-<<command-group-function-object>>, which takes a parameter of type command group
-[code]#handler# constructed by the runtime, and may invoke only a single
-<<sycl-kernel-function>>.
+<<sycl-kernel-function>>. All queue shortcuts can be found in <<sec:queue-class>>.
 
 All the requirements for a kernel to execute are defined in this
 <<command-group-scope>>, as described in <<sec:executionmodel>>.
@@ -135,14 +129,14 @@ No <<device>> is specified, so the queue selects the best underlying device to
 execute on, leaving the decision up to the runtime.
 
 In SYCL, data that is required within a <<sycl-kernel-function>> must be
-contained within a <<buffer>>, <<image>>, or <<usm>> allocation, as described in
+contained within a <<usm>>, <<image>>, or <<buffer>> allocation, as described in
 <<sec:memory.model>>.
 In this example we use a USM device allocation, created with
 [code]#malloc_device# on line 14, which returns a pointer to memory accessible
 on the <<device>> associated with [code]#myQueue#.
 The kernel writes through this pointer on line 19.
 Because device allocations are not accessible from the host, the results are
-explicitly copied back into a host [code]##std::vector## using
+explicitly copied back to the host [code]##std::vector## using
 [code]#myQueue.copy# on line 23, and the call to [code]#myQueue.wait# on line 25
 ensures all enqueued operations have completed before the host reads the data.
 Finally, the device allocation is released with [code]#free# on line 32.

From 2c72751475dba517bff7decf579bc36a79b47289 Mon Sep 17 00:00:00 2001
From: Michael Lance <mlance4@uwyo.edu>
Date: Wed, 10 Jun 2026 10:59:32 -0500
Subject: [PATCH 7/7] refine wording of the anatomy example in architecture.adoc

---
 adoc/chapters/architecture.adoc | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/adoc/chapters/architecture.adoc b/adoc/chapters/architecture.adoc
index d5ebf8633..5edf4d98f 100644
--- a/adoc/chapters/architecture.adoc
+++ b/adoc/chapters/architecture.adoc
@@ -79,7 +79,7 @@ the SYCL features that will be used.
 A SYCL application runs on a <<sec:platformmodel, SYCL Platform>>.
 The application is structured in three scopes which specify the different
 sections; <<application-scope>>, <<command-group-scope>> and <<kernel-scope>>.
-The <<kernel-scope>> specifies a single kernel function that will be, or has
+The <<kernel-scope>> specifies a <<sycl-kernel-function>> that will be, or has
 been, compiled by a <<device-compiler>> and executed on a <<device>>.
 In this example <<kernel-scope>> is defined by lines 18 to 19, the body of the
 lambda passed to [code]#parallel_for#.
@@ -87,7 +87,8 @@ The <<command-group-scope>> specifies a unit of work which is comprised of a
 <<sycl-kernel-function>> and any associated requirements.
 In this example the kernel is launched using the [code]#parallel_for# queue
 shortcut on lines 17 to 20, which implicitly defines the <<command-group-scope>>
-around the kernel invocation.
+around the <<kernel-scope>>.
+All queue shortcuts can be found in <<sec:queue-class>>.
 The <<application-scope>> specifies all other code outside of a
 <<command-group-scope>>.
 These three scopes are used to control the application flow and the construction
@@ -115,21 +116,17 @@ In the case of [code]#parallel_for# the <<sycl-kernel-function>> will be
 executed over the given range from 0 to 1023.
 The different member functions to execute kernels can be found in
 <<subsec:invokingkernels>>.
-
-In this example the [code]#parallel_for# queue shortcut on lines 17 to 20 is
-used to implicitly define a command group containing a single
-<<sycl-kernel-function>>. All queue shortcuts can be found in <<sec:queue-class>>.
-
 All the requirements for a kernel to execute are defined in this
 <<command-group-scope>>, as described in <<sec:executionmodel>>.
-In this case [code]#myQueue# is constructed on line 11 with the
-[code]#property::queue::in_order# property, which causes commands submitted to
-the queue to execute in the order they are enqueued.
-No <<device>> is specified, so the queue selects the best underlying device to
-execute on, leaving the decision up to the runtime.
+Additionally, in this example, [code]#myQueue# is constructed on line 11 with
+the [code]#property::queue::in_order# property, which causes commands submitted
+to the queue to execute in the order they were enqueued.
+No <<device>> is specified in the queue constructor, so the constructor selects
+the best underlying device to execute on, leaving the decision up to the
+runtime.
 
 In SYCL, data that is required within a <<sycl-kernel-function>> must be
-contained within a <<usm>>, <<image>>, or <<buffer>> allocation, as described in
+contained within a <<usm>> allocation, <<image>>, or <<buffer>>, as described in
 <<sec:memory.model>>.
 In this example we use a USM device allocation, created with
 [code]#malloc_device# on line 14, which returns a pointer to memory accessible