From a20f6babc8ac9d1c024e00dceeecae13eb6dff6e Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Sun, 20 Apr 2025 20:40:35 +0530
Subject: [PATCH 01/36] Update darray.jl

---
 src/array/darray.jl | 207 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 201 insertions(+), 6 deletions(-)

diff --git a/src/array/darray.jl b/src/array/darray.jl
index 37c61a936..187d6b2fd 100644
--- a/src/array/darray.jl
+++ b/src/array/darray.jl
@@ -47,10 +47,6 @@ function project(a::ArrayDomain, b::ArrayDomain)
     end |> ArrayDomain
 end
 
-function getindex(a::ArrayDomain, b::ArrayDomain)
-    ArrayDomain(map(getindex, indexes(a), indexes(b)))
-end
-
 """
     alignfirst(a) -> ArrayDomain
 
@@ -148,8 +144,7 @@ const DMatrix{T} = DArray{T,2}
 const DVector{T} = DArray{T,1}
 
 # mainly for backwards-compatibility
-DArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat) where {T,N} =
-    DArray(T, domain, subdomains, chunks, partitioning, concat)
+DArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat) where {T,N} = DArray(T, domain, subdomains, chunks, partitioning, concat)
 
 function DArray(T, domain::DArrayDomain{N},
                 subdomains::AbstractArray{DArrayDomain{N}, N},
@@ -207,6 +202,205 @@ Base.wait(A::DArray) = foreach(wait, A.chunks)
     end
 else
 =#
+
+###
+
+mutable struct DBCArray{T,N,B,F} <: ArrayOp{T, N}
+    darray::DArray{T,N,B,F}
+    pdomain::AbstractArray{Dagger.Processor, N}
+    # function DArray{T,N,B,F}(domain, subdomains, chunks, partitioning::B, concat::Function) where {T,N,B,F}
+    #     new{T,N,B,F}(domain, subdomains, chunks, partitioning, concat)
+    # end
+end
+
+const DBCMatrix{T} = DBCArray{T,2}
+const DBCVector{T} = DBCArray{T,1}
+
+# DBCArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat, pdomain) where {T,N} = DBCArray(T, domain, subdomains, chunks, partitioning, concat)
+
+# function DArray(T, domain::DArrayDomain{N},
+#     subdomains::AbstractArray{DArrayDomain{N}, N},
+#     chunks::AbstractArray{<:Any, N}, partitioning::B, concat=cat) where {N,B<:AbstractBlocks{N}}
+# DArray{T,N,B,typeof(concat)}(domain, subdomains, chunks, partitioning, concat)
+# end
+
+# function DArray(T, domain::DArrayDomain{N},
+    #     subdomains::DArrayDomain{N},
+    #     chunks::Any, partitioning::B, concat=cat) where {N,B<:AbstractSingleBlocks{N}}
+    # _subdomains = Array{DArrayDomain{N}, N}(undef, ntuple(i->1, N)...)
+    # _subdomains[1] = subdomains
+    # _chunks = Array{Any, N}(undef, ntuple(i->1, N)...)
+    # _chunks[1] = chunks
+    # DArray{T,N,B,typeof(concat)}(domain, _subdomains, _chunks, partitioning, concat)
+# end
+
+domain(d::DBCArray) = domain(d.darray)
+chunks(d::DBCArray) = chunks(d.darray)
+domainchunks(d::DBCArray) = domainchunks(d.darray)
+size(x::DBCArray) = size(domain(x))
+stage(ctx, c::DBCArray) = stage(ctx, c.darray)
+
+function DBCArray(A::DArray{T,N,B,F}, pdomain::AbstractArray{Dagger.Processor, N}) where {T,N,B,F}
+    
+    all_procs = collect(Iterators.flatten(Dagger.get_processors(OSProc(w)) for w in procs()))
+    missing = filter(p -> p ∉ all_procs, pdomain)
+    isempty(missing) || error("Missing processors: $missing")
+    
+    Ac = fetch(A.chunks)
+    Ac_copy = similar(A.chunks)
+
+    for idx in CartesianIndices(A.chunks)
+        proc = pdomain[mod1.(Tuple(idx), size(pdomain))...]
+        Ac_copy[idx] = Dagger.@spawn scope=Dagger.ExactScope(proc) identity(Ac[idx])
+        # new_chunks[idx] = Dagger.@spawn scope=Dagger.ExactScope(proc) Dagger.tochunk(old_chunks[idx], proc)
+    end
+    
+    # Construct new DArray with updated chunks
+    A_copy = DArray{T,N,B,F}(A.domain, A.subdomains, Ac_copy, A.partitioning, A.concat)
+
+    return DBCArray{T,N,B,F}(A_copy, pdomain)
+end
+
+function Base.collect(d::DBCArray; tree=false)
+    return collect(d.darray; tree=tree)
+end
+
+Base.wait(A::DBCArray) = wait(A.darray.chunks)
+
+function Base.show(io::IO, ::MIME"text/plain", A::DBCArray{T,N,B,F}) where {T,N,B,F}
+    nparts = N > 0 ? size(A.darray.chunks) : 1
+    partsize = N > 0 ? A.darray.partitioning.blocksize : 1
+    nprocs = N > 0 ? size(A.pdomain) : 1
+    write(io, " with $(join(nparts, 'x')) partitions of size $(join(partsize, 'x')) distributed to $(join(nprocs, 'x')) processors:")
+    pct_complete = 100 * (sum(c->c isa Chunk ? true : isready(c), A.darray.chunks) / length(A.darray.chunks))
+    if pct_complete < 100
+        println(io)
+        printstyled(io, "~$(round(Int, pct_complete))% completed"; color=:yellow)
+    end
+    println(io)
+    Base.print_array(IOContext(io, :compact=>true), ColorArray(A.darray))
+end
+
+# function (==)(x::ArrayOp, y::ArrayOp)
+#     x === y || reduce((a,b)->a&&b, map(==, x, y))
+# end
+
+# function Base.hash(x::ArrayOp, i::UInt)
+#     7*objectid(x)-2
+# end
+
+# function Base.isequal(x::ArrayOp, y::ArrayOp)
+#     x === y
+# end
+
+Base.copy(x::DBCArray{T,N,B,F}) where {T,N,B,F} =  DBCArray{T,N,B,F}(x.darray, x.pdomain)
+
+Base.:(/)(x::DBCArray{T,N,B,F}, y::U) where {T<:Real, U<:Real, N, B, F} = DBCArray(x.darray / y, x.pdomain)
+
+# Base.fetch(c::DBCArray{T,N,B,F}) where {T,N,B,F} = c
+
+auto_blocks(A::DBCArray{T,N,B,F}) where {T,N,B,F} = auto_blocks(size(A))
+
+# distribute(A::AbstractArray) = distribute(A, AutoBlocks())
+# distribute(A::AbstractArray{T,N}, dist::Blocks{N}) where {T,N} =
+#     _to_darray(Distribute(dist, A))
+# distribute(A::AbstractArray, ::AutoBlocks) = distribute(A, auto_blocks(A))
+# function distribute(x::AbstractArray{T,N}, n::NTuple{N}) where {T,N}
+#     p = map((d, dn)->ceil(Int, d / dn), size(x), n)
+#     distribute(x, Blocks(p))
+# end
+# distribute(x::AbstractVector, n::Int) = distribute(x, (n,))
+# distribute(x::AbstractVector, n::Vector{<:Integer}) =
+#     distribute(x, DomainBlocks((1,), (cumsum(n),)))
+
+# DVector(A::AbstractVector{T}, part::Blocks{1}) where T = distribute(A, part)
+# DMatrix(A::AbstractMatrix{T}, part::Blocks{2}) where T = distribute(A, part)
+# DArray(A::AbstractArray{T,N}, part::Blocks{N}) where {T,N} = distribute(A, part)
+
+# DVector(A::AbstractVector{T}) where T = DVector(A, AutoBlocks())
+# DMatrix(A::AbstractMatrix{T}) where T = DMatrix(A, AutoBlocks())
+# DArray(A::AbstractArray) = DArray(A, AutoBlocks())
+
+# DVector(A::AbstractVector{T}, ::AutoBlocks) where T = DVector(A, auto_blocks(A))
+# DMatrix(A::AbstractMatrix{T}, ::AutoBlocks) where T = DMatrix(A, auto_blocks(A))
+# DArray(A::AbstractArray, ::AutoBlocks) = DArray(A, auto_blocks(A))
+
+# function Base.:(==)(x::ArrayOp{T,N}, y::AbstractArray{S,N}) where {T,S,N}
+#     collect(x) == y
+# end
+
+# function Base.:(==)(x::AbstractArray{T,N}, y::ArrayOp{S,N}) where {T,S,N}
+#     return collect(x) == y
+# end
+
+function logs_annotate!(ctx::Context, A::DBCArray, name::Union{String,Symbol})
+    for (idx, chunk) in enumerate(A.darray.chunks)
+        sd = A.subdomains[idx]
+        Dagger.logs_annotate!(ctx, chunk, name*'['*join(sd.indexes, ',')*']')
+    end
+end
+
+# function mapchunks(f, d::DArray{T,N,F}) where {T,N,F}
+#     chunks = map(d.chunks) do chunk
+#         owner = get_parent(chunk.processor).pid
+#         remotecall_fetch(mapchunk, owner, f, chunk)
+#     end
+#     DArray{T,N,F}(d.domain, d.subdomains, chunks, d.concat)
+# end
+
+
+const WrappedDBCArray{T,N} = Union{<:DBCArray{T,N}, Transpose{<:DBCArray{T,N}}, Adjoint{<:DBCArray{T,N}}}
+const WrappedDBCMatrix{T} = WrappedDBCArray{T,2}
+const WrappedDBCVector{T} = WrappedDBCArray{T,1}
+
+function copydiag(f, A::DBCArray{T, 2}) where T
+    Ac = A.darray.chunks
+    Ac_copy = Matrix{Any}(undef, size(Ac, 2), size(Ac, 1))
+    _copytile(f, Ac) = copy(f(Ac))
+    for idx in CartesianIndices(Ac)
+        proc = A.pdomain[mod1.(Tuple(idx), size(A.pdomain))...]
+        Ac_copy[idx'] = Dagger.@spawn scope=Dagger.ExactScope(proc) _copytile(f, Ac[idx])
+    end
+    Ad_copy = DArray{T,N,B,F}(ArrayDomain(1:size(A,2), 1:size(A,1)), A.darray.subdomains', Ac_copy, A.darray.partitioning, A.darray.concat)
+    return DBCArray(Ad_copy, A.pdomain)
+end
+
+Base.fetch(A::Adjoint{T, <:DBCArray{T, 2}}) where T = copydiag(Adjoint, parent(A))
+Base.fetch(A::Transpose{T, <:DBCArray{T, 2}}) where T = copydiag(Transpose, parent(A))
+Base.copy(A::Adjoint{T, <:DBCArray{T, 2}}) where T = fetch(A)
+Base.copy(A::Transpose{T, <:DBCArray{T, 2}}) where T = fetch(A)
+Base.collect(A::Adjoint{T, <:DBCArray{T, 2}}) where T = collect(copy(A))
+Base.collect(A::Transpose{T, <:DBCArray{T, 2}}) where T = collect(copy(A))
+
+(*)(a::DBCArray, b::Vector) = DBCArray((a.darray)*b, a.pdomain)
+
+# Base.power_by_squaring(x::DBCArray{T,N,B,F}, i::Int) where {T,N,B,F} = foldl(*, ntuple(_ -> x, i))
+
+
+Base.getindex(A::DBCArray{T,N}, idx::NTuple{N,Int}) where {T,N} = getindex(A.darray, idx)
+
+Base.getindex(A::DBCArray, idx::Integer...) = getindex(A.darray, idx)
+Base.getindex(A::DBCArray, idx::Integer) = getindex(A.darray, idx)
+Base.getindex(A::DBCArray, idx::CartesianIndex) = getindex(A.darray, idx)
+Base.getindex(A::DBCArray{T,N}, idxs::Dims{S}) where {T,N,S} = getindex(A.darray, idxs)
+
+Base.setindex!(A::DBCArray{T,N}, value, idx::NTuple{N,Int}) where {T,N} = setindex!(A.darray, value, idx)
+Base.setindex!(A::DBCArray, value, idx::Integer...) = setindex!(A.darray, value, idx)
+Base.setindex!(A::DArray, value, idx::Integer) = setindex!(A.darray, value,idx)
+Base.setindex!(A::DArray, value, idx::CartesianIndex) = setindex!(A.darray, value, idx)
+Base.setindex!(A::DBCArray{T,N}, value, idxs::Dims{S}) where {T,N,S} = setindex!(A.darray, value, idxs)
+
+
+# function Base.zero(x::DBCArray{T,N}) where {T,N}
+#     dims = ntuple(i->x.domain.indexes[i].stop, N)
+#     sd = first(x.subdomains)
+#     part_size = ntuple(i->sd.indexes[i].stop, N)
+#     a = zeros(Blocks(part_size...), T, dims)
+#     return _to_darray(a)
+# end
+
+###
+
     struct ColorElement{T}
         color::Symbol
         value::Union{Some{T},Nothing}
@@ -479,6 +673,7 @@ function stage(ctx::Context, d::Distribute)
                   d.partitioning)
 end
 
+
 """
     AutoBlocks
 

From f009ef3b501f9dd0b14210f1975786ac10507476 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Sun, 20 Apr 2025 23:32:11 +0530
Subject: [PATCH 02/36] Update darray.jl

---
 src/array/darray.jl | 207 ++------------------------------------------
 1 file changed, 6 insertions(+), 201 deletions(-)

diff --git a/src/array/darray.jl b/src/array/darray.jl
index 187d6b2fd..37c61a936 100644
--- a/src/array/darray.jl
+++ b/src/array/darray.jl
@@ -47,6 +47,10 @@ function project(a::ArrayDomain, b::ArrayDomain)
     end |> ArrayDomain
 end
 
+function getindex(a::ArrayDomain, b::ArrayDomain)
+    ArrayDomain(map(getindex, indexes(a), indexes(b)))
+end
+
 """
     alignfirst(a) -> ArrayDomain
 
@@ -144,7 +148,8 @@ const DMatrix{T} = DArray{T,2}
 const DVector{T} = DArray{T,1}
 
 # mainly for backwards-compatibility
-DArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat) where {T,N} = DArray(T, domain, subdomains, chunks, partitioning, concat)
+DArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat) where {T,N} =
+    DArray(T, domain, subdomains, chunks, partitioning, concat)
 
 function DArray(T, domain::DArrayDomain{N},
                 subdomains::AbstractArray{DArrayDomain{N}, N},
@@ -202,205 +207,6 @@ Base.wait(A::DArray) = foreach(wait, A.chunks)
     end
 else
 =#
-
-###
-
-mutable struct DBCArray{T,N,B,F} <: ArrayOp{T, N}
-    darray::DArray{T,N,B,F}
-    pdomain::AbstractArray{Dagger.Processor, N}
-    # function DArray{T,N,B,F}(domain, subdomains, chunks, partitioning::B, concat::Function) where {T,N,B,F}
-    #     new{T,N,B,F}(domain, subdomains, chunks, partitioning, concat)
-    # end
-end
-
-const DBCMatrix{T} = DBCArray{T,2}
-const DBCVector{T} = DBCArray{T,1}
-
-# DBCArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat, pdomain) where {T,N} = DBCArray(T, domain, subdomains, chunks, partitioning, concat)
-
-# function DArray(T, domain::DArrayDomain{N},
-#     subdomains::AbstractArray{DArrayDomain{N}, N},
-#     chunks::AbstractArray{<:Any, N}, partitioning::B, concat=cat) where {N,B<:AbstractBlocks{N}}
-# DArray{T,N,B,typeof(concat)}(domain, subdomains, chunks, partitioning, concat)
-# end
-
-# function DArray(T, domain::DArrayDomain{N},
-    #     subdomains::DArrayDomain{N},
-    #     chunks::Any, partitioning::B, concat=cat) where {N,B<:AbstractSingleBlocks{N}}
-    # _subdomains = Array{DArrayDomain{N}, N}(undef, ntuple(i->1, N)...)
-    # _subdomains[1] = subdomains
-    # _chunks = Array{Any, N}(undef, ntuple(i->1, N)...)
-    # _chunks[1] = chunks
-    # DArray{T,N,B,typeof(concat)}(domain, _subdomains, _chunks, partitioning, concat)
-# end
-
-domain(d::DBCArray) = domain(d.darray)
-chunks(d::DBCArray) = chunks(d.darray)
-domainchunks(d::DBCArray) = domainchunks(d.darray)
-size(x::DBCArray) = size(domain(x))
-stage(ctx, c::DBCArray) = stage(ctx, c.darray)
-
-function DBCArray(A::DArray{T,N,B,F}, pdomain::AbstractArray{Dagger.Processor, N}) where {T,N,B,F}
-    
-    all_procs = collect(Iterators.flatten(Dagger.get_processors(OSProc(w)) for w in procs()))
-    missing = filter(p -> p ∉ all_procs, pdomain)
-    isempty(missing) || error("Missing processors: $missing")
-    
-    Ac = fetch(A.chunks)
-    Ac_copy = similar(A.chunks)
-
-    for idx in CartesianIndices(A.chunks)
-        proc = pdomain[mod1.(Tuple(idx), size(pdomain))...]
-        Ac_copy[idx] = Dagger.@spawn scope=Dagger.ExactScope(proc) identity(Ac[idx])
-        # new_chunks[idx] = Dagger.@spawn scope=Dagger.ExactScope(proc) Dagger.tochunk(old_chunks[idx], proc)
-    end
-    
-    # Construct new DArray with updated chunks
-    A_copy = DArray{T,N,B,F}(A.domain, A.subdomains, Ac_copy, A.partitioning, A.concat)
-
-    return DBCArray{T,N,B,F}(A_copy, pdomain)
-end
-
-function Base.collect(d::DBCArray; tree=false)
-    return collect(d.darray; tree=tree)
-end
-
-Base.wait(A::DBCArray) = wait(A.darray.chunks)
-
-function Base.show(io::IO, ::MIME"text/plain", A::DBCArray{T,N,B,F}) where {T,N,B,F}
-    nparts = N > 0 ? size(A.darray.chunks) : 1
-    partsize = N > 0 ? A.darray.partitioning.blocksize : 1
-    nprocs = N > 0 ? size(A.pdomain) : 1
-    write(io, " with $(join(nparts, 'x')) partitions of size $(join(partsize, 'x')) distributed to $(join(nprocs, 'x')) processors:")
-    pct_complete = 100 * (sum(c->c isa Chunk ? true : isready(c), A.darray.chunks) / length(A.darray.chunks))
-    if pct_complete < 100
-        println(io)
-        printstyled(io, "~$(round(Int, pct_complete))% completed"; color=:yellow)
-    end
-    println(io)
-    Base.print_array(IOContext(io, :compact=>true), ColorArray(A.darray))
-end
-
-# function (==)(x::ArrayOp, y::ArrayOp)
-#     x === y || reduce((a,b)->a&&b, map(==, x, y))
-# end
-
-# function Base.hash(x::ArrayOp, i::UInt)
-#     7*objectid(x)-2
-# end
-
-# function Base.isequal(x::ArrayOp, y::ArrayOp)
-#     x === y
-# end
-
-Base.copy(x::DBCArray{T,N,B,F}) where {T,N,B,F} =  DBCArray{T,N,B,F}(x.darray, x.pdomain)
-
-Base.:(/)(x::DBCArray{T,N,B,F}, y::U) where {T<:Real, U<:Real, N, B, F} = DBCArray(x.darray / y, x.pdomain)
-
-# Base.fetch(c::DBCArray{T,N,B,F}) where {T,N,B,F} = c
-
-auto_blocks(A::DBCArray{T,N,B,F}) where {T,N,B,F} = auto_blocks(size(A))
-
-# distribute(A::AbstractArray) = distribute(A, AutoBlocks())
-# distribute(A::AbstractArray{T,N}, dist::Blocks{N}) where {T,N} =
-#     _to_darray(Distribute(dist, A))
-# distribute(A::AbstractArray, ::AutoBlocks) = distribute(A, auto_blocks(A))
-# function distribute(x::AbstractArray{T,N}, n::NTuple{N}) where {T,N}
-#     p = map((d, dn)->ceil(Int, d / dn), size(x), n)
-#     distribute(x, Blocks(p))
-# end
-# distribute(x::AbstractVector, n::Int) = distribute(x, (n,))
-# distribute(x::AbstractVector, n::Vector{<:Integer}) =
-#     distribute(x, DomainBlocks((1,), (cumsum(n),)))
-
-# DVector(A::AbstractVector{T}, part::Blocks{1}) where T = distribute(A, part)
-# DMatrix(A::AbstractMatrix{T}, part::Blocks{2}) where T = distribute(A, part)
-# DArray(A::AbstractArray{T,N}, part::Blocks{N}) where {T,N} = distribute(A, part)
-
-# DVector(A::AbstractVector{T}) where T = DVector(A, AutoBlocks())
-# DMatrix(A::AbstractMatrix{T}) where T = DMatrix(A, AutoBlocks())
-# DArray(A::AbstractArray) = DArray(A, AutoBlocks())
-
-# DVector(A::AbstractVector{T}, ::AutoBlocks) where T = DVector(A, auto_blocks(A))
-# DMatrix(A::AbstractMatrix{T}, ::AutoBlocks) where T = DMatrix(A, auto_blocks(A))
-# DArray(A::AbstractArray, ::AutoBlocks) = DArray(A, auto_blocks(A))
-
-# function Base.:(==)(x::ArrayOp{T,N}, y::AbstractArray{S,N}) where {T,S,N}
-#     collect(x) == y
-# end
-
-# function Base.:(==)(x::AbstractArray{T,N}, y::ArrayOp{S,N}) where {T,S,N}
-#     return collect(x) == y
-# end
-
-function logs_annotate!(ctx::Context, A::DBCArray, name::Union{String,Symbol})
-    for (idx, chunk) in enumerate(A.darray.chunks)
-        sd = A.subdomains[idx]
-        Dagger.logs_annotate!(ctx, chunk, name*'['*join(sd.indexes, ',')*']')
-    end
-end
-
-# function mapchunks(f, d::DArray{T,N,F}) where {T,N,F}
-#     chunks = map(d.chunks) do chunk
-#         owner = get_parent(chunk.processor).pid
-#         remotecall_fetch(mapchunk, owner, f, chunk)
-#     end
-#     DArray{T,N,F}(d.domain, d.subdomains, chunks, d.concat)
-# end
-
-
-const WrappedDBCArray{T,N} = Union{<:DBCArray{T,N}, Transpose{<:DBCArray{T,N}}, Adjoint{<:DBCArray{T,N}}}
-const WrappedDBCMatrix{T} = WrappedDBCArray{T,2}
-const WrappedDBCVector{T} = WrappedDBCArray{T,1}
-
-function copydiag(f, A::DBCArray{T, 2}) where T
-    Ac = A.darray.chunks
-    Ac_copy = Matrix{Any}(undef, size(Ac, 2), size(Ac, 1))
-    _copytile(f, Ac) = copy(f(Ac))
-    for idx in CartesianIndices(Ac)
-        proc = A.pdomain[mod1.(Tuple(idx), size(A.pdomain))...]
-        Ac_copy[idx'] = Dagger.@spawn scope=Dagger.ExactScope(proc) _copytile(f, Ac[idx])
-    end
-    Ad_copy = DArray{T,N,B,F}(ArrayDomain(1:size(A,2), 1:size(A,1)), A.darray.subdomains', Ac_copy, A.darray.partitioning, A.darray.concat)
-    return DBCArray(Ad_copy, A.pdomain)
-end
-
-Base.fetch(A::Adjoint{T, <:DBCArray{T, 2}}) where T = copydiag(Adjoint, parent(A))
-Base.fetch(A::Transpose{T, <:DBCArray{T, 2}}) where T = copydiag(Transpose, parent(A))
-Base.copy(A::Adjoint{T, <:DBCArray{T, 2}}) where T = fetch(A)
-Base.copy(A::Transpose{T, <:DBCArray{T, 2}}) where T = fetch(A)
-Base.collect(A::Adjoint{T, <:DBCArray{T, 2}}) where T = collect(copy(A))
-Base.collect(A::Transpose{T, <:DBCArray{T, 2}}) where T = collect(copy(A))
-
-(*)(a::DBCArray, b::Vector) = DBCArray((a.darray)*b, a.pdomain)
-
-# Base.power_by_squaring(x::DBCArray{T,N,B,F}, i::Int) where {T,N,B,F} = foldl(*, ntuple(_ -> x, i))
-
-
-Base.getindex(A::DBCArray{T,N}, idx::NTuple{N,Int}) where {T,N} = getindex(A.darray, idx)
-
-Base.getindex(A::DBCArray, idx::Integer...) = getindex(A.darray, idx)
-Base.getindex(A::DBCArray, idx::Integer) = getindex(A.darray, idx)
-Base.getindex(A::DBCArray, idx::CartesianIndex) = getindex(A.darray, idx)
-Base.getindex(A::DBCArray{T,N}, idxs::Dims{S}) where {T,N,S} = getindex(A.darray, idxs)
-
-Base.setindex!(A::DBCArray{T,N}, value, idx::NTuple{N,Int}) where {T,N} = setindex!(A.darray, value, idx)
-Base.setindex!(A::DBCArray, value, idx::Integer...) = setindex!(A.darray, value, idx)
-Base.setindex!(A::DArray, value, idx::Integer) = setindex!(A.darray, value,idx)
-Base.setindex!(A::DArray, value, idx::CartesianIndex) = setindex!(A.darray, value, idx)
-Base.setindex!(A::DBCArray{T,N}, value, idxs::Dims{S}) where {T,N,S} = setindex!(A.darray, value, idxs)
-
-
-# function Base.zero(x::DBCArray{T,N}) where {T,N}
-#     dims = ntuple(i->x.domain.indexes[i].stop, N)
-#     sd = first(x.subdomains)
-#     part_size = ntuple(i->sd.indexes[i].stop, N)
-#     a = zeros(Blocks(part_size...), T, dims)
-#     return _to_darray(a)
-# end
-
-###
-
     struct ColorElement{T}
         color::Symbol
         value::Union{Some{T},Nothing}
@@ -673,7 +479,6 @@ function stage(ctx::Context, d::Distribute)
                   d.partitioning)
 end
 
-
 """
     AutoBlocks
 

From 04d2017d2633e7d03d60504f0a2e5ec538f09fb3 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Sun, 20 Apr 2025 23:35:21 +0530
Subject: [PATCH 03/36] Create dbcarray.jl

---
 src/array/dbcarray.jl | 193 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 src/array/dbcarray.jl

diff --git a/src/array/dbcarray.jl b/src/array/dbcarray.jl
new file mode 100644
index 000000000..b8865786c
--- /dev/null
+++ b/src/array/dbcarray.jl
@@ -0,0 +1,193 @@
+mutable struct DBCArray{T,N,B,F} <: ArrayOp{T, N}
+    darray::DArray{T,N,B,F}
+    pdomain::AbstractArray{Dagger.Processor, N}
+    # function DArray{T,N,B,F}(domain, subdomains, chunks, partitioning::B, concat::Function) where {T,N,B,F}
+    #     new{T,N,B,F}(domain, subdomains, chunks, partitioning, concat)
+    # end
+end
+
+const DBCMatrix{T} = DBCArray{T,2}
+const DBCVector{T} = DBCArray{T,1}
+
+# DBCArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat, pdomain) where {T,N} = DBCArray(T, domain, subdomains, chunks, partitioning, concat)
+
+# function DArray(T, domain::DArrayDomain{N},
+#     subdomains::AbstractArray{DArrayDomain{N}, N},
+#     chunks::AbstractArray{<:Any, N}, partitioning::B, concat=cat) where {N,B<:AbstractBlocks{N}}
+# DArray{T,N,B,typeof(concat)}(domain, subdomains, chunks, partitioning, concat)
+# end
+
+# function DArray(T, domain::DArrayDomain{N},
+    #     subdomains::DArrayDomain{N},
+    #     chunks::Any, partitioning::B, concat=cat) where {N,B<:AbstractSingleBlocks{N}}
+    # _subdomains = Array{DArrayDomain{N}, N}(undef, ntuple(i->1, N)...)
+    # _subdomains[1] = subdomains
+    # _chunks = Array{Any, N}(undef, ntuple(i->1, N)...)
+    # _chunks[1] = chunks
+    # DArray{T,N,B,typeof(concat)}(domain, _subdomains, _chunks, partitioning, concat)
+# end
+
+domain(d::DBCArray) = domain(d.darray)
+chunks(d::DBCArray) = chunks(d.darray)
+domainchunks(d::DBCArray) = domainchunks(d.darray)
+size(x::DBCArray) = size(domain(x))
+stage(ctx, c::DBCArray) = stage(ctx, c.darray)
+
+function DBCArray(A::DArray{T,N,B,F}, pdomain::AbstractArray{Dagger.Processor, N}) where {T,N,B,F}
+    
+    all_procs = collect(Iterators.flatten(Dagger.get_processors(OSProc(w)) for w in procs()))
+    missing = filter(p -> p ∉ all_procs, pdomain)
+    isempty(missing) || error("Missing processors: $missing")
+    
+    Ac = fetch(A.chunks)
+    Ac_copy = similar(A.chunks)
+
+    for idx in CartesianIndices(A.chunks)
+        proc = pdomain[mod1.(Tuple(idx), size(pdomain))...]
+        Ac_copy[idx] = Dagger.@spawn scope=Dagger.ExactScope(proc) identity(Ac[idx])
+        # new_chunks[idx] = Dagger.@spawn scope=Dagger.ExactScope(proc) Dagger.tochunk(old_chunks[idx], proc)
+    end
+    
+    # Construct new DArray with updated chunks
+    A_copy = DArray{T,N,B,F}(A.domain, A.subdomains, Ac_copy, A.partitioning, A.concat)
+
+    return DBCArray{T,N,B,F}(A_copy, pdomain)
+end
+
+function Base.collect(d::DBCArray; tree=false)
+    return collect(d.darray; tree=tree)
+end
+
+Base.wait(A::DBCArray) = wait(A.darray)  # delegate: Base.wait(::DArray) waits on every chunk; wait(::Array) is not defined
+
+function Base.show(io::IO, ::MIME"text/plain", A::DBCArray{T,N,B,F}) where {T,N,B,F}
+    nparts = N > 0 ? size(A.darray.chunks) : 1
+    partsize = N > 0 ? A.darray.partitioning.blocksize : 1
+    nprocs = N > 0 ? size(A.pdomain) : 1
+    write(io, " with $(join(nparts, 'x')) partitions of size $(join(partsize, 'x')) distributed to $(join(nprocs, 'x')) processors:")
+    pct_complete = 100 * (sum(c->c isa Chunk ? true : isready(c), A.darray.chunks) / length(A.darray.chunks))
+    if pct_complete < 100
+        println(io)
+        printstyled(io, "~$(round(Int, pct_complete))% completed"; color=:yellow)
+    end
+    println(io)
+    Base.print_array(IOContext(io, :compact=>true), ColorArray(A.darray))
+end
+
+# function (==)(x::ArrayOp, y::ArrayOp)
+#     x === y || reduce((a,b)->a&&b, map(==, x, y))
+# end
+
+# function Base.hash(x::ArrayOp, i::UInt)
+#     7*objectid(x)-2
+# end
+
+# function Base.isequal(x::ArrayOp, y::ArrayOp)
+#     x === y
+# end
+
+Base.copy(x::DBCArray{T,N,B,F}) where {T,N,B,F} =  DBCArray{T,N,B,F}(x.darray, x.pdomain)
+
+Base.:(/)(x::DBCArray{T,N,B,F}, y::U) where {T<:Real, U<:Real, N, B, F} = DBCArray(x.darray / y, x.pdomain)
+
+# Base.fetch(c::DBCArray{T,N,B,F}) where {T,N,B,F} = c
+
+auto_blocks(A::DBCArray{T,N,B,F}) where {T,N,B,F} = auto_blocks(size(A))
+
+# distribute(A::AbstractArray) = distribute(A, AutoBlocks())
+# distribute(A::AbstractArray{T,N}, dist::Blocks{N}) where {T,N} =
+#     _to_darray(Distribute(dist, A))
+# distribute(A::AbstractArray, ::AutoBlocks) = distribute(A, auto_blocks(A))
+# function distribute(x::AbstractArray{T,N}, n::NTuple{N}) where {T,N}
+#     p = map((d, dn)->ceil(Int, d / dn), size(x), n)
+#     distribute(x, Blocks(p))
+# end
+# distribute(x::AbstractVector, n::Int) = distribute(x, (n,))
+# distribute(x::AbstractVector, n::Vector{<:Integer}) =
+#     distribute(x, DomainBlocks((1,), (cumsum(n),)))
+
+# DVector(A::AbstractVector{T}, part::Blocks{1}) where T = distribute(A, part)
+# DMatrix(A::AbstractMatrix{T}, part::Blocks{2}) where T = distribute(A, part)
+# DArray(A::AbstractArray{T,N}, part::Blocks{N}) where {T,N} = distribute(A, part)
+
+# DVector(A::AbstractVector{T}) where T = DVector(A, AutoBlocks())
+# DMatrix(A::AbstractMatrix{T}) where T = DMatrix(A, AutoBlocks())
+# DArray(A::AbstractArray) = DArray(A, AutoBlocks())
+
+# DVector(A::AbstractVector{T}, ::AutoBlocks) where T = DVector(A, auto_blocks(A))
+# DMatrix(A::AbstractMatrix{T}, ::AutoBlocks) where T = DMatrix(A, auto_blocks(A))
+# DArray(A::AbstractArray, ::AutoBlocks) = DArray(A, auto_blocks(A))
+
+# function Base.:(==)(x::ArrayOp{T,N}, y::AbstractArray{S,N}) where {T,S,N}
+#     collect(x) == y
+# end
+
+# function Base.:(==)(x::AbstractArray{T,N}, y::ArrayOp{S,N}) where {T,S,N}
+#     return collect(x) == y
+# end
+
+function logs_annotate!(ctx::Context, A::DBCArray, name::Union{String,Symbol})  # label each chunk as name[indexes]
+    for (idx, chunk) in enumerate(A.darray.chunks)
+        sd = A.darray.subdomains[idx]  # subdomains live on the wrapped DArray; DBCArray has no such field
+        Dagger.logs_annotate!(ctx, chunk, string(name, '[', join(sd.indexes, ','), ']'))  # string() also handles Symbol names
+    end
+end
+
+# function mapchunks(f, d::DArray{T,N,F}) where {T,N,F}
+#     chunks = map(d.chunks) do chunk
+#         owner = get_parent(chunk.processor).pid
+#         remotecall_fetch(mapchunk, owner, f, chunk)
+#     end
+#     DArray{T,N,F}(d.domain, d.subdomains, chunks, d.concat)
+# end
+
+
+const WrappedDBCArray{T,N} = Union{<:DBCArray{T,N}, Transpose{<:DBCArray{T,N}}, Adjoint{<:DBCArray{T,N}}}
+const WrappedDBCMatrix{T} = WrappedDBCArray{T,2}
+const WrappedDBCVector{T} = WrappedDBCArray{T,1}
+
+function copydiag(f, A::DBCArray{T,2,B,F}) where {T,B,F}  # bind B and F so the DArray type below is fully defined
+    Ac = A.darray.chunks
+    Ac_copy = Matrix{Any}(undef, size(Ac, 2), size(Ac, 1))  # chunk grid with rows/columns swapped
+    _copytile(f, Ac) = copy(f(Ac))  # apply f (Adjoint/Transpose) to one tile and materialize it
+    for idx in CartesianIndices(Ac)
+        proc = A.pdomain[mod1.(Tuple(idx), size(A.pdomain))...]  # wrap the chunk index around the processor grid
+        Ac_copy[idx'] = Dagger.@spawn scope=Dagger.ExactScope(proc) _copytile(f, Ac[idx])
+    end
+    Ad_copy = DArray{T,2,B,F}(ArrayDomain(1:size(A,2), 1:size(A,1)), A.darray.subdomains', Ac_copy, A.darray.partitioning, A.darray.concat)  # N is fixed to 2 by the signature
+    return DBCArray(Ad_copy, A.pdomain)
+end
+
+Base.fetch(A::Adjoint{T, <:DBCArray{T, 2}}) where T = copydiag(Adjoint, parent(A))
+Base.fetch(A::Transpose{T, <:DBCArray{T, 2}}) where T = copydiag(Transpose, parent(A))
+Base.copy(A::Adjoint{T, <:DBCArray{T, 2}}) where T = fetch(A)
+Base.copy(A::Transpose{T, <:DBCArray{T, 2}}) where T = fetch(A)
+Base.collect(A::Adjoint{T, <:DBCArray{T, 2}}) where T = collect(copy(A))
+Base.collect(A::Transpose{T, <:DBCArray{T, 2}}) where T = collect(copy(A))
+
+(*)(a::DBCArray, b::Vector) = DBCArray((a.darray)*b, a.pdomain)
+
+# Base.power_by_squaring(x::DBCArray{T,N,B,F}, i::Int) where {T,N,B,F} = foldl(*, ntuple(_ -> x, i))
+
+
+Base.getindex(A::DBCArray{T,N}, idx::NTuple{N,Int}) where {T,N} = getindex(A.darray, idx)  # indexing forwards to the wrapped DArray
+
+Base.getindex(A::DBCArray, idx::Integer...) = getindex(A.darray, idx)
+Base.getindex(A::DBCArray, idx::Integer) = getindex(A.darray, idx)
+Base.getindex(A::DBCArray, idx::CartesianIndex) = getindex(A.darray, idx)
+Base.getindex(A::DBCArray{T,N}, idxs::Dims{S}) where {T,N,S} = getindex(A.darray, idxs)
+
+Base.setindex!(A::DBCArray{T,N}, value, idx::NTuple{N,Int}) where {T,N} = setindex!(A.darray, value, idx)
+Base.setindex!(A::DBCArray, value, idx::Integer...) = setindex!(A.darray, value, idx)
+Base.setindex!(A::DBCArray, value, idx::Integer) = setindex!(A.darray, value, idx)  # was declared on DArray, which has no .darray field
+Base.setindex!(A::DBCArray, value, idx::CartesianIndex) = setindex!(A.darray, value, idx)  # was declared on DArray, which has no .darray field
+Base.setindex!(A::DBCArray{T,N}, value, idxs::Dims{S}) where {T,N,S} = setindex!(A.darray, value, idxs)
+
+
+# function Base.zero(x::DBCArray{T,N}) where {T,N}
+#     dims = ntuple(i->x.domain.indexes[i].stop, N)
+#     sd = first(x.subdomains)
+#     part_size = ntuple(i->sd.indexes[i].stop, N)
+#     a = zeros(Blocks(part_size...), T, dims)
+#     return _to_darray(a)
+# end

From 1f716e815a4903fc5ffa41bf0591c42bb50c2e54 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Sun, 20 Apr 2025 23:57:25 +0530
Subject: [PATCH 04/36] Update Dagger.jl

---
 src/Dagger.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Dagger.jl b/src/Dagger.jl
index fd6395a4b..013fc2710 100644
--- a/src/Dagger.jl
+++ b/src/Dagger.jl
@@ -84,6 +84,7 @@ include("stream-transfer.jl")
 
 # Array computations
 include("array/darray.jl")
+include("array/dbcarray.jl")
 include("array/alloc.jl")
 include("array/map-reduce.jl")
 include("array/copy.jl")

From 161425610787d4332efb7ce5564409d6187bc6b4 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Mon, 21 Apr 2025 07:49:14 +0530
Subject: [PATCH 05/36] Update dbcarray.jl

---
 src/array/dbcarray.jl | 84 +++++++++++++++----------------------------
 1 file changed, 29 insertions(+), 55 deletions(-)

diff --git a/src/array/dbcarray.jl b/src/array/dbcarray.jl
index b8865786c..822dda903 100644
--- a/src/array/dbcarray.jl
+++ b/src/array/dbcarray.jl
@@ -1,11 +1,18 @@
+export DBCArray, DBCVector, DBCMatrix  # no trailing comma: it would continue the export list onto the next statement
+
+### darray.jl
+
 mutable struct DBCArray{T,N,B,F} <: ArrayOp{T, N}
     darray::DArray{T,N,B,F}
     pdomain::AbstractArray{Dagger.Processor, N}
-    # function DArray{T,N,B,F}(domain, subdomains, chunks, partitioning::B, concat::Function) where {T,N,B,F}
-    #     new{T,N,B,F}(domain, subdomains, chunks, partitioning, concat)
+    # function DBCArray{T,N,B,F}(domain, subdomains, chunks, partitioning::B, concat::Function, pdomain) where {T,N,B,F}
+    #     new{T,N,B,F}(domain, subdomains, chunks, partitioning, concat, pdomain)
     # end
 end
 
+const WrappedDBCArray{T,N} = Union{<:DBCArray{T,N}, Transpose{<:DBCArray{T,N}}, Adjoint{<:DBCArray{T,N}}}
+const WrappedDBCMatrix{T} = WrappedDBCArray{T,2}
+const WrappedDBCVector{T} = WrappedDBCArray{T,1}
 const DBCMatrix{T} = DBCArray{T,2}
 const DBCVector{T} = DBCArray{T,1}
 
@@ -27,12 +34,6 @@ const DBCVector{T} = DBCArray{T,1}
     # DArray{T,N,B,typeof(concat)}(domain, _subdomains, _chunks, partitioning, concat)
 # end
 
-domain(d::DBCArray) = domain(d.darray)
-chunks(d::DBCArray) = chunks(d.darray)
-domainchunks(d::DBCArray) = domainchunks(d.darray)
-size(x::DBCArray) = size(domain(x))
-stage(ctx, c::DBCArray) = stage(ctx, c.darray)
-
 function DBCArray(A::DArray{T,N,B,F}, pdomain::AbstractArray{Dagger.Processor, N}) where {T,N,B,F}
     
     all_procs = collect(Iterators.flatten(Dagger.get_processors(OSProc(w)) for w in procs()))
@@ -54,6 +55,13 @@ function DBCArray(A::DArray{T,N,B,F}, pdomain::AbstractArray{Dagger.Processor, N
     return DBCArray{T,N,B,F}(A_copy, pdomain)
 end
 
+domain(d::DBCArray) = domain(d.darray)
+chunks(d::DBCArray) = chunks(d.darray)
+domainchunks(d::DBCArray) = domainchunks(d.darray)
+size(x::DBCArray) = size(domain(x))
+stage(ctx, c::DBCArray) = stage(ctx, c.darray)
+pdomain(A::DBCArray) = A.pdomain
+
 function Base.collect(d::DBCArray; tree=false)
     return collect(d.darray; tree=tree)
 end
@@ -74,49 +82,24 @@ function Base.show(io::IO, ::MIME"text/plain", A::DBCArray{T,N,B,F}) where {T,N,
     Base.print_array(IOContext(io, :compact=>true), ColorArray(A.darray))
 end
 
-# function (==)(x::ArrayOp, y::ArrayOp)
-#     x === y || reduce((a,b)->a&&b, map(==, x, y))
-# end
-
-# function Base.hash(x::ArrayOp, i::UInt)
-#     7*objectid(x)-2
-# end
-
-# function Base.isequal(x::ArrayOp, y::ArrayOp)
-#     x === y
+# function Base.similar(A::DArray{T,N} where T, ::Type{S}, dims::Dims{N}) where {S,N}
+#     d = ArrayDomain(map(x->1:x, dims))
+#     p = A.partitioning
+#     a = AllocateArray(S, AllocateUndef{S}(), false, d, partition(p, d), p)
+#     return _to_darray(a)
 # end
 
 Base.copy(x::DBCArray{T,N,B,F}) where {T,N,B,F} =  DBCArray{T,N,B,F}(x.darray, x.pdomain)
 
 Base.:(/)(x::DBCArray{T,N,B,F}, y::U) where {T<:Real, U<:Real, N, B, F} = DBCArray(x.darray / y, x.pdomain)
 
-# Base.fetch(c::DBCArray{T,N,B,F}) where {T,N,B,F} = c
-
-auto_blocks(A::DBCArray{T,N,B,F}) where {T,N,B,F} = auto_blocks(size(A))
-
-# distribute(A::AbstractArray) = distribute(A, AutoBlocks())
-# distribute(A::AbstractArray{T,N}, dist::Blocks{N}) where {T,N} =
-#     _to_darray(Distribute(dist, A))
-# distribute(A::AbstractArray, ::AutoBlocks) = distribute(A, auto_blocks(A))
-# function distribute(x::AbstractArray{T,N}, n::NTuple{N}) where {T,N}
-#     p = map((d, dn)->ceil(Int, d / dn), size(x), n)
-#     distribute(x, Blocks(p))
+# function Base.view(c::DArray, d)
+#     subchunks, subdomains = lookup_parts(c, chunks(c), domainchunks(c), d)
+#     d1 = alignfirst(d)
+#     DArray(eltype(c), d1, subdomains, subchunks, c.partitioning, c.concat)
 # end
-# distribute(x::AbstractVector, n::Int) = distribute(x, (n,))
-# distribute(x::AbstractVector, n::Vector{<:Integer}) =
-#     distribute(x, DomainBlocks((1,), (cumsum(n),)))
-
-# DVector(A::AbstractVector{T}, part::Blocks{1}) where T = distribute(A, part)
-# DMatrix(A::AbstractMatrix{T}, part::Blocks{2}) where T = distribute(A, part)
-# DArray(A::AbstractArray{T,N}, part::Blocks{N}) where {T,N} = distribute(A, part)
-
-# DVector(A::AbstractVector{T}) where T = DVector(A, AutoBlocks())
-# DMatrix(A::AbstractMatrix{T}) where T = DMatrix(A, AutoBlocks())
-# DArray(A::AbstractArray) = DArray(A, AutoBlocks())
 
-# DVector(A::AbstractVector{T}, ::AutoBlocks) where T = DVector(A, auto_blocks(A))
-# DMatrix(A::AbstractMatrix{T}, ::AutoBlocks) where T = DMatrix(A, auto_blocks(A))
-# DArray(A::AbstractArray, ::AutoBlocks) = DArray(A, auto_blocks(A))
+# Base.fetch(c::DBCArray{T,N,B,F}) where {T,N,B,F} = c
 
 # function Base.:(==)(x::ArrayOp{T,N}, y::AbstractArray{S,N}) where {T,S,N}
 #     collect(x) == y
@@ -142,9 +125,7 @@ end
 # end
 
 
-const WrappedDBCArray{T,N} = Union{<:DBCArray{T,N}, Transpose{<:DBCArray{T,N}}, Adjoint{<:DBCArray{T,N}}}
-const WrappedDBCMatrix{T} = WrappedDBCArray{T,2}
-const WrappedDBCVector{T} = WrappedDBCArray{T,1}
+### matrix.jl
 
 function copydiag(f, A::DBCArray{T, 2}) where T
     Ac = A.darray.chunks
@@ -170,6 +151,8 @@ Base.collect(A::Transpose{T, <:DBCArray{T, 2}}) where T = collect(copy(A))
 # Base.power_by_squaring(x::DBCArray{T,N,B,F}, i::Int) where {T,N,B,F} = foldl(*, ntuple(_ -> x, i))
 
 
+# indexing.jl
+
 Base.getindex(A::DBCArray{T,N}, idx::NTuple{N,Int}) where {T,N} = getindex(A.darray, idx)
 
 Base.getindex(A::DBCArray, idx::Integer...) = getindex(A.darray, idx)
@@ -182,12 +165,3 @@ Base.setindex!(A::DBCArray, value, idx::Integer...) = setindex!(A.darray, value,
 Base.setindex!(A::DArray, value, idx::Integer) = setindex!(A.darray, value,idx)
 Base.setindex!(A::DArray, value, idx::CartesianIndex) = setindex!(A.darray, value, idx)
 Base.setindex!(A::DBCArray{T,N}, value, idxs::Dims{S}) where {T,N,S} = setindex!(A.darray, value, idxs)
-
-
-# function Base.zero(x::DBCArray{T,N}) where {T,N}
-#     dims = ntuple(i->x.domain.indexes[i].stop, N)
-#     sd = first(x.subdomains)
-#     part_size = ntuple(i->sd.indexes[i].stop, N)
-#     a = zeros(Blocks(part_size...), T, dims)
-#     return _to_darray(a)
-# end

From a85fecf420c5a190c37e87b17fa1e4dc10f5bcf4 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Mon, 21 Apr 2025 07:58:01 +0530
Subject: [PATCH 06/36] Create DBCArray.md

---
 reports/DBCArray.md | 354 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 354 insertions(+)
 create mode 100644 reports/DBCArray.md

diff --git a/reports/DBCArray.md b/reports/DBCArray.md
new file mode 100644
index 000000000..eefc581a5
--- /dev/null
+++ b/reports/DBCArray.md
@@ -0,0 +1,354 @@
+# GSoC 2025 Report: Distributed Linear Algebra with Dagger.jl
+
+**Author**: Akhil Akkapelli
+
+**Mentor**: Julian Samaroo
+
+## Main Goal
+
+The objective of this project was to add distributed linear algebra capabilities to Dagger.jl. This involved implementing operations such as matrix multiplication and factorizations using various data distribution schemes (cyclic, block-cyclic, 2D, 3D) that can run efficiently across multiple devices using the Dagger.jl APIs.
+
+## Steps Toward Implementation
+
+### 1. Background Study and Design
+
+- Study the Dagger.jl documentation to understand its architecture and design principles.
+- Explore relevant source code files and become familiar with the internal mechanisms and how different components interact.
+
+### 2. Matrix Distribution Infrastructure
+
+- Develop a system to distribute `DArray` chunks using a block-cyclic layout to specific processor blocks.
+- Introduce new constructors and helper functions to ensure that each chunk runs exclusively on its assigned processor.
+- Update the scheduling logic to maintain fixed processor scopes during execution.
+- Integrate the distribution logic with Dagger's scheduler to manage task dependencies and execute operations correctly across the processor grid.
+
+### 3. Matrix Operations
+
+- Implement fundamental matrix operations such as `Adjoint`, `Transpose`, `*`, `+`, and `MatMul`, ensuring that operations respect processor assignments.
+- Add support for essential linear algebra routines like `norm2`, `issymmetric`, `ishermitian`, and factorizations including `lu` (with and without pivoting) and `cholesky`.
+- Ensure accurate indexing and chunk-to-processor mapping throughout execution.
+
+### 4. Testing and Performance Evaluation
+
+- Develop a comprehensive test suite to validate the correctness of operations for various matrix shapes, sizes, and distribution patterns.
+- Ensure that all operations execute efficiently and correctly across multiple devices, including GPUs.
+
+### 5. Documentation and Examples
+
+- Write detailed documentation with examples demonstrating supported operations and various data layouts.
+- Add inline comments within the codebase to clarify the implementation logic and data flow.
+
+# Explicit Processor Mapping of DArray Blocks in Block-Cyclic Manner with `DBCArray`
+
+## Objective
+
+In block-cyclic layouts, block-to-processor assignments are typically handled by the scheduler, which may result in inefficient allocations. The goal here is to explicitly control block allocation by setting a processor "scope" during the block assignment process.
+
+## Approaches
+
+There are two possible approaches:
+
+1. **Integrate directly into `DArray`**: Modify the existing `DArray` structure and core logic to support explicit processor mapping, which requires substantial changes.
+
+2. **Create a new struct `DBCArray`**: Define a new structure extending `DArray`, reusing its features while adding new functions with minimal changes to existing code.
+
+I am currently pursuing the second method as it is modular, less invasive, easier to test and maintain, and can later be integrated into `DArray` if needed.
+
+## Implementation
+
+Development is maintained in my fork of Dagger.jl at [https://github.com/AkhilAkkapelli/Dagger.jl](https://github.com/AkhilAkkapelli/Dagger.jl), with all changes made in the `dbcarray.jl` file under the `src/array` folder. The `DBCArray` implementation incrementally incorporates functionality from various files in the codebase as outlined below. Some functions are not yet implemented/tested and are currently commented out.
+
+### darray.jl
+
+We enhance DArray by introducing a new struct, `DBCArray`, which holds `darray` and `pdomain`. The `pdomain` is an array of `Dagger.Processor` type. Each block in the `darray` is mapped to these processors in a block-cyclic pattern. Below is the `DBCArray` struct definition:
+
+```julia
+# Define DBCArray struct, wrapping a DArray and processor domain
+mutable struct DBCArray{T,N,B,F} <: ArrayOp{T, N}
+    darray::DArray{T,N,B,F}
+    pdomain::AbstractArray{Dagger.Processor, N}
+    # function DBCArray{T,N,B,F}(domain, subdomains, chunks, partitioning::B, concat::Function, pdomain) where {T,N,B,F}
+    #     new{T,N,B,F}(domain, subdomains, chunks, partitioning, concat, pdomain)
+    # end
+end
+```
+
+Other definitions like `DBCVector`, `DBCMatrix` are implemented here similar to `DVector` and `DMatrix`:
+
+```julia
+# Type aliases for convenience
+const WrappedDBCArray{T,N} = Union{<:DBCArray{T,N}, Transpose{<:DBCArray{T,N}}, Adjoint{<:DBCArray{T,N}}}
+const WrappedDBCMatrix{T} = WrappedDBCArray{T,2}
+const WrappedDBCVector{T} = WrappedDBCArray{T,1}
+const DBCMatrix{T} = DBCArray{T,2}
+const DBCVector{T} = DBCArray{T,1}
+```
+
+Constructors of `DArray` can be implemented similarly for `DBCArray`. Additionally, a new constructor is defined that takes `darray` and `pdomain` as inputs, and creates a `DBCArray` by assigning blocks to its processors using scope in `@spawn`:
+
+```julia
+# DBCArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat, pdomain) where {T,N} = DBCArray(T, domain, subdomains, chunks, partitioning, concat)
+
+# function DArray(T, domain::DArrayDomain{N},
+#     subdomains::AbstractArray{DArrayDomain{N}, N},
+#     chunks::AbstractArray{<:Any, N}, partitioning::B, concat=cat) where {N,B<:AbstractBlocks{N}}
+# DArray{T,N,B,typeof(concat)}(domain, subdomains, chunks, partitioning, concat)
+# end
+
+# function DArray(T, domain::DArrayDomain{N},
+    #     subdomains::DArrayDomain{N},
+    #     chunks::Any, partitioning::B, concat=cat) where {N,B<:AbstractSingleBlocks{N}}
+    # _subdomains = Array{DArrayDomain{N}, N}(undef, ntuple(i->1, N)...)
+    # _subdomains[1] = subdomains
+    # _chunks = Array{Any, N}(undef, ntuple(i->1, N)...)
+    # _chunks[1] = chunks
+    # DArray{T,N,B,typeof(concat)}(domain, _subdomains, _chunks, partitioning, concat)
+# end
+
+# Constructor for DBCArray from DArray and processor domain
+function DBCArray(A::DArray{T,N,B,F}, pdomain::AbstractArray{Dagger.Processor, N}) where {T,N,B,F}
+    all_procs = collect(Iterators.flatten(Dagger.get_processors(OSProc(w)) for w in procs()))
+    missing = filter(p -> p ∉ all_procs, pdomain)
+    isempty(missing) || error("Missing processors: $missing")
+
+    Ac = fetch(A.chunks)
+    Ac_copy = similar(A.chunks)
+
+    # Assign blocks to processors using ExactScope
+    for idx in CartesianIndices(A.chunks)
+        proc = pdomain[mod1.(Tuple(idx), size(pdomain))...]
+        Ac_copy[idx] = Dagger.@spawn scope=Dagger.ExactScope(proc) identity(Ac[idx])
+    end
+
+    A_copy = DArray{T,N,B,F}(A.domain, A.subdomains, Ac_copy, A.partitioning, A.concat)
+    return DBCArray{T,N,B,F}(A_copy, pdomain)
+end
+```
+
+The following utility functions are adapted to support the `DBCArray` type:
+
+```julia
+# Delegate various properties to inner DArray
+domain(d::DBCArray) = domain(d.darray)
+chunks(d::DBCArray) = chunks(d.darray)
+domainchunks(d::DBCArray) = domainchunks(d.darray)
+size(x::DBCArray) = size(domain(x))
+stage(ctx, c::DBCArray) = stage(ctx, c.darray)
+
+# processor domain of DBCArray
+pdomain(A::DBCArray) = A.pdomain
+```
+
+The standard Base interface methods `collect`, `wait`, `show`, `copy`, and `/` are overloaded for `DBCArray`; `similar`, `view`, `fetch`, and `==` are planned but not yet enabled (their drafts are currently commented out):
+
+```julia
+# Collect method
+function Base.collect(d::DBCArray; tree=false)
+    return collect(d.darray; tree=tree)
+end
+
+# Wait method: delegate to the wrapped DArray's `wait`, which waits on each chunk in turn
+Base.wait(A::DBCArray) = wait(A.darray)
+
+# Show method
+function Base.show(io::IO, ::MIME"text/plain", A::DBCArray{T,N,B,F}) where {T,N,B,F}
+    nparts = N > 0 ? size(A.darray.chunks) : 1
+    partsize = N > 0 ? A.darray.partitioning.blocksize : 1
+    nprocs = N > 0 ? size(A.pdomain) : 1
+    write(io, " with $(join(nparts, 'x')) partitions of size $(join(partsize, 'x')) distributed to $(join(nprocs, 'x')) processors:")
+    pct_complete = 100 * (sum(c->c isa Chunk ? true : isready(c), A.darray.chunks) / length(A.darray.chunks))
+    if pct_complete < 100
+        println(io)
+        printstyled(io, "~$(round(Int, pct_complete))% completed"; color=:yellow)
+    end
+    println(io)
+    Base.print_array(IOContext(io, :compact=>true), ColorArray(A.darray))
+end
+
+# Copy method
+Base.copy(x::DBCArray{T,N,B,F}) where {T,N,B,F} =  DBCArray{T,N,B,F}(x.darray, x.pdomain)
+
+# Division method
+Base.:/(x::DBCArray{T,N,B,F}, y::U) where {T<:Real, U<:Real, N, B, F} = DBCArray(x.darray / y, x.pdomain)
+
+# View method
+# function Base.view(c::DArray, d)
+#     subchunks, subdomains = lookup_parts(c, chunks(c), domainchunks(c), d)
+#     d1 = alignfirst(d)
+#     DArray(eltype(c), d1, subdomains, subchunks, c.partitioning, c.concat)
+# end
+
+# Fetch method
+# Base.fetch(c::DBCArray{T,N,B,F}) where {T,N,B,F} = c
+
+# Equality checks
+# function Base.:(==)(x::ArrayOp{T,N}, y::AbstractArray{S,N}) where {T,S,N}
+#     collect(x) == y
+# end
+# function Base.:(==)(x::AbstractArray{T,N}, y::ArrayOp{S,N}) where {T,S,N}
+#     return collect(x) == y
+# end
+```
+
+Lastly, `logs_annotate!` for `DBCArray`:
+
+```julia
+# Annotate each chunk's log entry with `name` followed by the index ranges of its subdomain
+function logs_annotate!(ctx::Context, A::DBCArray, name::Union{String,Symbol})
+    for (idx, chunk) in enumerate(A.darray.chunks)
+        sd = A.darray.subdomains[idx]  # subdomains are stored on the wrapped DArray, not on DBCArray
+        Dagger.logs_annotate!(ctx, chunk, name*'['*join(sd.indexes, ',')*']')
+    end
+end
+```
+
+### matrix.jl
+
+The Adjoint and Transpose operations for `DBCArray` have been implemented similarly to `DArray`, ensuring that computations are carried out on the respective processors.
+
+```julia
+# Define adjoint/transpose copy function
+function copydiag(f, A::DBCArray{T, 2}) where T
+    Ac = A.darray.chunks
+    Ac_copy = Matrix{Any}(undef, size(Ac, 2), size(Ac, 1))
+    _copytile(f, Ac) = copy(f(Ac))
+    for idx in CartesianIndices(Ac)
+        proc = A.pdomain[mod1.(Tuple(idx), size(A.pdomain))...]
+        Ac_copy[idx'] = Dagger.@spawn scope=Dagger.ExactScope(proc) _copytile(f, Ac[idx])
+    end
+    Ad_copy = DArray{T,N,B,F}(ArrayDomain(1:size(A,2), 1:size(A,1)), A.darray.subdomains', Ac_copy, A.darray.partitioning, A.darray.concat)
+    return DBCArray(Ad_copy, A.pdomain)
+end
+
+# Overload fetch, copy, collect for Adjoint/Transpose
+Base.fetch(A::Adjoint{T, <:DBCArray{T, 2}}) where T = copydiag(Adjoint, parent(A))
+Base.fetch(A::Transpose{T, <:DBCArray{T, 2}}) where T = copydiag(Transpose, parent(A))
+Base.copy(A::Adjoint{T, <:DBCArray{T, 2}}) where T = fetch(A)
+Base.copy(A::Transpose{T, <:DBCArray{T, 2}}) where T = fetch(A)
+Base.collect(A::Adjoint{T, <:DBCArray{T, 2}}) where T = collect(copy(A))
+Base.collect(A::Transpose{T, <:DBCArray{T, 2}}) where T = collect(copy(A))
+```
+
+Matrix-vector multiplication on a `DBCArray` works just as it does for `DArray`; the power operation is drafted but currently commented out.
+
+```julia
+# Matrix-vector multiplication
+(*)(a::DBCArray, b::Vector) = DBCArray((a.darray)*b, a.pdomain)
+
+# Power operation
+# Base.power_by_squaring(x::DBCArray{T,N,B,F}, i::Int) where {T,N,B,F} = foldl(*, ntuple(_ -> x, i))
+```
+
+Other matrix-matrix operations such as `+`, `*`, and `MatMul`, along with `scale`, `Concat`, `cat`, `hcat`, and `vcat`, are planned for future implementation.
+
+### indexing.jl
+
+Indexing in `DBCArray` is handled by overloading the `getindex` and `setindex!` functions, allowing read and write access just like regular arrays:
+
+```julia
+# Indexing of DBCArray 
+Base.getindex(A::DBCArray{T,N}, idx::NTuple{N,Int}) where {T,N} = getindex(A.darray, idx)
+Base.getindex(A::DBCArray, idx::Integer...) = getindex(A.darray, idx)
+Base.getindex(A::DBCArray, idx::Integer) = getindex(A.darray, idx)
+Base.getindex(A::DBCArray, idx::CartesianIndex) = getindex(A.darray, idx)
+Base.getindex(A::DBCArray{T,N}, idxs::Dims{S}) where {T,N,S} = getindex(A.darray, idxs)
+
+Base.setindex!(A::DBCArray{T,N}, value, idx::NTuple{N,Int}) where {T,N} = setindex!(A.darray, value, idx)
+Base.setindex!(A::DBCArray, value, idx::Integer...) = setindex!(A.darray, value, idx)
+Base.setindex!(A::DBCArray, value, idx::Integer) = setindex!(A.darray, value, idx)  # linear index, forwarded to the wrapped DArray
+Base.setindex!(A::DBCArray, value, idx::CartesianIndex) = setindex!(A.darray, value, idx)  # Cartesian index, forwarded likewise
+Base.setindex!(A::DBCArray{T,N}, value, idxs::Dims{S}) where {T,N,S} = setindex!(A.darray, value, idxs)
+```
+
+## Usage of `dbcarray.jl`:
+
+First, install the necessary packages and start three more worker processes:
+
+```julia
+import Pkg
+Pkg.add("Distributed"); using Distributed
+addprocs(3)
+Pkg.add(url="https://github.com/AkhilAkkapelli/Dagger.jl.git")
+@everywhere using Dagger
+```
+
+Then, define `pdomain` as a 2×2 matrix of Dagger processors for block-cyclic distribution:
+
+```julia
+@everywhere pdomain = reshape(collect(Dagger.all_processors()), 2, 2)
+```
+
+Create a `DBCArray` using the constructor with a random matrix:
+
+```julia
+A = rand(Blocks(3, 3), 15, 15)
+Adbc = DBCArray(A, pdomain);
+```
+
+Check if the constructed object is of the right type:
+
+```julia
+isa(Adbc, DBCMatrix)
+```
+
+Use the available utility functions to query properties:
+
+```julia
+domain(Adbc)        # Returns the domain of the array
+chunks(Adbc)        # Returns the individual blocks
+domainchunks(Adbc)  # Returns domain-specific chunk mapping
+size(Adbc)          # Returns array size
+pdomain(Adbc)       # Returns processor domain
+```
+
+Use standard Base methods similar to how they work with DArray:
+
+```julia
+collect(Adbc)       # Gather the full array
+Adbc2 = copy(Adbc)  # Make a copy (a new wrapper around the same underlying DArray)
+Adbc3 = Adbc / 3    # Perform element-wise division
+```
+
+Transpose and Adjoint operations also work:
+
+```julia
+Adbc4 = transpose(Adbc)
+Adbc5 = adjoint(Adbc)
+Adbc6 = Adbc'  # Adjoint using shorthand
+```
+
+## Note
+
+To verify which processor each block of a `DBCArray` is assigned to, I used the code below to fetch processor information block by block:
+
+```julia
+# Print processor mapped to each block in the DBCArray
+chunk_procs = [Dagger.processor(Adbc.darray.chunks[idx].future.future.v.value[2])
+               for idx in CartesianIndices(size(Dagger.domainchunks(Adbc)))]
+```
+
+To log these assignments in detail, including task mappings and dependencies, the following snippet was used:
+
+```julia
+# Activate logging and create a DBCArray to capture block assignments
+using GraphViz
+
+@everywhere Dagger.enable_logging!(
+    taskfuncnames=true, tasknames=true, taskdeps=true, taskargs=true,
+    taskargmoves=true, taskresult=true, timeline=true,
+    all_task_deps=true, taskuidtotid=true, tasktochunk=true)
+
+A = rand(Blocks(3, 3), 15, 15)
+
+Adbc = Dagger.DBCArray(A, pdomain)
+
+@everywhere logs = Dagger.fetch_logs!()
+```
+
+However, the DOT graph generated didn't show processor assignment visually.
+
+## Conclusion
+
+The `DBCArray` extends `DArray` by allowing block-cyclic distribution of data blocks over a given processor layout. Each block is explicitly assigned to a `Dagger.Processor` using `@spawn` with `ExactScope`, giving better control over locality and processor affinity, which is important for distributed computing performance. It keeps `DArray`'s familiar interface while overloading standard Base methods and adding helper functions for `DBCArray`.
+
+Though tools like `fetch_logs!()` show processor assignment, this is not yet reflected visually in DOT graphs. Future work will include adding support for matrix-matrix operations and advanced linear algebra utilities.

From 15090c0e24508fdcf0682307e159d7c09d57884d Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Mon, 21 Apr 2025 11:12:33 +0530
Subject: [PATCH 07/36] Update dbcarray.jl

---
 src/array/dbcarray.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/array/dbcarray.jl b/src/array/dbcarray.jl
index 822dda903..9c94892e1 100644
--- a/src/array/dbcarray.jl
+++ b/src/array/dbcarray.jl
@@ -1,4 +1,4 @@
-export DBCArray, DBCVector, DBCMatrix, 
+export DBCArray, DBCVector, DBCMatrix
 
 ### darray.jl
 

From feb588d3eaf064d438bdfc8137c86f9308f6577a Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Mon, 21 Apr 2025 14:44:21 +0530
Subject: [PATCH 08/36] Update DBCArray.md

---
 reports/DBCArray.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/reports/DBCArray.md b/reports/DBCArray.md
index eefc581a5..3b2871a06 100644
--- a/reports/DBCArray.md
+++ b/reports/DBCArray.md
@@ -281,7 +281,7 @@ Then, define `pdomain` as a 2×2 matrix of Dagger processors for block-cyclic di
 Create a `DBCArray` using the constructor with a random matrix:
 
 ```julia
-A = rand(Blocks(3, 3), 15, 15)
+A = rand(Blocks(3, 3), 15, 15);
 Adbc = DBCArray(A, pdomain);
 ```
 
@@ -295,10 +295,10 @@ Use the available utility functions to query properties:
 
 ```julia
 domain(Adbc)        # Returns the domain of the array
-chunks(Adbc)        # Returns the individual blocks
-domainchunks(Adbc)  # Returns domain-specific chunk mapping
+Dagger.chunks(Adbc)        # Returns the individual blocks
+Dagger.domainchunks(Adbc)  # Returns domain-specific chunk mapping
 size(Adbc)          # Returns array size
-pdomain(Adbc)       # Returns processor domain
+Dagger.pdomain(Adbc)       # Returns processor domain
 ```
 
 Use standard Base methods similar to how they work with DArray:

From 195a8444d5924ac772457ea7b9ae97351f4538d5 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Mon, 21 Apr 2025 19:21:00 +0530
Subject: [PATCH 09/36] Update DBCArray.md

---
 reports/DBCArray.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/reports/DBCArray.md b/reports/DBCArray.md
index 3b2871a06..a13501a0e 100644
--- a/reports/DBCArray.md
+++ b/reports/DBCArray.md
@@ -343,6 +343,10 @@ A = rand(Blocks(3, 3), 15, 15)
 Adbc = Dagger.DBCArray(A, pdomain)
 
 @everywhere logs = Dagger.fetch_logs!()
+
+open(raw"graph.dot", "w") do io
+    Dagger.show_logs(io, logs, :graphviz; disconnected=true)
+end
 ```
 
 However, the DOT graph generated didn't show processor assignment visually.

From c7bf6a0e49c4d10ad8ac3f3e62c0dbf4f55d7954 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Mon, 21 Apr 2025 20:00:48 +0530
Subject: [PATCH 10/36] Update DBCArray.md

---
 reports/DBCArray.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/reports/DBCArray.md b/reports/DBCArray.md
index a13501a0e..b34381999 100644
--- a/reports/DBCArray.md
+++ b/reports/DBCArray.md
@@ -6,7 +6,7 @@
 
 ## Main Goal
 
-The objective of this project was to add distributed linear algebra capabilities to Dagger.jl. This involved implementing operations such as matrix multiplication and factorizations using various data distribution schemes (cyclic, block-cyclic, 2D, 3D) that can run efficiently across multiple devices using the Dagger.jl APIs.
+The objective of this project is to add distributed linear algebra capabilities to Dagger.jl. This involves implementing operations such as matrix multiplication and factorizations using various data distribution schemes (cyclic, block-cyclic, 2D, 3D) that can run efficiently across multiple devices using the Dagger.jl APIs.
 
 ## Steps Toward Implementation
 

From 3a043a67ca60c14829943ab33f6ac8c78dae3404 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Tue, 22 Apr 2025 02:19:26 +0530
Subject: [PATCH 11/36] Update DBCArray.md

---
 reports/DBCArray.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/reports/DBCArray.md b/reports/DBCArray.md
index b34381999..245c16a74 100644
--- a/reports/DBCArray.md
+++ b/reports/DBCArray.md
@@ -2,7 +2,7 @@
 
 **Author**: Akhil Akkapelli
 
-**Mentor**: Julian Samaroo
+**Mentor**: Julian Samaroo, Rabab Alomairy
 
 ## Main Goal
 

From 5f4911883547c7a1bb75fefa055b7e3d6a2cb49c Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Thu, 24 Apr 2025 19:49:52 +0530
Subject: [PATCH 12/36] Update darray.jl

---
 src/array/darray.jl | 116 ++++++++++++++++++++++++++------------------
 1 file changed, 70 insertions(+), 46 deletions(-)

diff --git a/src/array/darray.jl b/src/array/darray.jl
index 37c61a936..e0635b0c6 100644
--- a/src/array/darray.jl
+++ b/src/array/darray.jl
@@ -47,10 +47,6 @@ function project(a::ArrayDomain, b::ArrayDomain)
     end |> ArrayDomain
 end
 
-function getindex(a::ArrayDomain, b::ArrayDomain)
-    ArrayDomain(map(getindex, indexes(a), indexes(b)))
-end
-
 """
     alignfirst(a) -> ArrayDomain
 
@@ -148,8 +144,7 @@ const DMatrix{T} = DArray{T,2}
 const DVector{T} = DArray{T,1}
 
 # mainly for backwards-compatibility
-DArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat) where {T,N} =
-    DArray(T, domain, subdomains, chunks, partitioning, concat)
+DArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat) where {T,N} = DArray(T, domain, subdomains, chunks, partitioning, concat)
 
 function DArray(T, domain::DArrayDomain{N},
                 subdomains::AbstractArray{DArrayDomain{N}, N},
@@ -160,6 +155,7 @@ end
 function DArray(T, domain::DArrayDomain{N},
                 subdomains::DArrayDomain{N},
                 chunks::Any, partitioning::B, concat=cat) where {N,B<:AbstractSingleBlocks{N}}
+
     _subdomains = Array{DArrayDomain{N}, N}(undef, ntuple(i->1, N)...)
     _subdomains[1] = subdomains
     _chunks = Array{Any, N}(undef, ntuple(i->1, N)...)
@@ -171,7 +167,7 @@ domain(d::DArray) = d.domain
 chunks(d::DArray) = d.chunks
 domainchunks(d::DArray) = d.subdomains
 size(x::DArray) = size(domain(x))
-stage(ctx, c::DArray) = c
+stage(ctx, c::DArray) =  c
 
 function Base.collect(d::DArray; tree=false)
     a = fetch(d)
@@ -207,6 +203,7 @@ Base.wait(A::DArray) = foreach(wait, A.chunks)
     end
 else
 =#
+
     struct ColorElement{T}
         color::Symbol
         value::Union{Some{T},Nothing}
@@ -317,8 +314,7 @@ function Base.similar(A::DArray{T,N} where T, ::Type{S}, dims::Dims{N}) where {S
     return _to_darray(a)
 end
 
-Base.copy(x::DArray{T,N,B,F}) where {T,N,B,F} =
-    map(identity, x)::DArray{T,N,B,F}
+Base.copy(x::DArray{T,N,B,F}) where {T,N,B,F} = map(identity, x)::DArray{T,N,B,F}
 
 # Because OrdinaryDiffEq uses `Base.promote_op(/, ::DArray, ::Real)`
 Base.:(/)(x::DArray{T,N,B,F}, y::U) where {T<:Real,U<:Real,N,B,F} =
@@ -419,6 +415,12 @@ struct Distribute{T,N,B<:AbstractBlocks} <: ArrayOp{T, N}
     domainchunks
     partitioning::B
     data::AbstractArray{T,N}
+    pgrid::Union{AbstractArray{<:Processor, N}, Nothing}
+
+    function Distribute(domainchunks, partitioning::B, data::AbstractArray{T,N}, pgrid::Union{AbstractArray{<:Processor, N}, Nothing} = nothing) where {T,N,B<:AbstractBlocks}
+        new{T,N,B}(domainchunks, partitioning, data, pgrid)
+    end
+
 end
 
 size(x::Distribute) = size(domain(x.data))
@@ -426,19 +428,19 @@ size(x::Distribute) = size(domain(x.data))
 Base.@deprecate BlockPartition Blocks
 
 
-Distribute(p::Blocks, data::AbstractArray) =
-    Distribute(partition(p, domain(data)), p, data)
+Distribute(p::Blocks, data::AbstractArray, pgrid::Union{AbstractArray{<:Processor},Nothing} = nothing) =
+    Distribute(partition(p, domain(data)), p, data, pgrid)
 
-function Distribute(domainchunks::DomainBlocks{N}, data::AbstractArray{T,N}) where {T,N}
+function Distribute(domainchunks::DomainBlocks{N}, data::AbstractArray{T,N}, pgrid::Union{AbstractArray{<:Processor, N},Nothing} = nothing) where {T,N}
     p = Blocks(ntuple(i->first(domainchunks.cumlength[i]), N))
-    Distribute(domainchunks, p, data)
+    Distribute(domainchunks, p, data, pgrid)
 end
 
-function Distribute(data::AbstractArray{T,N}) where {T,N}
+function Distribute(data::AbstractArray{T,N}, pgrid::Union{AbstractArray{<:Processor, N},Nothing} = nothing) where {T,N}
     nprocs = sum(w->length(Dagger.get_processors(OSProc(w))),
                  procs())
     p = Blocks(ntuple(i->max(cld(size(data, i), nprocs), 1), N))
-    return Distribute(partition(p, domain(data)), p, data)
+    return Distribute(partition(p, domain(data)), p, data, pgrid)
 end
 
 function stage(ctx::Context, d::Distribute)
@@ -451,11 +453,10 @@ function stage(ctx::Context, d::Distribute)
         Nd = ndims(x)
         T = eltype(d.data)
         concat = x.concat
-        cs = map(d.domainchunks) do idx
+        cs = map(CartesianIndices(d.domainchunks)) do I
+            idx = d.domainchunks[I]
             chunks = stage(ctx, x[idx]).chunks
             shape = size(chunks)
-            # TODO: fix hashing
-            #hash = uhash(idx, Base.hash(Distribute, Base.hash(d.data)))
             Dagger.spawn(shape, chunks...) do shape, parts...
                 if prod(shape) == 0
                     return Array{T}(undef, shape)
@@ -466,12 +467,16 @@ function stage(ctx::Context, d::Distribute)
             end
         end
     else
-        cs = map(d.domainchunks) do c
-            # TODO: fix hashing
-            #hash = uhash(c, Base.hash(Distribute, Base.hash(d.data)))
-            Dagger.@spawn identity(d.data[c])
+        cs = map(CartesianIndices(d.domainchunks)) do I
+            c = d.domainchunks[I]
+            if isnothing(d.pgrid)
+                Dagger.@spawn identity(d.data[c])
+            else
+                Dagger.@spawn scope=ExactScope(d.pgrid[CartesianIndex(mod1.(Tuple(I), size(d.pgrid))...)]) Dagger.tochunk(d.data[c], d.pgrid[CartesianIndex(mod1.(Tuple(I), size(d.pgrid))...)], ExactScope(d.pgrid[CartesianIndex(mod1.(Tuple(I), size(d.pgrid))...)]))
+            end
         end
     end
+    
     return DArray(eltype(d.data),
                   domain(d.data),
                   d.domainchunks,
@@ -479,6 +484,7 @@ function stage(ctx::Context, d::Distribute)
                   d.partitioning)
 end
 
+
 """
     AutoBlocks
 
@@ -486,37 +492,55 @@ Automatically determines the size and number of blocks for a distributed array.
 This may construct any kind of `Dagger.AbstractBlocks` partitioning.
 """
 struct AutoBlocks end
-function auto_blocks(dims::Dims{N}) where N
-    # TODO: Allow other partitioning schemes
-    np = num_processors()
-    p = N > 0 ? cld(dims[end], np) : 1
-    return Blocks(ntuple(i->i == N ? p : dims[i], N))
-end
+    function auto_blocks(dims::Dims{N}) where N
+        # TODO: Allow other partitioning schemes
+        p = N > 0 ? cld(dims[end], num_processors()) : 1
+        return Blocks(ntuple(i->i == N ? p : dims[i], N))
+    end
+
+
 auto_blocks(A::AbstractArray{T,N}) where {T,N} = auto_blocks(size(A))
 
-distribute(A::AbstractArray) = distribute(A, AutoBlocks())
-distribute(A::AbstractArray{T,N}, dist::Blocks{N}) where {T,N} =
-    _to_darray(Distribute(dist, A))
-distribute(A::AbstractArray, ::AutoBlocks) = distribute(A, auto_blocks(A))
-function distribute(x::AbstractArray{T,N}, n::NTuple{N}) where {T,N}
+distribute(A::AbstractArray, assignment::Union{Symbol, AbstractArray{<:Int}, AbstractArray{<:Processor}}) = distribute(A, AutoBlocks(); assignment)
+function distribute(A::AbstractArray{T,N}, dist::Blocks{N}, assignment::Union{Symbol, AbstractArray{<:Int, N}, AbstractArray{<:Processor, N}}) where {T,N} 
+    pgrid = nothing
+    if assignment isa Symbol
+        if assignment == :arbitrary
+            pgrid = nothing
+        elseif assignment == :blockcyclic
+            p = ntuple(i -> i == N ? num_processors() : 1, N)
+            pgrid = reshape([only(Dagger.get_processors(OSProc(i))) for i in procs()], p)
+        else
+            error("Unsupported assignment symbol: $assignment, use :arbitrary or :blockcyclic")
+        end
+    elseif assignment isa AbstractArray{<:Int, N}
+        pgrid = [only(Dagger.get_processors(OSProc(assignment[I]))) for I in CartesianIndices(assignment)]
+    elseif assignment isa AbstractArray{<:Processor, N}
+        pgrid = assignment
+    end
+
+    return _to_darray(Distribute(dist, A, pgrid))
+end
+
+distribute(A::AbstractArray, ::AutoBlocks, assignment::Union{Symbol, AbstractArray{<:Int}, AbstractArray{<:Processor}}) = distribute(A, auto_blocks(A), assignment)
+function distribute(x::AbstractArray{T,N}, n::NTuple{N}, assignment::Union{Symbol, AbstractArray{<:Int, N}, AbstractArray{<:Processor, N}} = :arbitrary) where {T,N}
     p = map((d, dn)->ceil(Int, d / dn), size(x), n)
-    distribute(x, Blocks(p))
+    distribute(x, Blocks(p), assignment)
 end
-distribute(x::AbstractVector, n::Int) = distribute(x, (n,))
-distribute(x::AbstractVector, n::Vector{<:Integer}) =
-    distribute(x, DomainBlocks((1,), (cumsum(n),)))
+distribute(x::AbstractVector, n::Int, assignment::Union{Symbol, AbstractArray{<:Int, 1}, AbstractArray{<:Processor, 1}} = :arbitrary) = distribute(x, (n,), assignment)
+distribute(x::AbstractVector, n::Vector{<:Integer}, assignment::Union{Symbol, AbstractArray{<:Int, 1}, AbstractArray{<:Processor, 1}} = :arbitrary) = distribute(x, DomainBlocks((1,), (cumsum(n),)), assignment)
 
-DVector(A::AbstractVector{T}, part::Blocks{1}) where T = distribute(A, part)
-DMatrix(A::AbstractMatrix{T}, part::Blocks{2}) where T = distribute(A, part)
-DArray(A::AbstractArray{T,N}, part::Blocks{N}) where {T,N} = distribute(A, part)
+DVector(A::AbstractVector{T}, part::Blocks{1}, assignment::Union{Symbol, AbstractArray{<:Int, 1}, AbstractArray{<:Processor, 1}} = :arbitrary) where T = distribute(A, part, assignment)
+DMatrix(A::AbstractMatrix{T}, part::Blocks{2}, assignment::Union{Symbol, AbstractArray{<:Int, 2}, AbstractArray{<:Processor, 2}} = :arbitrary) where T = distribute(A, part, assignment)
+DArray(A::AbstractArray{T,N}, part::Blocks{N}, assignment::Union{Symbol, AbstractArray{<:Int, N}, AbstractArray{<:Processor, N}} = :arbitrary) where {T,N} = distribute(A, part, assignment)
 
-DVector(A::AbstractVector{T}) where T = DVector(A, AutoBlocks())
-DMatrix(A::AbstractMatrix{T}) where T = DMatrix(A, AutoBlocks())
-DArray(A::AbstractArray) = DArray(A, AutoBlocks())
+DVector(A::AbstractVector{T}, assignment::Union{Symbol, AbstractArray{<:Int, 1}, AbstractArray{<:Processor, 1}} = :arbitrary) where T = DVector(A, AutoBlocks(), assignment)
+DMatrix(A::AbstractMatrix{T}, assignment::Union{Symbol, AbstractArray{<:Int, 2}, AbstractArray{<:Processor, 2}} = :arbitrary) where T = DMatrix(A, AutoBlocks(), assignment)
+DArray(A::AbstractArray, assignment::Union{Symbol, AbstractArray{<:Int}, AbstractArray{<:Processor}} = :arbitrary) = DArray(A, AutoBlocks(), assignment) 
 
-DVector(A::AbstractVector{T}, ::AutoBlocks) where T = DVector(A, auto_blocks(A))
-DMatrix(A::AbstractMatrix{T}, ::AutoBlocks) where T = DMatrix(A, auto_blocks(A))
-DArray(A::AbstractArray, ::AutoBlocks) = DArray(A, auto_blocks(A))
+DVector(A::AbstractVector{T}, ::AutoBlocks, assignment::Union{Symbol, AbstractArray{<:Int, 1}, AbstractArray{<:Processor, 1}} = :arbitrary) where T = DVector(A, auto_blocks(A), assignment)
+DMatrix(A::AbstractMatrix{T}, ::AutoBlocks, assignment::Union{Symbol, AbstractArray{<:Int, 2}, AbstractArray{<:Processor, 2}} = :arbitrary) where T = DMatrix(A, auto_blocks(A), assignment)
+DArray(A::AbstractArray, ::AutoBlocks, assignment::Union{Symbol, AbstractArray{<:Int}, AbstractArray{<:Processor}} = :arbitrary) = DArray(A, auto_blocks(A), assignment)
 
 function Base.:(==)(x::ArrayOp{T,N}, y::AbstractArray{S,N}) where {T,S,N}
     collect(x) == y

From 7bf3428a272419744e4813c52a8cd04d8540c029 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Thu, 24 Apr 2025 19:51:18 +0530
Subject: [PATCH 13/36] Update Dagger.jl

---
 src/Dagger.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Dagger.jl b/src/Dagger.jl
index 013fc2710..a718373df 100644
--- a/src/Dagger.jl
+++ b/src/Dagger.jl
@@ -84,7 +84,7 @@ include("stream-transfer.jl")
 
 # Array computations
 include("array/darray.jl")
-include("array/dbcarray.jl")
+#include("array/dbcarray.jl")
 include("array/alloc.jl")
 include("array/map-reduce.jl")
 include("array/copy.jl")

From 42e6c29d019c0181483ef093abdc7e0832a6ec82 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Thu, 24 Apr 2025 19:55:03 +0530
Subject: [PATCH 14/36] Update darray.jl

---
 src/array/darray.jl | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/array/darray.jl b/src/array/darray.jl
index e0635b0c6..e4469983c 100644
--- a/src/array/darray.jl
+++ b/src/array/darray.jl
@@ -47,6 +47,10 @@ function project(a::ArrayDomain, b::ArrayDomain)
     end |> ArrayDomain
 end
 
+function getindex(a::ArrayDomain, b::ArrayDomain)
+    ArrayDomain(map(getindex, indexes(a), indexes(b)))
+end
+
 """
     alignfirst(a) -> ArrayDomain
 
@@ -144,7 +148,8 @@ const DMatrix{T} = DArray{T,2}
 const DVector{T} = DArray{T,1}
 
 # mainly for backwards-compatibility
-DArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat) where {T,N} = DArray(T, domain, subdomains, chunks, partitioning, concat)
+DArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat) where {T,N} = 
+    DArray(T, domain, subdomains, chunks, partitioning, concat)
 
 function DArray(T, domain::DArrayDomain{N},
                 subdomains::AbstractArray{DArrayDomain{N}, N},
@@ -155,7 +160,6 @@ end
 function DArray(T, domain::DArrayDomain{N},
                 subdomains::DArrayDomain{N},
                 chunks::Any, partitioning::B, concat=cat) where {N,B<:AbstractSingleBlocks{N}}
-
     _subdomains = Array{DArrayDomain{N}, N}(undef, ntuple(i->1, N)...)
     _subdomains[1] = subdomains
     _chunks = Array{Any, N}(undef, ntuple(i->1, N)...)
@@ -163,11 +167,21 @@ function DArray(T, domain::DArrayDomain{N},
     DArray{T,N,B,typeof(concat)}(domain, _subdomains, _chunks, partitioning, concat)
 end
 
+# function DArray(d::DArray, assignment::Symbol= :arbitrary)
+#     if dist_type == :arbitrary
+#         DArray(d)
+#     elseif dist_type == :blockcyclic
+#         DArray(d)
+#     else
+#         error("Unknown assignment: $assignment, can be :arbitrary or :blockcyclic")
+#     end
+# end
+
 domain(d::DArray) = d.domain
 chunks(d::DArray) = d.chunks
 domainchunks(d::DArray) = d.subdomains
 size(x::DArray) = size(domain(x))
-stage(ctx, c::DArray) =  c
+stage(ctx, c::DArray) = c
 
 function Base.collect(d::DArray; tree=false)
     a = fetch(d)

From 2934e19562a83ec5858fd01b008bf8cf8b1e597e Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Fri, 25 Apr 2025 15:26:32 +0530
Subject: [PATCH 15/36] Update darray.jl

---
 src/array/darray.jl | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/array/darray.jl b/src/array/darray.jl
index e4469983c..f118935e4 100644
--- a/src/array/darray.jl
+++ b/src/array/darray.jl
@@ -486,7 +486,9 @@ function stage(ctx::Context, d::Distribute)
             if isnothing(d.pgrid)
                 Dagger.@spawn identity(d.data[c])
             else
-                Dagger.@spawn scope=ExactScope(d.pgrid[CartesianIndex(mod1.(Tuple(I), size(d.pgrid))...)]) Dagger.tochunk(d.data[c], d.pgrid[CartesianIndex(mod1.(Tuple(I), size(d.pgrid))...)], ExactScope(d.pgrid[CartesianIndex(mod1.(Tuple(I), size(d.pgrid))...)]))
+                proc =  d.pgrid[CartesianIndex(mod1.(Tuple(I), size(d.pgrid))...)]
+                scope = ExactScope(proc)
+                Dagger.@spawn scope=scope Dagger.tochunk(d.data[c], proc, scope)
             end
         end
     end
@@ -515,8 +517,8 @@ struct AutoBlocks end
 
 auto_blocks(A::AbstractArray{T,N}) where {T,N} = auto_blocks(size(A))
 
-distribute(A::AbstractArray, assignment::Union{Symbol, AbstractArray{<:Int}, AbstractArray{<:Processor}}) = distribute(A, AutoBlocks(); assignment)
-function distribute(A::AbstractArray{T,N}, dist::Blocks{N}, assignment::Union{Symbol, AbstractArray{<:Int, N}, AbstractArray{<:Processor, N}}) where {T,N} 
+distribute(A::AbstractArray, assignment::Union{Symbol, AbstractArray{<:Int}, AbstractArray{<:Processor}} = :arbitrary) = distribute(A, AutoBlocks(); assignment)
+function distribute(A::AbstractArray{T,N}, dist::Blocks{N}, assignment::Union{Symbol, AbstractArray{<:Int, N}, AbstractArray{<:Processor, N}} = :arbitrary) where {T,N} 
     pgrid = nothing
     if assignment isa Symbol
         if assignment == :arbitrary
@@ -536,7 +538,7 @@ function distribute(A::AbstractArray{T,N}, dist::Blocks{N}, assignment::Union{Sy
     return _to_darray(Distribute(dist, A, pgrid))
 end
 
-distribute(A::AbstractArray, ::AutoBlocks, assignment::Union{Symbol, AbstractArray{<:Int}, AbstractArray{<:Processor}}) = distribute(A, auto_blocks(A), assignment)
+distribute(A::AbstractArray, ::AutoBlocks, assignment::Union{Symbol, AbstractArray{<:Int}, AbstractArray{<:Processor}} = :arbitrary) = distribute(A, auto_blocks(A), assignment)
 function distribute(x::AbstractArray{T,N}, n::NTuple{N}, assignment::Union{Symbol, AbstractArray{<:Int, N}, AbstractArray{<:Processor, N}} = :arbitrary) where {T,N}
     p = map((d, dn)->ceil(Int, d / dn), size(x), n)
     distribute(x, Blocks(p), assignment)

From a96ab02812ba7b5651a35ff43afc8fb4022cd7ea Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Fri, 25 Apr 2025 15:27:39 +0530
Subject: [PATCH 16/36] Update darray.jl

---
 src/array/darray.jl | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/array/darray.jl b/src/array/darray.jl
index f118935e4..1bbc1bb0c 100644
--- a/src/array/darray.jl
+++ b/src/array/darray.jl
@@ -430,11 +430,6 @@ struct Distribute{T,N,B<:AbstractBlocks} <: ArrayOp{T, N}
     partitioning::B
     data::AbstractArray{T,N}
     pgrid::Union{AbstractArray{<:Processor, N}, Nothing}
-
-    function Distribute(domainchunks, partitioning::B, data::AbstractArray{T,N}, pgrid::Union{AbstractArray{<:Processor, N}, Nothing} = nothing) where {T,N,B<:AbstractBlocks}
-        new{T,N,B}(domainchunks, partitioning, data, pgrid)
-    end
-
 end
 
 size(x::Distribute) = size(domain(x.data))

From c2975b719ae3652fcf4ef8eed9f33e5346e97457 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Sat, 26 Apr 2025 01:23:26 +0530
Subject: [PATCH 17/36] Update darray.jl

---
 src/array/darray.jl | 74 +++++++++++++++++++++------------------------
 1 file changed, 35 insertions(+), 39 deletions(-)

diff --git a/src/array/darray.jl b/src/array/darray.jl
index 1bbc1bb0c..c2737daa9 100644
--- a/src/array/darray.jl
+++ b/src/array/darray.jl
@@ -148,7 +148,7 @@ const DMatrix{T} = DArray{T,2}
 const DVector{T} = DArray{T,1}
 
 # mainly for backwards-compatibility
-DArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat) where {T,N} = 
+DArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat) where {T,N} =
     DArray(T, domain, subdomains, chunks, partitioning, concat)
 
 function DArray(T, domain::DArrayDomain{N},
@@ -167,16 +167,6 @@ function DArray(T, domain::DArrayDomain{N},
     DArray{T,N,B,typeof(concat)}(domain, _subdomains, _chunks, partitioning, concat)
 end
 
-# function DArray(d::DArray, assignment::Symbol= :arbitrary)
-#     if dist_type == :arbitrary
-#         DArray(d)
-#     elseif dist_type == :blockcyclic
-#         DArray(d)
-#     else
-#         error("Unknown assignment: $assignment, can be :arbitrary or :blockcyclic")
-#     end
-# end
-
 domain(d::DArray) = d.domain
 chunks(d::DArray) = d.chunks
 domainchunks(d::DArray) = d.subdomains
@@ -217,7 +207,6 @@ Base.wait(A::DArray) = foreach(wait, A.chunks)
     end
 else
 =#
-
     struct ColorElement{T}
         color::Symbol
         value::Union{Some{T},Nothing}
@@ -328,7 +317,8 @@ function Base.similar(A::DArray{T,N} where T, ::Type{S}, dims::Dims{N}) where {S
     return _to_darray(a)
 end
 
-Base.copy(x::DArray{T,N,B,F}) where {T,N,B,F} = map(identity, x)::DArray{T,N,B,F}
+Base.copy(x::DArray{T,N,B,F}) where {T,N,B,F} =
+    map(identity, x)::DArray{T,N,B,F}
 
 # Because OrdinaryDiffEq uses `Base.promote_op(/, ::DArray, ::Real)`
 Base.:(/)(x::DArray{T,N,B,F}, y::U) where {T<:Real,U<:Real,N,B,F} =
@@ -429,7 +419,7 @@ struct Distribute{T,N,B<:AbstractBlocks} <: ArrayOp{T, N}
     domainchunks
     partitioning::B
     data::AbstractArray{T,N}
-    pgrid::Union{AbstractArray{<:Processor, N}, Nothing}
+    procgrid::Union{AbstractArray{<:Processor, N}, Nothing}
 end
 
 size(x::Distribute) = size(domain(x.data))
@@ -437,19 +427,18 @@ size(x::Distribute) = size(domain(x.data))
 Base.@deprecate BlockPartition Blocks
 
 
-Distribute(p::Blocks, data::AbstractArray, pgrid::Union{AbstractArray{<:Processor},Nothing} = nothing) =
-    Distribute(partition(p, domain(data)), p, data, pgrid)
+Distribute(p::Blocks, data::AbstractArray, procgrid::Union{AbstractArray{<:Processor},Nothing} = nothing) =
+    Distribute(partition(p, domain(data)), p, data, procgrid)
 
-function Distribute(domainchunks::DomainBlocks{N}, data::AbstractArray{T,N}, pgrid::Union{AbstractArray{<:Processor, N},Nothing} = nothing) where {T,N}
+function Distribute(domainchunks::DomainBlocks{N}, data::AbstractArray{T,N}, procgrid::Union{AbstractArray{<:Processor, N},Nothing} = nothing) where {T,N}
     p = Blocks(ntuple(i->first(domainchunks.cumlength[i]), N))
-    Distribute(domainchunks, p, data, pgrid)
+    Distribute(domainchunks, p, data, procgrid)
 end
 
-function Distribute(data::AbstractArray{T,N}, pgrid::Union{AbstractArray{<:Processor, N},Nothing} = nothing) where {T,N}
-    nprocs = sum(w->length(Dagger.get_processors(OSProc(w))),
-                 procs())
+function Distribute(data::AbstractArray{T,N}, procgrid::Union{AbstractArray{<:Processor, N},Nothing} = nothing) where {T,N}
+    nprocs = sum(w->length(get_processors(OSProc(w))),procs())
     p = Blocks(ntuple(i->max(cld(size(data, i), nprocs), 1), N))
-    return Distribute(partition(p, domain(data)), p, data, pgrid)
+    return Distribute(partition(p, domain(data)), p, data, procgrid)
 end
 
 function stage(ctx::Context, d::Distribute)
@@ -466,6 +455,8 @@ function stage(ctx::Context, d::Distribute)
             idx = d.domainchunks[I]
             chunks = stage(ctx, x[idx]).chunks
             shape = size(chunks)
+            # TODO: fix hashing
+            #hash = uhash(idx, Base.hash(Distribute, Base.hash(d.data)))
             Dagger.spawn(shape, chunks...) do shape, parts...
                 if prod(shape) == 0
                     return Array{T}(undef, shape)
@@ -477,11 +468,13 @@ function stage(ctx::Context, d::Distribute)
         end
     else
         cs = map(CartesianIndices(d.domainchunks)) do I
+            # TODO: fix hashing
+            #hash = uhash(c, Base.hash(Distribute, Base.hash(d.data)))
             c = d.domainchunks[I]
-            if isnothing(d.pgrid)
+            if isnothing(d.procgrid)
                 Dagger.@spawn identity(d.data[c])
             else
-                proc =  d.pgrid[CartesianIndex(mod1.(Tuple(I), size(d.pgrid))...)]
+                proc =  d.procgrid[CartesianIndex(mod1.(Tuple(I), size(d.procgrid))...)]
                 scope = ExactScope(proc)
                 Dagger.@spawn scope=scope Dagger.tochunk(d.data[c], proc, scope)
             end
@@ -495,7 +488,6 @@ function stage(ctx::Context, d::Distribute)
                   d.partitioning)
 end
 
-
 """
     AutoBlocks
 
@@ -503,34 +495,38 @@ Automatically determines the size and number of blocks for a distributed array.
 This may construct any kind of `Dagger.AbstractBlocks` partitioning.
 """
 struct AutoBlocks end
-    function auto_blocks(dims::Dims{N}) where N
-        # TODO: Allow other partitioning schemes
-        p = N > 0 ? cld(dims[end], num_processors()) : 1
-        return Blocks(ntuple(i->i == N ? p : dims[i], N))
-    end
-
-
+function auto_blocks(dims::Dims{N}) where N
+    # TODO: Allow other partitioning schemes
+    np = num_processors()
+    p = N > 0 ? cld(dims[end], np) : 1
+    return Blocks(ntuple(i->i == N ? p : dims[i], N))
+end
 auto_blocks(A::AbstractArray{T,N}) where {T,N} = auto_blocks(size(A))
 
-distribute(A::AbstractArray, assignment::Union{Symbol, AbstractArray{<:Int}, AbstractArray{<:Processor}} = :arbitrary) = distribute(A, AutoBlocks(); assignment)
+distribute(A::AbstractArray, assignment::Union{Symbol, AbstractArray{<:Int}, AbstractArray{<:Processor}} = :arbitrary) = distribute(A, AutoBlocks(), assignment)
 function distribute(A::AbstractArray{T,N}, dist::Blocks{N}, assignment::Union{Symbol, AbstractArray{<:Int, N}, AbstractArray{<:Processor, N}} = :arbitrary) where {T,N} 
-    pgrid = nothing
+    procgrid = nothing
     if assignment isa Symbol
         if assignment == :arbitrary
-            pgrid = nothing
+            procgrid = nothing
         elseif assignment == :blockcyclic
             p = ntuple(i -> i == N ? num_processors() : 1, N)
-            pgrid = reshape([only(Dagger.get_processors(OSProc(i))) for i in procs()], p)
+            procgrid = reshape([proc for i in procs() for proc in get_processors(OSProc(i))], p)
         else
             error("Unsupported assignment symbol: $assignment, use :arbitrary or :blockcyclic")
         end
     elseif assignment isa AbstractArray{<:Int, N}
-        pgrid = [only(Dagger.get_processors(OSProc(assignment[I]))) for I in CartesianIndices(assignment)]
+        missingprocs = filter(p -> p ∉ procs(), assignment)
+        isempty(missingprocs) || error("Missing processors: $missingprocs")
+        procgrid = [first(get_processors(OSProc(proc))) for proc in assignment]
     elseif assignment isa AbstractArray{<:Processor, N}
-        pgrid = assignment
+        availprocs = [proc for i in procs() for proc in get_processors(OSProc(i))]
+        missingprocs = filter(p -> p ∉ availprocs, assignment)
+        isempty(missingprocs) || error("Missing processors: $missingprocs")
+        procgrid = assignment
     end
 
-    return _to_darray(Distribute(dist, A, pgrid))
+    return _to_darray(Distribute(dist, A, procgrid))
 end
 
 distribute(A::AbstractArray, ::AutoBlocks, assignment::Union{Symbol, AbstractArray{<:Int}, AbstractArray{<:Processor}} = :arbitrary) = distribute(A, auto_blocks(A), assignment)

From e4dcb87f818683dbe1b70f2ff9d2a21d19324bf5 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Sat, 26 Apr 2025 01:23:51 +0530
Subject: [PATCH 18/36] Update Dagger.jl

---
 src/Dagger.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Dagger.jl b/src/Dagger.jl
index a718373df..fd6395a4b 100644
--- a/src/Dagger.jl
+++ b/src/Dagger.jl
@@ -84,7 +84,6 @@ include("stream-transfer.jl")
 
 # Array computations
 include("array/darray.jl")
-#include("array/dbcarray.jl")
 include("array/alloc.jl")
 include("array/map-reduce.jl")
 include("array/copy.jl")

From 0b0ca0f9283e030e0d2fb3e175fb3102e149e2e0 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Sat, 26 Apr 2025 01:25:45 +0530
Subject: [PATCH 19/36] Delete src/array/dbcarray.jl

---
 src/array/dbcarray.jl | 167 ------------------------------------------
 1 file changed, 167 deletions(-)
 delete mode 100644 src/array/dbcarray.jl

diff --git a/src/array/dbcarray.jl b/src/array/dbcarray.jl
deleted file mode 100644
index 9c94892e1..000000000
--- a/src/array/dbcarray.jl
+++ /dev/null
@@ -1,167 +0,0 @@
-export DBCArray, DBCVector, DBCMatrix
-
-### darray.jl
-
-mutable struct DBCArray{T,N,B,F} <: ArrayOp{T, N}
-    darray::DArray{T,N,B,F}
-    pdomain::AbstractArray{Dagger.Processor, N}
-    # function DBCArray{T,N,B,F}(domain, subdomains, chunks, partitioning::B, concat::Function, pdomain) where {T,N,B,F}
-    #     new{T,N,B,F}(domain, subdomains, chunks, partitioning, concat, pdomain)
-    # end
-end
-
-const WrappedDBCArray{T,N} = Union{<:DBCArray{T,N}, Transpose{<:DBCArray{T,N}}, Adjoint{<:DBCArray{T,N}}}
-const WrappedDBCMatrix{T} = WrappedDBCArray{T,2}
-const WrappedDBCVector{T} = WrappedDBCArray{T,1}
-const DBCMatrix{T} = DBCArray{T,2}
-const DBCVector{T} = DBCArray{T,1}
-
-# DBCArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat, pdomain) where {T,N} = DBCArray(T, domain, subdomains, chunks, partitioning, concat)
-
-# function DArray(T, domain::DArrayDomain{N},
-#     subdomains::AbstractArray{DArrayDomain{N}, N},
-#     chunks::AbstractArray{<:Any, N}, partitioning::B, concat=cat) where {N,B<:AbstractBlocks{N}}
-# DArray{T,N,B,typeof(concat)}(domain, subdomains, chunks, partitioning, concat)
-# end
-
-# function DArray(T, domain::DArrayDomain{N},
-    #     subdomains::DArrayDomain{N},
-    #     chunks::Any, partitioning::B, concat=cat) where {N,B<:AbstractSingleBlocks{N}}
-    # _subdomains = Array{DArrayDomain{N}, N}(undef, ntuple(i->1, N)...)
-    # _subdomains[1] = subdomains
-    # _chunks = Array{Any, N}(undef, ntuple(i->1, N)...)
-    # _chunks[1] = chunks
-    # DArray{T,N,B,typeof(concat)}(domain, _subdomains, _chunks, partitioning, concat)
-# end
-
-function DBCArray(A::DArray{T,N,B,F}, pdomain::AbstractArray{Dagger.Processor, N}) where {T,N,B,F}
-    
-    all_procs = collect(Iterators.flatten(Dagger.get_processors(OSProc(w)) for w in procs()))
-    missing = filter(p -> p ∉ all_procs, pdomain)
-    isempty(missing) || error("Missing processors: $missing")
-    
-    Ac = fetch(A.chunks)
-    Ac_copy = similar(A.chunks)
-
-    for idx in CartesianIndices(A.chunks)
-        proc = pdomain[mod1.(Tuple(idx), size(pdomain))...]
-        Ac_copy[idx] = Dagger.@spawn scope=Dagger.ExactScope(proc) identity(Ac[idx])
-        # new_chunks[idx] = Dagger.@spawn scope=Dagger.ExactScope(proc) Dagger.tochunk(old_chunks[idx], proc)
-    end
-    
-    # Construct new DArray with updated chunks
-    A_copy = DArray{T,N,B,F}(A.domain, A.subdomains, Ac_copy, A.partitioning, A.concat)
-
-    return DBCArray{T,N,B,F}(A_copy, pdomain)
-end
-
-domain(d::DBCArray) = domain(d.darray)
-chunks(d::DBCArray) = chunks(d.darray)
-domainchunks(d::DBCArray) = domainchunks(d.darray)
-size(x::DBCArray) = size(domain(x))
-stage(ctx, c::DBCArray) = stage(ctx, c.darray)
-pdomain(A::DBCArray) = A.pdomain
-
-function Base.collect(d::DBCArray; tree=false)
-    return collect(d.darray; tree=tree)
-end
-
-Base.wait(A::DBCArray) = wait(A.darray.chunks)
-
-function Base.show(io::IO, ::MIME"text/plain", A::DBCArray{T,N,B,F}) where {T,N,B,F}
-    nparts = N > 0 ? size(A.darray.chunks) : 1
-    partsize = N > 0 ? A.darray.partitioning.blocksize : 1
-    nprocs = N > 0 ? size(A.pdomain) : 1
-    write(io, " with $(join(nparts, 'x')) partitions of size $(join(partsize, 'x')) distributed to $(join(nprocs, 'x')) processors:")
-    pct_complete = 100 * (sum(c->c isa Chunk ? true : isready(c), A.darray.chunks) / length(A.darray.chunks))
-    if pct_complete < 100
-        println(io)
-        printstyled(io, "~$(round(Int, pct_complete))% completed"; color=:yellow)
-    end
-    println(io)
-    Base.print_array(IOContext(io, :compact=>true), ColorArray(A.darray))
-end
-
-# function Base.similar(A::DArray{T,N} where T, ::Type{S}, dims::Dims{N}) where {S,N}
-#     d = ArrayDomain(map(x->1:x, dims))
-#     p = A.partitioning
-#     a = AllocateArray(S, AllocateUndef{S}(), false, d, partition(p, d), p)
-#     return _to_darray(a)
-# end
-
-Base.copy(x::DBCArray{T,N,B,F}) where {T,N,B,F} =  DBCArray{T,N,B,F}(x.darray, x.pdomain)
-
-Base.:(/)(x::DBCArray{T,N,B,F}, y::U) where {T<:Real, U<:Real, N, B, F} = DBCArray(x.darray / y, x.pdomain)
-
-# function Base.view(c::DArray, d)
-#     subchunks, subdomains = lookup_parts(c, chunks(c), domainchunks(c), d)
-#     d1 = alignfirst(d)
-#     DArray(eltype(c), d1, subdomains, subchunks, c.partitioning, c.concat)
-# end
-
-# Base.fetch(c::DBCArray{T,N,B,F}) where {T,N,B,F} = c
-
-# function Base.:(==)(x::ArrayOp{T,N}, y::AbstractArray{S,N}) where {T,S,N}
-#     collect(x) == y
-# end
-
-# function Base.:(==)(x::AbstractArray{T,N}, y::ArrayOp{S,N}) where {T,S,N}
-#     return collect(x) == y
-# end
-
-function logs_annotate!(ctx::Context, A::DBCArray, name::Union{String,Symbol})
-    for (idx, chunk) in enumerate(A.darray.chunks)
-        sd = A.subdomains[idx]
-        Dagger.logs_annotate!(ctx, chunk, name*'['*join(sd.indexes, ',')*']')
-    end
-end
-
-# function mapchunks(f, d::DArray{T,N,F}) where {T,N,F}
-#     chunks = map(d.chunks) do chunk
-#         owner = get_parent(chunk.processor).pid
-#         remotecall_fetch(mapchunk, owner, f, chunk)
-#     end
-#     DArray{T,N,F}(d.domain, d.subdomains, chunks, d.concat)
-# end
-
-
-### matrix.jl
-
-function copydiag(f, A::DBCArray{T, 2}) where T
-    Ac = A.darray.chunks
-    Ac_copy = Matrix{Any}(undef, size(Ac, 2), size(Ac, 1))
-    _copytile(f, Ac) = copy(f(Ac))
-    for idx in CartesianIndices(Ac)
-        proc = A.pdomain[mod1.(Tuple(idx), size(A.pdomain))...]
-        Ac_copy[idx'] = Dagger.@spawn scope=Dagger.ExactScope(proc) _copytile(f, Ac[idx])
-    end
-    Ad_copy = DArray{T,N,B,F}(ArrayDomain(1:size(A,2), 1:size(A,1)), A.darray.subdomains', Ac_copy, A.darray.partitioning, A.darray.concat)
-    return DBCArray(Ad_copy, A.pdomain)
-end
-
-Base.fetch(A::Adjoint{T, <:DBCArray{T, 2}}) where T = copydiag(Adjoint, parent(A))
-Base.fetch(A::Transpose{T, <:DBCArray{T, 2}}) where T = copydiag(Transpose, parent(A))
-Base.copy(A::Adjoint{T, <:DBCArray{T, 2}}) where T = fetch(A)
-Base.copy(A::Transpose{T, <:DBCArray{T, 2}}) where T = fetch(A)
-Base.collect(A::Adjoint{T, <:DBCArray{T, 2}}) where T = collect(copy(A))
-Base.collect(A::Transpose{T, <:DBCArray{T, 2}}) where T = collect(copy(A))
-
-(*)(a::DBCArray, b::Vector) = DBCArray((a.darray)*b, a.pdomain)
-
-# Base.power_by_squaring(x::DBCArray{T,N,B,F}, i::Int) where {T,N,B,F} = foldl(*, ntuple(_ -> x, i))
-
-
-# indexing.jl
-
-Base.getindex(A::DBCArray{T,N}, idx::NTuple{N,Int}) where {T,N} = getindex(A.darray, idx)
-
-Base.getindex(A::DBCArray, idx::Integer...) = getindex(A.darray, idx)
-Base.getindex(A::DBCArray, idx::Integer) = getindex(A.darray, idx)
-Base.getindex(A::DBCArray, idx::CartesianIndex) = getindex(A.darray, idx)
-Base.getindex(A::DBCArray{T,N}, idxs::Dims{S}) where {T,N,S} = getindex(A.darray, idxs)
-
-Base.setindex!(A::DBCArray{T,N}, value, idx::NTuple{N,Int}) where {T,N} = setindex!(A.darray, value, idx)
-Base.setindex!(A::DBCArray, value, idx::Integer...) = setindex!(A.darray, value, idx)
-Base.setindex!(A::DArray, value, idx::Integer) = setindex!(A.darray, value,idx)
-Base.setindex!(A::DArray, value, idx::CartesianIndex) = setindex!(A.darray, value, idx)
-Base.setindex!(A::DBCArray{T,N}, value, idxs::Dims{S}) where {T,N,S} = setindex!(A.darray, value, idxs)

From 207cf4c2ef37a6ce9bef3243e63d49de47f9dd4f Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Sat, 26 Apr 2025 01:29:26 +0530
Subject: [PATCH 20/36] Delete reports directory

---
 reports/DBCArray.md | 358 --------------------------------------------
 1 file changed, 358 deletions(-)
 delete mode 100644 reports/DBCArray.md

diff --git a/reports/DBCArray.md b/reports/DBCArray.md
deleted file mode 100644
index 245c16a74..000000000
--- a/reports/DBCArray.md
+++ /dev/null
@@ -1,358 +0,0 @@
-# GSoC 2025 Report: Distributed Linear Algebra with Dagger.jl
-
-**Author**: Akhil Akkapelli
-
-**Mentor**: Julian Samaroo, Rabab Alomairy
-
-## Main Goal
-
-The objective of this project is to add distributed linear algebra capabilities to Dagger.jl. This involved implementing operations such as matrix multiplication and factorizations using various data distribution schemes (cyclic, block-cyclic, 2D, 3D) that can run efficiently across multiple devices using the Dagger.jl APIs.
-
-## Steps Toward Implementation
-
-### 1. Background Study and Design
-
-- Study the Dagger.jl documentation to understand its architecture and design principles.
-- Explore relevant source code files and become familiar with the internal mechanisms and how different components interact.
-
-### 2. Matrix Distribution Infrastructure
-
-- Develop a system to distribute `DArray` chunks using a block-cyclic layout to specific processor blocks.
-- Introduce new constructors and helper functions to ensure that each chunk runs exclusively on its assigned processor.
-- Update the scheduling logic to maintain fixed processor scopes during execution.
-- Integrate the distribution logic with Dagger's scheduler to manage task dependencies and execute operations correctly across the processor grid.
-
-### 3. Matrix Operations
-
-- Implement fundamental matrix operations such as `Adjoint`, `Transpose`, `*`, `+`, and `MatMul`, ensuring that operations respect processor assignments.
-- Add support for essential linear algebra routines like `norm2`, `issymmetric`, `ishermitian`, and factorizations including `lu` (with and without pivoting) and `cholesky`.
-- Ensure accurate indexing and chunk-to-processor mapping throughout execution.
-
-### 4. Testing and Performance Evaluation
-
-- Develop a comprehensive test suite to validate the correctness of operations for various matrix shapes, sizes, and distribution patterns.
-- Ensure that all operations execute efficiently and correctly across multiple devices, including GPUs.
-
-### 5. Documentation and Examples
-
-- Write detailed documentation with examples demonstrating supported operations and various data layouts.
-- Add inline comments within the codebase to clarify the implementation logic and data flow.
-
-# Explicit Processor Mapping of DArray Blocks in Block-Cyclic Manner with `DBCArray`
-
-## Objective
-
-In block-cyclic layouts, block-to-processor assignments are typically handled by the scheduler, which may result in inefficient allocations. The goal here is to explicitly control block allocation by setting a processor "scope" during the block assignment process.
-
-## Approaches
-
-There are two possible approaches:
-
-1. **Integrate directly into `DArray`**: Modify the existing `DArray` structure and core logic to support explicit processor mapping, which requires substantial changes.
-
-2. **Create a new struct `DBCArray`**: Define a new structure extending `DArray`, reusing its features while adding new functions with minimal changes to existing code.
-
-I am currently pursuing the second method as it is modular, less invasive, easier to test and maintain, and can later be integrated into `DArray` if needed.
-
-## Implementation
-
-Development is maintained in my fork of Dagger.jl at [https://github.com/AkhilAkkapelli/Dagger.jl](https://github.com/AkhilAkkapelli/Dagger.jl), with all changes made in the `dbcarray.jl` file under the `src/array` folder. The `DBCArray` implementation incrementally incorporates functionality from various files in the codebase as outlined below. Some functions are not yet implemented/tested and are currently commented out.
-
-### darray.jl
-
-We enhance DArray by introducing a new struct, `DBCArray`, which holds `darray` and `pdomain`. The `pdomain` is an array of `Dagger.Processor` type. Each block in the `darray` is mapped to these processors in a block-cyclic pattern. Below is the `DBCArray` struct definition:
-
-```julia
-# Define DBCArray struct, wrapping a DArray and processor domain
-mutable struct DBCArray{T,N,B,F} <: ArrayOp{T, N}
-    darray::DArray{T,N,B,F}
-    pdomain::AbstractArray{Dagger.Processor, N}
-    # function DBCArray{T,N,B,F}(domain, subdomains, chunks, partitioning::B, concat::Function, pdomain) where {T,N,B,F}
-    #     new{T,N,B,F}(domain, subdomains, chunks, partitioning, concat, pdomain)
-    # end
-end
-```
-
-Other definitions like `DBCVector`, `DBCMatrix` are implemented here similar to `DVector` and `DMatrix`:
-
-```julia
-# Type aliases for convenience
-const WrappedDBCArray{T,N} = Union{<:DBCArray{T,N}, Transpose{<:DBCArray{T,N}}, Adjoint{<:DBCArray{T,N}}}
-const WrappedDBCMatrix{T} = WrappedDBCArray{T,2}
-const WrappedDBCVector{T} = WrappedDBCArray{T,1}
-const DBCMatrix{T} = DBCArray{T,2}
-const DBCVector{T} = DBCArray{T,1}
-```
-
-Constructors of `DArray` can be implemented similarly for `DBCArray`. Additionally, a new constructor is defined that takes `darray` and `pdomain` as inputs, and creates a `DBCArray` by assigning blocks to its processors using scope in `@spawn`:
-
-```julia
-# DBCArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat, pdomain) where {T,N} = DBCArray(T, domain, subdomains, chunks, partitioning, concat)
-
-# function DArray(T, domain::DArrayDomain{N},
-#     subdomains::AbstractArray{DArrayDomain{N}, N},
-#     chunks::AbstractArray{<:Any, N}, partitioning::B, concat=cat) where {N,B<:AbstractBlocks{N}}
-# DArray{T,N,B,typeof(concat)}(domain, subdomains, chunks, partitioning, concat)
-# end
-
-# function DArray(T, domain::DArrayDomain{N},
-    #     subdomains::DArrayDomain{N},
-    #     chunks::Any, partitioning::B, concat=cat) where {N,B<:AbstractSingleBlocks{N}}
-    # _subdomains = Array{DArrayDomain{N}, N}(undef, ntuple(i->1, N)...)
-    # _subdomains[1] = subdomains
-    # _chunks = Array{Any, N}(undef, ntuple(i->1, N)...)
-    # _chunks[1] = chunks
-    # DArray{T,N,B,typeof(concat)}(domain, _subdomains, _chunks, partitioning, concat)
-# end
-
-# Constructor for DBCArray from DArray and processor domain
-function DBCArray(A::DArray{T,N,B,F}, pdomain::AbstractArray{Dagger.Processor, N}) where {T,N,B,F}
-    all_procs = collect(Iterators.flatten(Dagger.get_processors(OSProc(w)) for w in procs()))
-    missing = filter(p -> p ∉ all_procs, pdomain)
-    isempty(missing) || error("Missing processors: $missing")
-
-    Ac = fetch(A.chunks)
-    Ac_copy = similar(A.chunks)
-
-    # Assign blocks to processors using ExactScope
-    for idx in CartesianIndices(A.chunks)
-        proc = pdomain[mod1.(Tuple(idx), size(pdomain))...]
-        Ac_copy[idx] = Dagger.@spawn scope=Dagger.ExactScope(proc) identity(Ac[idx])
-    end
-
-    A_copy = DArray{T,N,B,F}(A.domain, A.subdomains, Ac_copy, A.partitioning, A.concat)
-    return DBCArray{T,N,B,F}(A_copy, pdomain)
-end
-```
-
-The following utility functions are adapted to support the `DBCArray` type:
-
-```julia
-# Delegate various properties to inner DArray
-domain(d::DBCArray) = domain(d.darray)
-chunks(d::DBCArray) = chunks(d.darray)
-domainchunks(d::DBCArray) = domainchunks(d.darray)
-size(x::DBCArray) = size(domain(x))
-stage(ctx, c::DBCArray) = stage(ctx, c.darray)
-
-# processor domain of DBCArray
-pdomain(A::DBCArray) = A.pdomain
-```
-
-The standard Base interface methods `collect`, `wait`, `show`, `similar`, `copy`, `/`, `view`, `fetch`, and `==` are overloaded for `DBCArray`:
-
-```julia
-# Collect method
-function Base.collect(d::DBCArray; tree=false)
-    return collect(d.darray; tree=tree)
-end
-
-# Wait method
-Base.wait(A::DBCArray) = wait(A.darray.chunks)
-
-# Show method
-function Base.show(io::IO, ::MIME"text/plain", A::DBCArray{T,N,B,F}) where {T,N,B,F}
-    nparts = N > 0 ? size(A.darray.chunks) : 1
-    partsize = N > 0 ? A.darray.partitioning.blocksize : 1
-    nprocs = N > 0 ? size(A.pdomain) : 1
-    write(io, " with $(join(nparts, 'x')) partitions of size $(join(partsize, 'x')) distributed to $(join(nprocs, 'x')) processors:")
-    pct_complete = 100 * (sum(c->c isa Chunk ? true : isready(c), A.darray.chunks) / length(A.darray.chunks))
-    if pct_complete < 100
-        println(io)
-        printstyled(io, "~$(round(Int, pct_complete))% completed"; color=:yellow)
-    end
-    println(io)
-    Base.print_array(IOContext(io, :compact=>true), ColorArray(A.darray))
-end
-
-# Copy method
-Base.copy(x::DBCArray{T,N,B,F}) where {T,N,B,F} =  DBCArray{T,N,B,F}(x.darray, x.pdomain)
-
-# Division method
-Base.:/(x::DBCArray{T,N,B,F}, y::U) where {T<:Real, U<:Real, N, B, F} = DBCArray(x.darray / y, x.pdomain)
-
-# View method
-# function Base.view(c::DArray, d)
-#     subchunks, subdomains = lookup_parts(c, chunks(c), domainchunks(c), d)
-#     d1 = alignfirst(d)
-#     DArray(eltype(c), d1, subdomains, subchunks, c.partitioning, c.concat)
-# end
-
-# Fetch method
-# Base.fetch(c::DBCArray{T,N,B,F}) where {T,N,B,F} = c
-
-# Equality checks
-# function Base.:(==)(x::ArrayOp{T,N}, y::AbstractArray{S,N}) where {T,S,N}
-#     collect(x) == y
-# end
-# function Base.:(==)(x::AbstractArray{T,N}, y::ArrayOp{S,N}) where {T,S,N}
-#     return collect(x) == y
-# end
-```
-
-Lastly, `logs_annotate!` for `DBCArray`:
-
-```julia
-# Annotate the logs of DBCArray
-function logs_annotate!(ctx::Context, A::DBCArray, name::Union{String,Symbol})
-    for (idx, chunk) in enumerate(A.darray.chunks)
-        sd = A.subdomains[idx]
-        Dagger.logs_annotate!(ctx, chunk, name*'['*join(sd.indexes, ',')*']')
-    end
-end
-```
-
-### matrix.jl
-
-The Adjoint and Transpose operations for `DBCArray` have been implemented similarly to `DArray`, ensuring that computations are carried out on the respective processors.
-
-```julia
-# Define adjoint/transpose copy function
-function copydiag(f, A::DBCArray{T, 2}) where T
-    Ac = A.darray.chunks
-    Ac_copy = Matrix{Any}(undef, size(Ac, 2), size(Ac, 1))
-    _copytile(f, Ac) = copy(f(Ac))
-    for idx in CartesianIndices(Ac)
-        proc = A.pdomain[mod1.(Tuple(idx), size(A.pdomain))...]
-        Ac_copy[idx'] = Dagger.@spawn scope=Dagger.ExactScope(proc) _copytile(f, Ac[idx])
-    end
-    Ad_copy = DArray{T,N,B,F}(ArrayDomain(1:size(A,2), 1:size(A,1)), A.darray.subdomains', Ac_copy, A.darray.partitioning, A.darray.concat)
-    return DBCArray(Ad_copy, A.pdomain)
-end
-
-# Overload fetch, copy, collect for Adjoint/Transpose
-Base.fetch(A::Adjoint{T, <:DBCArray{T, 2}}) where T = copydiag(Adjoint, parent(A))
-Base.fetch(A::Transpose{T, <:DBCArray{T, 2}}) where T = copydiag(Transpose, parent(A))
-Base.copy(A::Adjoint{T, <:DBCArray{T, 2}}) where T = fetch(A)
-Base.copy(A::Transpose{T, <:DBCArray{T, 2}}) where T = fetch(A)
-Base.collect(A::Adjoint{T, <:DBCArray{T, 2}}) where T = collect(copy(A))
-Base.collect(A::Transpose{T, <:DBCArray{T, 2}}) where T = collect(copy(A))
-```
-
-Matrix-vector multiplication and power operations can be done just like `DArray` for distributed `DBCArray`.
-
-```julia
-# Matrix-vector multiplication
-(*)(a::DBCArray, b::Vector) = DBCArray((a.darray)*b, a.pdomain)
-
-# Power operation
-# Base.power_by_squaring(x::DBCArray{T,N,B,F}, i::Int) where {T,N,B,F} = foldl(*, ntuple(_ -> x, i))
-```
-
-Other Matrix-Matrix operations like `+` , `*` , `MatMul` , and like `scale`, `Concat`, `cat`, `hcat`, `vcat` are planned for future implementation.
-
-### indexing.jl
-
-Indexing in `DBCArray` is handled by overloading the `getindex` and `setindex!` functions, allowing read and write access just like regular arrays:
-
-```julia
-# Indexing of DBCArray 
-Base.getindex(A::DBCArray{T,N}, idx::NTuple{N,Int}) where {T,N} = getindex(A.darray, idx)
-Base.getindex(A::DBCArray, idx::Integer...) = getindex(A.darray, idx)
-Base.getindex(A::DBCArray, idx::Integer) = getindex(A.darray, idx)
-Base.getindex(A::DBCArray, idx::CartesianIndex) = getindex(A.darray, idx)
-Base.getindex(A::DBCArray{T,N}, idxs::Dims{S}) where {T,N,S} = getindex(A.darray, idxs)
-
-Base.setindex!(A::DBCArray{T,N}, value, idx::NTuple{N,Int}) where {T,N} = setindex!(A.darray, value, idx)
-Base.setindex!(A::DBCArray, value, idx::Integer...) = setindex!(A.darray, value, idx)
-Base.setindex!(A::DArray, value, idx::Integer) = setindex!(A.darray, value,idx)
-Base.setindex!(A::DArray, value, idx::CartesianIndex) = setindex!(A.darray, value, idx)
-Base.setindex!(A::DBCArray{T,N}, value, idxs::Dims{S}) where {T,N,S} = setindex!(A.darray, value, idxs)
-```
-
-## Usage of `dbcarray.jl`:
-
-First, install the necessary packages and start three more worker processes:
-
-```julia
-import Pkg
-Pkg.add("Distributed"); using Distributed
-addprocs(3)
-Pkg.add(url="https://github.com/AkhilAkkapelli/Dagger.jl.git")
-@everywhere using Dagger
-```
-
-Then, define `pdomain` as a 2×2 matrix of Dagger processors for block-cyclic distribution:
-
-```julia
-@everywhere pdomain = reshape(collect(Dagger.all_processors()), 2, 2)
-```
-
-Create a `DBCArray` using the constructor with a random matrix:
-
-```julia
-A = rand(Blocks(3, 3), 15, 15);
-Adbc = DBCArray(A, pdomain);
-```
-
-Check if the constructed object is of the right type:
-
-```julia
-isa(Adbc, DBCMatrix)
-```
-
-Use the available utility functions to query properties:
-
-```julia
-domain(Adbc)        # Returns the domain of the array
-Dagger.chunks(Adbc)        # Returns the individual blocks
-Dagger.domainchunks(Adbc)  # Returns domain-specific chunk mapping
-size(Adbc)          # Returns array size
-Dagger.pdomain(Adbc)       # Returns processor domain
-```
-
-Use standard Base methods similar to how they work with DArray:
-
-```julia
-collect(Adbc)       # Gather the full array
-Adbc2 = copy(Adbc)  # Make a deep copy
-Adbc3 = Adbc / 3    # Perform element-wise division
-```
-
-Transpose and Adjoint operations also work:
-
-```julia
-Adbc4 = transpose(Adbc)
-Adbc5 = adjoint(Adbc)
-Adbc6 = Adbc'  # Adjoint using shorthand
-```
-
-## Note
-
-To verify which processors each block of a `DBCArray` is assigned to, I used the below code to fetch processor info block-wise:
-
-```julia
-# Print processor mapped to each block in the DBCArray
-chunk_procs = [Dagger.processor(Adbc.darray.chunks[idx].future.future.v.value[2])
-               for idx in CartesianIndices(size(Dagger.domainchunks(Adbc)))]
-```
-
-To log these assignments in detail, including task mappings and dependencies, the following snippet was used:
-
-```julia
-# Activate logging and create a DBCArray to capture block assignments
-using GraphViz
-
-@everywhere Dagger.enable_logging!(
-    taskfuncnames=true, tasknames=true, taskdeps=true, taskargs=true,
-    taskargmoves=true, taskresult=true, timeline=true,
-    all_task_deps=true, taskuidtotid=true, tasktochunk=true)
-
-A = rand(Blocks(3, 3), 15, 15)
-
-Adbc = Dagger.DBCArray(A, pdomain)
-
-@everywhere logs = Dagger.fetch_logs!()
-
-open(raw"graph.dot", "w") do io
-    Dagger.show_logs(io, logs, :graphviz; disconnected=true)
-end
-```
-
-However, the DOT graph generated didn't show processor assignment visually.
-
-## Conclusion
-
-The `DBCArray` extends `DArray` by allowing block-cyclic distribution of data blocks over a given processor layout. Each block is explicitly assigned to a `Dagger.Processor` using `@spawn` with `ExactScope`, giving better control on locality and processor affinity, important for distributed computing performance. It keeps `DArray`'s familiar interface while overloading standard Base methods and adding helper functions for `DBCArray`.&#x20;
-
-Though tools like `fetch_logs!()` show processor assignment, this is not yet reflected visually in DOT graphs. Future work will include adding support for matrix-matrix operations and advanced linear algebra utilities.

From 4887abd55524dca79698f4d4eedb4f7dd829e6b4 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Sat, 26 Apr 2025 03:29:30 +0530
Subject: [PATCH 21/36] Update darray.jl

---
 src/array/darray.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/array/darray.jl b/src/array/darray.jl
index c2737daa9..613dcfc59 100644
--- a/src/array/darray.jl
+++ b/src/array/darray.jl
@@ -511,14 +511,16 @@ function distribute(A::AbstractArray{T,N}, dist::Blocks{N}, assignment::Union{Sy
             procgrid = nothing
         elseif assignment == :blockcyclic
             p = ntuple(i -> i == N ? num_processors() : 1, N)
-            procgrid = reshape([proc for i in procs() for proc in get_processors(OSProc(i))], p)
+            availprocs = [proc for i in procs() for proc in get_processors(OSProc(i))]
+            sortedavailprocs = sort!(availprocs, by = x -> (x.owner, x.tid)) 
+            procgrid = reshape(sortedavailprocs, p)
         else
             error("Unsupported assignment symbol: $assignment, use :arbitrary or :blockcyclic")
         end
     elseif assignment isa AbstractArray{<:Int, N}
         missingprocs = filter(p -> p ∉ procs(), assignment)
         isempty(missingprocs) || error("Missing processors: $missingprocs")
-        procgrid = [first(get_processors(OSProc(proc))) for proc in assignment]
+        procgrid = [Dagger.ThreadProc(proc, 1) for proc in assignment]
     elseif assignment isa AbstractArray{<:Processor, N}
         availprocs = [proc for i in procs() for proc in get_processors(OSProc(i))]
         missingprocs = filter(p -> p ∉ availprocs, assignment)

From fd45316bc924dcf13573d42fa44ec351119c0b39 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Sat, 26 Apr 2025 19:18:23 +0530
Subject: [PATCH 22/36] Update darray.md

---
 docs/src/darray.md | 159 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 159 insertions(+)

diff --git a/docs/src/darray.md b/docs/src/darray.md
index 715a6cbe8..d127615ed 100644
--- a/docs/src/darray.md
+++ b/docs/src/darray.md
@@ -211,6 +211,165 @@ across the workers in the Julia cluster in a relatively even distribution;
 future operations on a `DArray` may produce a different distribution from the
 one chosen by previous calls.
 
+<!--  -->
+
+### Explicit Processor Mapping of DArray Blocks
+
+This feature allows you to control how `DArray` blocks (chunks) are assigned to specific processors or threads within the cluster. Fine-grained control over data locality can be crucial for optimizing the performance of certain distributed algorithms.
+
+You specify the mapping using the optional positional `assignment` argument of the `DArray` constructor functions (`DArray`, `DVector`, and `DMatrix`) and of the `distribute` function.
+
+The `assignment` argument accepts the following values:
+
+* `:arbitrary` (Default):
+
+    * If `assignment` is not provided or is set to symbol `:arbitrary`, Dagger's scheduler assigns blocks to processors automatically. This is the default behavior.
+* `:blockcyclic`:
+
+    * If `assignment` is set to `:blockcyclic`, `DArray` blocks are assigned to processors in a block-cyclic manner. Blocks are distributed cyclically across the available processors, which are sorted by worker ID and then by thread ID and laid out along the *last* dimension of the block distribution.
+    * Any other symbol used for `assignment` results in an error.
+* `AbstractArray{<:Int, N}`:
+
+    * Provide an N-dimensional array of integer worker IDs. The dimension `N` must match the number of dimensions of the `DArray`.
+    * Dagger maps blocks to worker IDs in a block-cyclic manner. The block at index `(i, j, ...)` is assigned to the first thread of the worker with ID `assignment[i, j, ...]`. This pattern repeats in a block-cyclic fashion to assign all blocks.
+* `AbstractArray{<:Processor, N}`:
+
+    * Provide an N-dimensional array of `Processor` objects. The dimension `N` must match the number of dimensions of the `DArray`.
+    * Blocks are mapped in a block-cyclic manner according to the `Processor` objects in the `assignment` array. The block at index `(i, j, ...)` is assigned to the processor at `assignment[i, j, ...]`. This pattern repeats in a block-cyclic fashion to assign all blocks.
+
+#### Examples and Usage
+
+The `assignment` argument works similarly for `DArray`, `DVector`, and `DMatrix`, as well as the `distribute` function. The key difference lies in the dimensionality of the resulting distributed array:
+
+* `DArray`: For N-dimensional distributed arrays.
+
+* `DVector`: Specifically for 1-dimensional distributed arrays.
+
+* `DMatrix`: Specifically for 2-dimensional distributed arrays.
+
+* `distribute`: General function to distribute arrays.
+
+The examples below assume a cluster with one main process and three worker processes (worker IDs 1 through 4).
+
+First, let's create some sample arrays:
+
+```julia
+A = rand(7, 11)   # 2D array
+v = rand(15)      # 1D array
+M = rand(5, 5, 5) # 3D array
+```
+
+1.  **Arbitrary Assignment:**
+
+    ```julia
+    Ad = distribute(A, Blocks(2, 2), :arbitrary)
+    # DMatrix(A, Blocks(2, 2), :arbitrary)
+
+    vd = distribute(v, Blocks(3), :arbitrary) 
+    # DVector(v, Blocks(3), :arbitrary)
+    
+    Md = distribute(M, Blocks(2, 2, 2), :arbitrary) 
+    # DArray(M, Blocks(2,2,2), :arbitrary)
+    ```
+
+    This creates distributed arrays with the specified block sizes, and Dagger assigns the blocks to processors arbitrarily. For example, the assignment for `Ad` might look like this:
+
+    ```julia
+    4×6 Matrix{Dagger.ThreadProc}:
+      ThreadProc(4, 1)  ThreadProc(3, 1)  ThreadProc(3, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(3, 1)
+      ThreadProc(3, 1)  ThreadProc(4, 1)  ThreadProc(3, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(2, 1)
+      ThreadProc(2, 1)  ThreadProc(2, 1)  ThreadProc(2, 1)  ThreadProc(3, 1)  ThreadProc(4, 1)  ThreadProc(4, 1)
+      ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(4, 1)  ThreadProc(3, 1)  ThreadProc(2, 1)  ThreadProc(3, 1)
+    
+    ```
+
+2.  **Block-Cyclic Assignment:**
+
+    ```julia
+    Ad = distribute(A, Blocks(2, 2), :blockcyclic) 
+    # DMatrix(A, Blocks(2, 2), :blockcyclic)
+    
+    vd = distribute(v, Blocks(3), :blockcyclic) 
+    # DVector(v, Blocks(3), :blockcyclic)
+
+    Md = distribute(M, Blocks(2, 2, 2), :blockcyclic) 
+    # DArray(M, Blocks(2,2,2), :blockcyclic)
+    ```
+
+    This assigns blocks cyclically along the last dimension across the available processors with increasing rank.  For the 2D case (`Ad`), the assignment will look like this:
+
+    ```julia
+    4×6 Matrix{Dagger.ThreadProc}:
+      ThreadProc(1, 1)  ThreadProc(2, 1)  ThreadProc(3, 1)  ThreadProc(4, 1)  ThreadProc(1, 1)  ThreadProc(2, 1)
+      ThreadProc(1, 1)  ThreadProc(2, 1)  ThreadProc(3, 1)  ThreadProc(4, 1)  ThreadProc(1, 1)  ThreadProc(2, 1)
+      ThreadProc(1, 1)  ThreadProc(2, 1)  ThreadProc(3, 1)  ThreadProc(4, 1)  ThreadProc(1, 1)  ThreadProc(2, 1)
+      ThreadProc(1, 1)  ThreadProc(2, 1)  ThreadProc(3, 1)  ThreadProc(4, 1)  ThreadProc(1, 1)  ThreadProc(2, 1)
+    
+    ```
+
+3.  **Block-Cyclic Assignment with Integer Array:**
+
+    ```julia
+    assignment_2d = [3 1; 4 2]
+    Ad = distribute(A, Blocks(2, 2), assignment_2d) 
+    # DMatrix(A, Blocks(2, 2), [3 1; 4 2])
+    
+    assignment_1d = [2,3,1,4]
+    vd = distribute(v, Blocks(3), assignment_1d) 
+    # DVector(v, Blocks(3), [2,3,1,4])
+    
+    assignment_3d = cat([1 2; 3 4], [4 3; 2 1], dims=3)
+    Md = distribute(M, Blocks(2, 2, 2), assignment_3d) 
+    # DArray(M, Blocks(2, 2, 2), cat([1 2; 3 4], [4 3; 2 1], dims=3))
+    
+    ```
+
+    Here, the assignment arrays define how processors are arranged.  For example, `assignment_2d` creates a 2x2 processor grid for the 2D array.
+
+    The assignment for `Ad` would be:
+
+    ```julia
+    4×6 Matrix{Dagger.ThreadProc}:
+      ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)
+      ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)
+      ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)
+      ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)
+    
+    ```
+
+4.  **Block-Cyclic Assignment with Processor Array:**
+
+    ```julia
+    assignment_2d = [Dagger.ThreadProc(3, 1) Dagger.ThreadProc(1, 1);
+                     Dagger.ThreadProc(4, 1) Dagger.ThreadProc(2, 1)]
+    Ad = distribute(A, Blocks(2, 2), assignment_2d) 
+    # DMatrix(A, Blocks(2, 2), assignment_2d)
+    
+    assignment_1d = [Dagger.ThreadProc(2,1), Dagger.ThreadProc(3,1), Dagger.ThreadProc(1,1), Dagger.ThreadProc(4,1)]
+    vd = distribute(v, Blocks(3), assignment_1d) 
+    # DVector(v, Blocks(3), assignment_1d)
+    
+    assignment_3d = cat([Dagger.ThreadProc(1,1) Dagger.ThreadProc(2,1); Dagger.ThreadProc(3,1) Dagger.ThreadProc(4,1)],
+                        [Dagger.ThreadProc(4,1) Dagger.ThreadProc(3,1); Dagger.ThreadProc(2,1) Dagger.ThreadProc(1,1)], dims=3)
+    Md = distribute(M, Blocks(2, 2, 2), assignment_3d) 
+    # DArray(M, Blocks(2, 2, 2), assignment_3d)
+    
+    ```
+
+    If the assignment is a matrix of `Processor` objects, the blocks are assigned as follows:
+    For `Ad`:
+
+    ```julia
+    4×6 Matrix{Dagger.ThreadProc}:
+      ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)
+      ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)
+      ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)
+      ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)
+    
+    ```
+
+<!--  -->
+
 ## Broadcasting
 
 As the `DArray` is a subtype of `AbstractArray` and generally satisfies Julia's

From 74d73a9f5da0a99048b88be6b9bd7ce204cd19d2 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Mon, 28 Apr 2025 16:14:47 +0530
Subject: [PATCH 23/36] Update darray.jl


From af2da412f970e84971e5903f47462fbfb4e782ea Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Mon, 28 Apr 2025 16:16:24 +0530
Subject: [PATCH 24/36] Update allocation.jl

---
 test/array/allocation.jl | 259 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 259 insertions(+)

diff --git a/test/array/allocation.jl b/test/array/allocation.jl
index a95ef2efc..bbd132ace 100644
--- a/test/array/allocation.jl
+++ b/test/array/allocation.jl
@@ -201,6 +201,265 @@ end
     end
 end
 
+@testset "Constructor with assignment" begin
+   
+  availprocs = [proc for i in procs() for proc in get_processors(OSProc(i))]
+  sort!(availprocs, by = x -> (x.owner, x.tid))
+  numprocs = length(availprocs)
+
+
+  function chunk_processors(Ad::DArray)
+      [processor(Ad.chunks[idx].future.future.v.value[2]) for idx in CartesianIndices(size(domainchunks(Ad)))]
+  end
+
+  function tile_processors(proc_grid::AbstractArray{<:Processor,N}, block_grid::Tuple{Vararg{Int,N}}) where N
+      reps       = Int.(ceil.(block_grid ./ size(proc_grid)))
+      tiled      = repeat(proc_grid, reps...)
+      idx_slices = [1:block_grid[d] for d in 1:length(block_grid)]
+      return tiled[idx_slices...]
+  end
+
+  function get_default_blockgrid(data, numprocs)
+      ndims_data = ndims(data)
+      size_data  = size(data)
+      ntuple(i->i == ndims_data ? cld( size_data[ndims_data], cld(size_data[ndims_data], numprocs) ) : 1, ndims_data)
+  end
+
+
+  A = rand(41, 35, 12)
+  v = rand(23)
+  M = rand(76,118)
+
+  t_blocks_a = (4,3,2)
+  d_blocks_a = Blocks(t_blocks_a)
+  blocks_a   = cld.(size(A), t_blocks_a)
+
+  n_blocks_v = 3
+  t_blocks_v = (n_blocks_v,)
+  v_blocks_v = [n_blocks_v]
+  d_blocks_v = Blocks(t_blocks_v)
+  blocks_v   = cld.(size(v), t_blocks_v)
+  blocks_nv  = blocks_v[1]
+
+  t_blocks_m = (2,3)
+  d_blocks_m = Blocks(t_blocks_m)
+  blocks_m   = cld.(size(M), t_blocks_m)
+
+
+  @testset "Arbitrary Assignment (:arbitrary)" begin
+    assignment = :arbitrary
+
+    @testset "Auto Blocks" begin
+
+      @test distribute(A, assignment) isa DArray  && distribute(A, AutoBlocks(), assignment) isa DArray
+      @test distribute(v, assignment) isa DVector && distribute(v, AutoBlocks(), assignment) isa DVector
+      @test distribute(M, assignment) isa DMatrix && distribute(M, AutoBlocks(), assignment) isa DMatrix
+
+      @test DArray( A,    assignment) isa DArray  && DArray(    A, AutoBlocks(), assignment) isa DArray
+      @test DVector(v,    assignment) isa DVector && DVector(   v, AutoBlocks(), assignment) isa DVector
+      @test DMatrix(M,    assignment) isa DMatrix && DMatrix(   M, AutoBlocks(), assignment) isa DMatrix
+
+    end
+
+    @testset "Explicit Blocks" begin
+
+      @test distribute(A, d_blocks_a, assignment) isa DArray  && distribute(A, blocks_a, assignment) isa DArray
+      @test distribute(v, d_blocks_v, assignment) isa DVector && distribute(v, blocks_v,  assignment) isa DVector
+      @test distribute(v, n_blocks_v, assignment) isa DVector
+      # @test distribute(v, v_blocks_v, assignment) isa DVector ## Failed: no method matching distribute(::Vector{Float64}, ::DomainBlocks{1}, ::Symbol)
+      @test distribute(M, d_blocks_m, assignment) isa DMatrix && distribute(M, blocks_m, assignment) isa DMatrix
+
+      @test DArray( A, d_blocks_a, assignment) isa DArray
+      @test DVector(v, d_blocks_v, assignment) isa DVector
+      @test DMatrix(M, d_blocks_m, assignment) isa DMatrix
+
+    end
+
+  end
+
+
+  @testset "Blockcyclic Assignment (:blockcyclic)" begin
+    assignment = :blockcyclic
+
+    function get_default_procgrid(data, numprocs)
+      ndims_data = ndims(data)
+      reshape(availprocs, ntuple(i -> i == ndims_data ? numprocs : 1, ndims_data))
+    end
+
+    @testset "Auto Blocks" begin
+
+      dist_A_def_auto = distribute(A,               assignment); wait(dist_A_def_auto)
+      dist_A_auto_def = distribute(A, AutoBlocks(), assignment); wait(dist_A_auto_def)
+      dist_v_def_auto = distribute(v,               assignment); wait(dist_v_def_auto)
+      dist_v_auto_def = distribute(v, AutoBlocks(), assignment); wait(dist_v_auto_def)
+      dist_M_def_auto = distribute(M,               assignment); wait(dist_M_def_auto)
+      dist_M_auto_def = distribute(M, AutoBlocks(), assignment); wait(dist_M_auto_def)
+
+      darr_A_def_auto = DArray(    A,               assignment); wait(darr_A_def_auto)
+      darr_A_auto_def = DArray(    A, AutoBlocks(), assignment); wait(darr_A_auto_def)
+      dvec_v_def_auto = DVector(   v,               assignment); wait(dvec_v_def_auto)
+      dvec_v_auto_def = DVector(   v, AutoBlocks(), assignment); wait(dvec_v_auto_def)
+      dmat_M_def_auto = DMatrix(   M,               assignment); wait(dmat_M_def_auto)
+      dmat_M_auto_def = DMatrix(   M, AutoBlocks(), assignment); wait(dmat_M_auto_def)
+
+      @test chunk_processors(dist_A_def_auto) == chunk_processors(dist_A_auto_def) == chunk_processors(darr_A_def_auto) == chunk_processors(darr_A_auto_def) == tile_processors(get_default_procgrid(A, numprocs), get_default_blockgrid(A, numprocs))
+      @test chunk_processors(dist_v_def_auto) == chunk_processors(dist_v_auto_def) == chunk_processors(dvec_v_def_auto) == chunk_processors(dvec_v_auto_def) == tile_processors(get_default_procgrid(v, numprocs), get_default_blockgrid(v, numprocs))
+      @test chunk_processors(dist_M_def_auto) == chunk_processors(dist_M_auto_def) == chunk_processors(dmat_M_def_auto) == chunk_processors(dmat_M_auto_def) == tile_processors(get_default_procgrid(M, numprocs), get_default_blockgrid(M, numprocs))
+      
+    end
+
+    @testset "Explicit Blocks" begin
+
+      dist_A_exp_def     = distribute(A, d_blocks_a, assignment); wait(dist_A_exp_def)
+      dist_A_blocks_exp  = distribute(A, blocks_a,   assignment); wait(dist_A_blocks_exp)
+      dist_v_exp_def     = distribute(v, d_blocks_v, assignment); wait(dist_v_exp_def)
+      dist_v_blocks_exp  = distribute(v, blocks_v,   assignment); wait(dist_v_blocks_exp)
+      dist_v_nblocks_exp = distribute(v, blocks_nv,  assignment); wait(dist_v_nblocks_exp)
+      # dist_v_vblocks_exp = distribute(v, v_blocks_v, assignment); wait(dist_v_vblocks_exp)
+      dist_M_exp_def     = distribute(M, d_blocks_m, assignment); wait(dist_M_exp_def)
+      dist_M_blocks_exp  = distribute(M, blocks_m,   assignment); wait(dist_M_blocks_exp)
+
+      darr_A_exp_def     = DArray(    A, d_blocks_a, assignment); wait(darr_A_exp_def)
+      dvec_v_exp_def     = DVector(   v, d_blocks_v, assignment); wait(dvec_v_exp_def)
+      dmat_M_exp_def     = DMatrix(   M, d_blocks_m, assignment); wait(dmat_M_exp_def)
+
+
+      @test chunk_processors(dist_A_exp_def) == chunk_processors(dist_A_blocks_exp) == chunk_processors(darr_A_exp_def) == tile_processors(get_default_procgrid(A, numprocs), blocks_a)
+      @test chunk_processors(dist_v_exp_def) == chunk_processors(dist_v_blocks_exp) == chunk_processors(dvec_v_exp_def) == tile_processors(get_default_procgrid(v, numprocs), blocks_v)
+      @test chunk_processors(dist_v_nblocks_exp)  == tile_processors(get_default_procgrid(v, numprocs), blocks_v)
+      # @test chunk_processors(dist_v_vblocks_exp) == tile_processors(get_default_procgrid(v, numprocs), blocks_v) ## Failed: no method matching distribute(::Vector{Float64}, ::DomainBlocks{1}, ::Symbol)
+      @test chunk_processors(dist_M_exp_def) == chunk_processors(dist_M_blocks_exp) == chunk_processors(dmat_M_exp_def) == tile_processors(get_default_procgrid(M, numprocs), blocks_m)
+      
+    end
+
+  end
+
+
+  @testset "OSProc ID Array Assignment (AbstractArray{<:Int, N})" begin
+
+    function get_random_osproc_ids(data)
+      ndims_data = ndims(data)
+      if     ndims_data == 3
+          return rand(procs(), 3, 2, 2)
+      elseif ndims_data == 1
+          return rand(procs(), 11)
+      elseif ndims_data == 2
+          return rand(procs(), 2, 5)
+      end
+    end
+
+    function get_random_osprocs(proc_ids)
+      [ThreadProc(proc, 1) for proc in proc_ids]
+    end
+
+    rand_osproc_ids_A = rand(procs(), 3, 2, 2)
+    rand_osproc_ids_v = rand(procs(), 11)
+    rand_osproc_ids_M = rand(procs(), 2, 5)
+
+    @testset "Auto Blocks" begin
+
+      dist_A_rand_osproc_auto = distribute(A,               rand_osproc_ids_A); wait(dist_A_rand_osproc_auto)
+      dist_A_auto_rand_osproc = distribute(A, AutoBlocks(), rand_osproc_ids_A); wait(dist_A_auto_rand_osproc)
+      # dist_v_rand_osproc_auto = distribute(v,               rand_osproc_ids_v); wait(dist_v_rand_osproc_auto)
+      dist_v_auto_rand_osproc = distribute(v, AutoBlocks(), rand_osproc_ids_v); wait(dist_v_auto_rand_osproc)
+      dist_M_rand_osproc_auto = distribute(M,               rand_osproc_ids_M); wait(dist_M_rand_osproc_auto)
+      dist_M_auto_rand_osproc = distribute(M, AutoBlocks(), rand_osproc_ids_M); wait(dist_M_auto_rand_osproc)
+
+      darr_A_rand_osproc_auto = DArray(    A,               rand_osproc_ids_A); wait(darr_A_rand_osproc_auto)
+      darr_A_auto_rand_osproc = DArray(    A, AutoBlocks(), rand_osproc_ids_A); wait(darr_A_auto_rand_osproc)
+      dvec_v_rand_osproc_auto = DVector(   v,               rand_osproc_ids_v); wait(dvec_v_rand_osproc_auto)
+      dvec_v_auto_rand_osproc = DVector(   v, AutoBlocks(), rand_osproc_ids_v); wait(dvec_v_auto_rand_osproc)
+      dmat_M_rand_osproc_auto = DMatrix(   M,               rand_osproc_ids_M); wait(dmat_M_rand_osproc_auto)
+      dmat_M_auto_rand_osproc = DMatrix(   M, AutoBlocks(), rand_osproc_ids_M); wait(dmat_M_auto_rand_osproc)
+
+      @test chunk_processors(dist_A_rand_osproc_auto) == chunk_processors(dist_A_auto_rand_osproc) == chunk_processors(darr_A_rand_osproc_auto) == chunk_processors(darr_A_auto_rand_osproc) == tile_processors(get_random_osprocs(rand_osproc_ids_A), get_default_blockgrid(A, numprocs))
+      @test                                              chunk_processors(dist_v_auto_rand_osproc) == chunk_processors(dvec_v_rand_osproc_auto) == chunk_processors(dvec_v_auto_rand_osproc) == tile_processors(get_random_osprocs(rand_osproc_ids_v), get_default_blockgrid(v, numprocs))
+      # @test chunk_processors(dist_v_rand_osproc_auto) == tile_processors(get_random_osprocs(rand_osproc_ids_v), get_default_blockgrid(v, numprocs)) ## Failed: no method matching distribute(::Vector{Float64}, ::DomainBlocks{1}, ::Symbol)
+      @test chunk_processors(dist_M_rand_osproc_auto) == chunk_processors(dist_M_auto_rand_osproc) == chunk_processors(dmat_M_rand_osproc_auto) == chunk_processors(dmat_M_auto_rand_osproc) == tile_processors(get_random_osprocs(rand_osproc_ids_M), get_default_blockgrid(M, numprocs))
+    end
+
+    @testset "Explicit Blocks" begin
+
+      dist_A_exp_rand_osproc     = distribute(A, d_blocks_a, rand_osproc_ids_A); wait(dist_A_exp_rand_osproc)
+      dist_A_blocks_rand_osproc  = distribute(A, blocks_a,   rand_osproc_ids_A); wait(dist_A_blocks_rand_osproc)
+      dist_v_exp_rand_osproc     = distribute(v, d_blocks_v, rand_osproc_ids_v); wait(dist_v_exp_rand_osproc)
+      dist_v_blocks_rand_osproc  = distribute(v, blocks_v,   rand_osproc_ids_v); wait(dist_v_blocks_rand_osproc)
+      dist_v_nblocks_rand_osproc = distribute(v, blocks_nv,  rand_osproc_ids_v); wait(dist_v_nblocks_rand_osproc)
+      # dist_v_vblocks_rand_osproc = distribute(v, v_blocks_v, rand_osproc_ids_v); wait(dist_v_vblocks_rand_osproc)
+      dist_M_exp_rand_osproc     = distribute(M, d_blocks_m, rand_osproc_ids_M); wait(dist_M_exp_rand_osproc)
+      dist_M_blocks_rand_osproc  = distribute(M, blocks_m,   rand_osproc_ids_M); wait(dist_M_blocks_rand_osproc)
+
+      darr_A_exp_rand_osproc     = DArray(    A, d_blocks_a, rand_osproc_ids_A); wait(darr_A_exp_rand_osproc)
+      dvec_v_exp_rand_osproc     = DVector(   v, d_blocks_v, rand_osproc_ids_v); wait(dvec_v_exp_rand_osproc)
+      dmat_M_exp_rand_osproc     = DMatrix(   M, d_blocks_m, rand_osproc_ids_M); wait(dmat_M_exp_rand_osproc)
+
+      @test chunk_processors(dist_A_exp_rand_osproc) == chunk_processors(dist_A_blocks_rand_osproc) == chunk_processors(darr_A_exp_rand_osproc) == tile_processors(get_random_osprocs(rand_osproc_ids_A), blocks_a)
+      @test chunk_processors(dist_v_exp_rand_osproc) == chunk_processors(dist_v_blocks_rand_osproc) == chunk_processors(dvec_v_exp_rand_osproc) == tile_processors(get_random_osprocs(rand_osproc_ids_v), blocks_v)
+      @test chunk_processors(dist_v_nblocks_rand_osproc)                                                                                        == tile_processors(get_random_osprocs(rand_osproc_ids_v), blocks_v)
+      # @test chunk_processors(dist_v_vblocks_rand_osproc) == tile_processors(get_random_osprocs(rand_osproc_ids_v), blocks_v) ## Failed: no method matching distribute(::Vector{Float64}, ::DomainBlocks{1}, ::Symbol)
+      @test chunk_processors(dist_M_exp_rand_osproc) == chunk_processors(dist_M_blocks_rand_osproc) == chunk_processors(dmat_M_exp_rand_osproc) == tile_processors(get_random_osprocs(rand_osproc_ids_M), blocks_m)
+
+    end
+
+  end
+
+
+  @testset "Explicit Processor Array Assignment (AbstractArray{<:Processor, N})" begin
+
+    rand_procs_A = reshape(availprocs[ rand(procs(),  6) ], 2, 3, 1)
+    rand_procs_v = reshape(availprocs[ rand(procs(),  5) ], 5)
+    rand_procs_M = reshape(availprocs[ rand(procs(), 14) ], 2, 7)
+
+
+    @testset "Auto Blocks" begin
+
+      dist_A_rand_procs_auto = distribute(A,               rand_procs_A); wait(dist_A_rand_procs_auto)
+      dist_A_auto_rand_procs = distribute(A, AutoBlocks(), rand_procs_A); wait(dist_A_auto_rand_procs)
+      dist_v_rand_procs_auto = distribute(v,               rand_procs_v); wait(dist_v_rand_procs_auto)
+      dist_v_auto_rand_procs = distribute(v, AutoBlocks(), rand_procs_v); wait(dist_v_auto_rand_procs)
+      dist_M_rand_procs_auto = distribute(M,               rand_procs_M); wait(dist_M_rand_procs_auto)
+      dist_M_auto_rand_procs = distribute(M, AutoBlocks(), rand_procs_M); wait(dist_M_auto_rand_procs)
+
+      darr_A_rand_procs_auto = DArray(    A,               rand_procs_A); wait(darr_A_rand_procs_auto)
+      darr_A_auto_rand_procs = DArray(    A, AutoBlocks(), rand_procs_A); wait(darr_A_auto_rand_procs)
+      dvec_v_rand_procs_auto = DVector(   v,               rand_procs_v); wait(dvec_v_rand_procs_auto)
+      dvec_v_auto_rand_procs = DVector(   v, AutoBlocks(), rand_procs_v); wait(dvec_v_auto_rand_procs)
+      dmat_M_rand_procs_auto = DMatrix(   M,               rand_procs_M); wait(dmat_M_rand_procs_auto)
+      dmat_M_auto_rand_procs = DMatrix(   M, AutoBlocks(), rand_procs_M); wait(dmat_M_auto_rand_procs)
+
+      @test chunk_processors(dist_A_rand_procs_auto) == chunk_processors(dist_A_auto_rand_procs) == chunk_processors(darr_A_rand_procs_auto) == chunk_processors(darr_A_auto_rand_procs) == tile_processors(rand_procs_A, get_default_blockgrid(A, numprocs))
+      @test chunk_processors(dist_v_rand_procs_auto) == chunk_processors(dist_v_auto_rand_procs) == chunk_processors(dvec_v_rand_procs_auto) == chunk_processors(dvec_v_auto_rand_procs) == tile_processors(rand_procs_v, get_default_blockgrid(v, numprocs))
+      @test chunk_processors(dist_M_rand_procs_auto) == chunk_processors(dist_M_auto_rand_procs) == chunk_processors(dmat_M_rand_procs_auto) == chunk_processors(dmat_M_auto_rand_procs) == tile_processors(rand_procs_M, get_default_blockgrid(M, numprocs))
+
+    end
+
+    @testset "Explicit Blocks" begin
+
+      dist_A_exp_rand_procs     = distribute(A, d_blocks_a, rand_procs_A); wait(dist_A_exp_rand_procs)
+      dist_A_blocks_rand_procs  = distribute(A, blocks_a,   rand_procs_A); wait(dist_A_blocks_rand_procs)
+      dist_v_exp_rand_procs     = distribute(v, d_blocks_v, rand_procs_v); wait(dist_v_exp_rand_procs)
+      dist_v_blocks_rand_procs  = distribute(v, blocks_v,   rand_procs_v); wait(dist_v_blocks_rand_procs)
+      dist_v_nblocks_rand_procs = distribute(v, blocks_nv, rand_procs_v); wait(dist_v_nblocks_rand_procs)
+      # dist_v_vblocks_rand_procs = distribute(v, v_blocks_v, rand_procs_v); wait(dist_v_vblocks_rand_procs)
+      dist_M_exp_rand_procs     = distribute(M, d_blocks_m, rand_procs_M); wait(dist_M_exp_rand_procs)
+      dist_M_blocks_rand_procs  = distribute(M, blocks_m,   rand_procs_M); wait(dist_M_blocks_rand_procs)
+
+      darr_A_exp_rand_procs     = DArray(    A, d_blocks_a, rand_procs_A); wait(darr_A_exp_rand_procs)
+      dvec_v_exp_rand_procs     = DVector(   v, d_blocks_v, rand_procs_v); wait(dvec_v_exp_rand_procs)
+      dmat_M_exp_rand_procs     = DMatrix(   M, d_blocks_m, rand_procs_M); wait(dmat_M_exp_rand_procs)
+
+      @test chunk_processors(dist_A_exp_rand_procs)     == chunk_processors(dist_A_blocks_rand_procs)  == chunk_processors(darr_A_exp_rand_procs) == tile_processors(rand_procs_A, blocks_a)
+      @test chunk_processors(dist_v_exp_rand_procs)     == chunk_processors(dist_v_blocks_rand_procs)  == chunk_processors(dvec_v_exp_rand_procs) == tile_processors(rand_procs_v, blocks_v)
+      @test chunk_processors(dist_v_nblocks_rand_procs)                                                                                           == tile_processors(rand_procs_v, blocks_v)
+      # @test chunk_processors(dist_v_vblocks_rand_procs) == tile_processors(rand_procs_v, blocks_v) ## Failed: no method matching distribute(::Vector{Float64}, ::DomainBlocks{1}, ::Symbol)
+      @test chunk_processors(dist_M_exp_rand_procs)     == chunk_processors(dist_M_blocks_rand_procs)  == chunk_processors(dmat_M_exp_rand_procs) == tile_processors(rand_procs_M, blocks_m)
+
+    end
+
+  end
+
+end
+
 @testset "view" begin
     A = rand(64, 64)
     DA = view(A, Blocks(8, 8))

From 76d3a25f20293a82aa3ecefc9bf7e5e8052f9012 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Mon, 28 Apr 2025 16:53:39 +0530
Subject: [PATCH 25/36] Qualify Dagger names in assignment constructor tests

---
 test/array/allocation.jl | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/test/array/allocation.jl b/test/array/allocation.jl
index bbd132ace..3475f4301 100644
--- a/test/array/allocation.jl
+++ b/test/array/allocation.jl
@@ -203,16 +203,16 @@ end
 
 @testset "Constructor with assignment" begin
    
-  availprocs = [proc for i in procs() for proc in get_processors(OSProc(i))]
+  availprocs = [proc for i in procs() for proc in Dagger.get_processors(Dagger.OSProc(i))]
   sort!(availprocs, by = x -> (x.owner, x.tid))
   numprocs = length(availprocs)
 
 
   function chunk_processors(Ad::DArray)
-      [processor(Ad.chunks[idx].future.future.v.value[2]) for idx in CartesianIndices(size(domainchunks(Ad)))]
+      [Dagger.processor(Ad.chunks[idx].future.future.v.value[2]) for idx in CartesianIndices(size(Dagger.domainchunks(Ad)))]
   end
 
-  function tile_processors(proc_grid::AbstractArray{<:Processor,N}, block_grid::Tuple{Vararg{Int,N}}) where N
+  function tile_processors(proc_grid::AbstractArray{<:Dagger.Processor,N}, block_grid::Tuple{Vararg{Int,N}}) where N
       reps       = Int.(ceil.(block_grid ./ size(proc_grid)))
       tiled      = repeat(proc_grid, reps...)
       idx_slices = [1:block_grid[d] for d in 1:length(block_grid)]
@@ -231,18 +231,18 @@ end
   M = rand(76,118)
 
   t_blocks_a = (4,3,2)
-  d_blocks_a = Blocks(t_blocks_a)
+  d_blocks_a = Dagger.Blocks(t_blocks_a)
   blocks_a   = cld.(size(A), t_blocks_a)
 
   n_blocks_v = 3
   t_blocks_v = (n_blocks_v,)
   v_blocks_v = [n_blocks_v]
-  d_blocks_v = Blocks(t_blocks_v)
+  d_blocks_v = Dagger.Blocks(t_blocks_v)
   blocks_v   = cld.(size(v), t_blocks_v)
   blocks_nv  = blocks_v[1]
 
   t_blocks_m = (2,3)
-  d_blocks_m = Blocks(t_blocks_m)
+  d_blocks_m = Dagger.Blocks(t_blocks_m)
   blocks_m   = cld.(size(M), t_blocks_m)
 
 
@@ -266,7 +266,7 @@ end
       @test distribute(A, d_blocks_a, assignment) isa DArray  && distribute(A, blocks_a, assignment) isa DArray
       @test distribute(v, d_blocks_v, assignment) isa DVector && distribute(v, blocks_v,  assignment) isa DVector
       @test distribute(v, n_blocks_v, assignment) isa DVector
-      # @test distribute(v, v_blocks_v, assignment) isa DVector ## Failed: no method matching distribute(::Vector{Float64}, ::DomainBlocks{1}, ::Symbol)
+      # @test distribute(v, v_blocks_v, assignment) isa DVector ## Failed: no method matching distribute(::Vector{Float64}, ::DomainBlocks{1}, ::Symbol)
       @test distribute(M, d_blocks_m, assignment) isa DMatrix && distribute(M, blocks_m, assignment) isa DMatrix
 
       @test DArray( A, d_blocks_a, assignment) isa DArray
@@ -340,21 +340,21 @@ end
     function get_random_osproc_ids(data)
       ndims_data = ndims(data)
       if     ndims_data == 3
-          return rand(procs(), 3, 2, 2)
+          return rand(Dagger.procs(), 3, 2, 2)
       elseif ndims_data == 1
-          return rand(procs(), 11)
+          return rand(Dagger.procs(), 11)
       elseif ndims_data == 2
-          return rand(procs(), 2, 5)
+          return rand(Dagger.procs(), 2, 5)
       end
     end
 
     function get_random_osprocs(proc_ids)
-      [ThreadProc(proc, 1) for proc in proc_ids]
+      [Dagger.ThreadProc(proc, 1) for proc in proc_ids]
     end
 
-    rand_osproc_ids_A = rand(procs(), 3, 2, 2)
-    rand_osproc_ids_v = rand(procs(), 11)
-    rand_osproc_ids_M = rand(procs(), 2, 5)
+    rand_osproc_ids_A = rand(Dagger.procs(), 3, 2, 2)
+    rand_osproc_ids_v = rand(Dagger.procs(), 11)
+    rand_osproc_ids_M = rand(Dagger.procs(), 2, 5)
 
     @testset "Auto Blocks" begin
 
@@ -406,9 +406,9 @@ end
 
   @testset "Explicit Processor Array Assignment (AbstractArray{<:Processor, N})" begin
 
-    rand_procs_A = reshape(availprocs[ rand(procs(),  6) ], 2, 3, 1)
-    rand_procs_v = reshape(availprocs[ rand(procs(),  5) ], 5)
-    rand_procs_M = reshape(availprocs[ rand(procs(), 14) ], 2, 7)
+    rand_procs_A = reshape(rand(availprocs,  6), 2, 3, 1)  # sample processors directly; worker IDs are not indices into availprocs
+    rand_procs_v = reshape(rand(availprocs,  5), 5)
+    rand_procs_M = reshape(rand(availprocs, 14), 2, 7)
 
 
     @testset "Auto Blocks" begin

From ee3a42a414b8c6873c2c15cd65283ddac3d36012 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Mon, 28 Apr 2025 17:26:58 +0530
Subject: [PATCH 26/36] Update integer-array assignment example in darray.md

---
 docs/src/darray.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/src/darray.md b/docs/src/darray.md
index d127615ed..6b577ea0b 100644
--- a/docs/src/darray.md
+++ b/docs/src/darray.md
@@ -310,7 +310,7 @@ M = rand(5, 5, 5) # 3D array
 3.  **Block-Cyclic Assignment with Integer Array:**
 
     ```julia
-    assignment_2d = [3 1; 4 2]
+    assignment_2d = [2 1; 4 3]
     Ad = distribute(A, Blocks(2, 2), assignment_2d) 
-    # DMatrix(A, Blocks(2, 2), [3 1; 4 2])
+    # DMatrix(A, Blocks(2, 2), [2 1; 4 3])
     
@@ -330,10 +330,10 @@ M = rand(5, 5, 5) # 3D array
 
     ```julia
     4×6 Matrix{Dagger.ThreadProc}:
-      ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)
-      ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)
-      ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)
-      ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)
+      ThreadProc(2, 1)  ThreadProc(1, 1)  ThreadProc(2, 1)  ThreadProc(1, 1)  ThreadProc(2, 1)  ThreadProc(1, 1)
+      ThreadProc(4, 1)  ThreadProc(3, 1)  ThreadProc(4, 1)  ThreadProc(3, 1)  ThreadProc(4, 1)  ThreadProc(3, 1)
+      ThreadProc(2, 1)  ThreadProc(1, 1)  ThreadProc(2, 1)  ThreadProc(1, 1)  ThreadProc(2, 1)  ThreadProc(1, 1)
+      ThreadProc(4, 1)  ThreadProc(3, 1)  ThreadProc(4, 1)  ThreadProc(3, 1)  ThreadProc(4, 1)  ThreadProc(3, 1)
     
     ```
 

From e4b9a26db15f7984ae035cb0115405a27ca0a3e3 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Mon, 28 Apr 2025 17:29:06 +0530
Subject: [PATCH 27/36] Use distinct thread IDs in Processor-array example in darray.md

---
 docs/src/darray.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/src/darray.md b/docs/src/darray.md
index 6b577ea0b..2b74c7be7 100644
--- a/docs/src/darray.md
+++ b/docs/src/darray.md
@@ -340,8 +340,8 @@ M = rand(5, 5, 5) # 3D array
 4.  **Block-Cyclic Assignment with Processor Array:**
 
     ```julia
-    assignment_2d = [Dagger.ThreadProc(3, 1) Dagger.ThreadProc(1, 1);
-                     Dagger.ThreadProc(4, 1) Dagger.ThreadProc(2, 1)]
+    assignment_2d = [Dagger.ThreadProc(3, 2) Dagger.ThreadProc(1, 1);
+                     Dagger.ThreadProc(4, 3) Dagger.ThreadProc(2, 2)]
     Ad = distribute(A, Blocks(2, 2), assignment_2d) 
     # DMatrix(A, Blocks(2, 2), assignment_2d)
     
@@ -361,10 +361,10 @@ M = rand(5, 5, 5) # 3D array
 
     ```julia
     4×6 Matrix{Dagger.ThreadProc}:
-      ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)
-      ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)
-      ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)  ThreadProc(3, 1)  ThreadProc(1, 1)
-      ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)  ThreadProc(4, 1)  ThreadProc(2, 1)
+      ThreadProc(3, 2)  ThreadProc(1, 1)  ThreadProc(3, 2)  ThreadProc(1, 1)  ThreadProc(3, 2)  ThreadProc(1, 1)
+      ThreadProc(4, 3)  ThreadProc(2, 2)  ThreadProc(4, 3)  ThreadProc(2, 2)  ThreadProc(4, 3)  ThreadProc(2, 2)
+      ThreadProc(3, 2)  ThreadProc(1, 1)  ThreadProc(3, 2)  ThreadProc(1, 1)  ThreadProc(3, 2)  ThreadProc(1, 1)
+      ThreadProc(4, 3)  ThreadProc(2, 2)  ThreadProc(4, 3)  ThreadProc(2, 2)  ThreadProc(4, 3)  ThreadProc(2, 2)
     
     ```
 

From cdd74533708df2d4d03ef7ec13f83af22f073547 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Tue, 6 May 2025 21:22:53 +0530
Subject: [PATCH 28/36] Add data_scope field to Thunk and compute/data scope options to spawn

---
 src/thunk.jl | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/thunk.jl b/src/thunk.jl
index b24806f85..f959e9aba 100644
--- a/src/thunk.jl
+++ b/src/thunk.jl
@@ -73,6 +73,7 @@ mutable struct Thunk
     eager_ref::Union{DRef,Nothing}
     options::Any # stores scheduler-specific options
     propagates::Tuple # which options we'll propagate
+    data_scope::AbstractScope
     function Thunk(f, xs...;
                    syncdeps=nothing,
                    id::Int=next_id(),
@@ -85,6 +86,8 @@ mutable struct Thunk
                    eager_ref=nothing,
                    processor=nothing,
                    scope=nothing,
+                   compute_scope=AnyScope(),
+                   data_scope=AnyScope(),
                    options=nothing,
                    propagates=(),
                    kwargs...
@@ -105,11 +108,11 @@ mutable struct Thunk
         if options !== nothing
             @assert isempty(kwargs)
             new(f, xs, syncdeps_set, id, get_result, meta, persist, cache,
-                cache_ref, affinity, eager_ref, options, propagates)
+                cache_ref, affinity, eager_ref, options, propagates, data_scope)
         else
             new(f, xs, syncdeps_set, id, get_result, meta, persist, cache,
                 cache_ref, affinity, eager_ref, Sch.ThunkOptions(;kwargs...),
-                propagates)
+                propagates, data_scope)
         end
     end
 end
@@ -479,6 +482,17 @@ function spawn(f, args...; kwargs...)
     # Wrap f in a Chunk if necessary
     processor = haskey(options, :processor) ? options.processor : nothing
     scope = haskey(options, :scope) ? options.scope : nothing
+    compute_scope = haskey(options, :compute_scope) ? options.compute_scope : nothing
+    data_scope = haskey(options, :data_scope) ? options.data_scope : nothing
+
+    if compute_scope !== nothing && data_scope !== nothing
+        constrained = constrain(compute_scope, data_scope) 
+        if !(constrained isa Dagger.InvalidScope) 
+            compute_scope = constrained
+        end
+    end
+
+    scope = scope === nothing ? compute_scope : scope
     if !isnothing(processor) || !isnothing(scope)
         f = tochunk(f,
                     something(processor, get_options(:processor, OSProc())),

From c694da636e8a430fa6dadd831287c71af192628f Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Tue, 6 May 2025 21:24:53 +0530
Subject: [PATCH 29/36] Pass thunk data_scope to do_task for result chunks

---
 src/sch/Sch.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl
index b894f4526..759392478 100644
--- a/src/sch/Sch.jl
+++ b/src/sch/Sch.jl
@@ -1086,7 +1086,7 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state)
                            thunk.get_result, thunk.persist, thunk.cache, thunk.meta, options,
                            propagated, ids, positions,
                            (log_sink=ctx.log_sink, profile=ctx.profile),
-                           sch_handle, state.uid])
+                           sch_handle, state.uid, thunk.data_scope])
     end
     # N.B. We don't batch these because we might get a deserialization
     # error due to something not being defined on the worker, and then we don't
@@ -1488,7 +1488,7 @@ function do_task(to_proc, task_desc)
         scope, Tf, data,
         send_result, persist, cache, meta,
         options, propagated, ids, positions,
-        ctx_vars, sch_handle, sch_uid = task_desc
+        ctx_vars, sch_handle, sch_uid, data_scope = task_desc
     ctx = Context(Processor[]; log_sink=ctx_vars.log_sink, profile=ctx_vars.profile)
 
     from_proc = OSProc()
@@ -1696,7 +1696,7 @@ function do_task(to_proc, task_desc)
 
         # Construct result
         # TODO: We should cache this locally
-        send_result || meta ? res : tochunk(res, to_proc; device, persist, cache=persist ? true : cache,
+        send_result || meta ? res : tochunk(res, to_proc, data_scope; device, persist, cache=persist ? true : cache,
                                             tag=options.storage_root_tag,
                                             leaf_tag=something(options.storage_leaf_tag, MemPool.Tag()),
                                             retain=options.storage_retain)

From 14857f6ec3915acdd8b0e8b7c3d106675e442cb5 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Wed, 7 May 2025 11:44:05 +0530
Subject: [PATCH 30/36] Comment out compute/data scope constraining in spawn

---
 src/thunk.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/thunk.jl b/src/thunk.jl
index f959e9aba..9dc15fe56 100644
--- a/src/thunk.jl
+++ b/src/thunk.jl
@@ -485,12 +485,12 @@ function spawn(f, args...; kwargs...)
     compute_scope = haskey(options, :compute_scope) ? options.compute_scope : nothing
     data_scope = haskey(options, :data_scope) ? options.data_scope : nothing
 
-    if compute_scope !== nothing && data_scope !== nothing
-        constrained = constrain(compute_scope, data_scope) 
-        if !(constrained isa Dagger.InvalidScope) 
-            compute_scope = constrained
-        end
-    end
+    # if compute_scope !== nothing && data_scope !== nothing
+    #     constrained = constrain(compute_scope, data_scope) 
+    #     if !(constrained isa Dagger.InvalidScope) 
+    #         compute_scope = constrained
+    #     end
+    # end
 
     scope = scope === nothing ? compute_scope : scope
     if !isnothing(processor) || !isnothing(scope)

From 10a09e46513e682060a4ccc13e1d85d4cf0ed27f Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Tue, 13 May 2025 20:56:12 +0530
Subject: [PATCH 31/36] Update Sch.jl

---
 src/sch/Sch.jl | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl
index 759392478..0ee2b48a8 100644
--- a/src/sch/Sch.jl
+++ b/src/sch/Sch.jl
@@ -14,7 +14,7 @@ import Random: randperm
 import Base: @invokelatest
 
 import ..Dagger
-import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, DTaskFailedException, Chunk, WeakChunk, OSProc, AnyScope, DefaultScope, LockedObject
+import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, DTaskFailedException, Chunk, WeakChunk, OSProc, AnyScope, DefaultScope, InvalidScope, LockedObject
 import ..Dagger: order, dependents, noffspring, istask, inputs, unwrap_weak_checked, affinity, tochunk, timespan_start, timespan_finish, procs, move, chunktype, processor, get_processors, get_parent, execute!, rmprocs!, task_processor, constrain, cputhreadtime
 import ..Dagger: @dagdebug, @safe_lock_spin1
 import DataStructures: PriorityQueue, enqueue!, dequeue_pair!, peek
@@ -733,7 +733,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx))
                 # proclist overrides scope selection
                 AnyScope()
             else
-                DefaultScope()
+                !(constrain(task.compute_scope, task.result_scope) isa InvalidScope) ? constrain(task.compute_scope, task.result_scope) : task.compute_scope
             end
         end
         for (_,input) in task.inputs
@@ -744,7 +744,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx))
                 input
             else
                 nothing
-            end
+            end 
             chunk isa Chunk || continue
             scope = constrain(scope, chunk.scope)
             if scope isa Dagger.InvalidScope
@@ -1086,7 +1086,7 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state)
                            thunk.get_result, thunk.persist, thunk.cache, thunk.meta, options,
                            propagated, ids, positions,
                            (log_sink=ctx.log_sink, profile=ctx.profile),
-                           sch_handle, state.uid, thunk.data_scope])
+                           sch_handle, state.uid, thunk.result_scope])
     end
     # N.B. We don't batch these because we might get a deserialization
     # error due to something not being defined on the worker, and then we don't
@@ -1488,7 +1488,7 @@ function do_task(to_proc, task_desc)
         scope, Tf, data,
         send_result, persist, cache, meta,
         options, propagated, ids, positions,
-        ctx_vars, sch_handle, sch_uid, data_scope = task_desc
+        ctx_vars, sch_handle, sch_uid, result_scope = task_desc
     ctx = Context(Processor[]; log_sink=ctx_vars.log_sink, profile=ctx_vars.profile)
 
     from_proc = OSProc()
@@ -1693,10 +1693,10 @@ function do_task(to_proc, task_desc)
             end
             timespan_finish(ctx, :storage_safe_scan, (;thunk_id, processor=to_proc), (;T=typeof(res)))
         end
-
+ 
         # Construct result
         # TODO: We should cache this locally
-        send_result || meta ? res : tochunk(res, to_proc, data_scope; device, persist, cache=persist ? true : cache,
+        send_result || meta ? res : tochunk(res, to_proc, result_scope; device, persist, cache=persist ? true : cache,
                                             tag=options.storage_root_tag,
                                             leaf_tag=something(options.storage_leaf_tag, MemPool.Tag()),
                                             retain=options.storage_retain)

From a6088c225e014fa8e2728ac37a78b58f852f4800 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Tue, 13 May 2025 20:56:52 +0530
Subject: [PATCH 32/36] Update chunks.jl

---
 src/chunks.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/chunks.jl b/src/chunks.jl
index 1eb56714e..887aaf1aa 100644
--- a/src/chunks.jl
+++ b/src/chunks.jl
@@ -264,7 +264,7 @@ be used.
 
 All other kwargs are passed directly to `MemPool.poolset`.
 """
-function tochunk(x::X, proc::P=OSProc(), scope::S=AnyScope(); persist=false, cache=false, device=nothing, kwargs...) where {X,P,S}
+function tochunk(x::X, proc::P=OSProc(), scope::S=DefaultScope(); persist=false, cache=false, device=nothing, kwargs...) where {X,P,S}
     if device === nothing
         device = if Sch.walk_storage_safe(x)
             MemPool.GLOBAL_DEVICE[]
@@ -284,7 +284,7 @@ function savechunk(data, dir, f)
     end
     fr = FileRef(f, sz)
     proc = OSProc()
-    scope = AnyScope() # FIXME: Scoped to this node
+    scope = DefaultScope() # FIXME: Scoped to this node
     Chunk{typeof(data),typeof(fr),typeof(proc),typeof(scope)}(typeof(data), domain(data), fr, proc, scope, true)
 end
 

From 09b0f7db801cd8113523606d15a0a9aebc24723b Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Tue, 13 May 2025 20:57:23 +0530
Subject: [PATCH 33/36] Update util.jl


From adef9d5d7305f7801fb9aa8e449c99722c3b2a48 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Tue, 13 May 2025 20:58:08 +0530
Subject: [PATCH 34/36] Update thunk.jl

---
 src/thunk.jl | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/src/thunk.jl b/src/thunk.jl
index 9dc15fe56..05446acf2 100644
--- a/src/thunk.jl
+++ b/src/thunk.jl
@@ -73,7 +73,8 @@ mutable struct Thunk
     eager_ref::Union{DRef,Nothing}
     options::Any # stores scheduler-specific options
     propagates::Tuple # which options we'll propagate
-    data_scope::AbstractScope
+    compute_scope::AbstractScope
+    result_scope::AbstractScope
     function Thunk(f, xs...;
                    syncdeps=nothing,
                    id::Int=next_id(),
@@ -86,8 +87,8 @@ mutable struct Thunk
                    eager_ref=nothing,
                    processor=nothing,
                    scope=nothing,
-                   compute_scope=AnyScope(),
-                   data_scope=AnyScope(),
+                   compute_scope=DefaultScope(),
+                   result_scope=AnyScope(),
                    options=nothing,
                    propagates=(),
                    kwargs...
@@ -105,14 +106,14 @@ mutable struct Thunk
             end
         end
         @assert all(x->x isa Pair, xs)
-        if options !== nothing
+        if options !== nothing  
             @assert isempty(kwargs)
             new(f, xs, syncdeps_set, id, get_result, meta, persist, cache,
-                cache_ref, affinity, eager_ref, options, propagates, data_scope)
+                cache_ref, affinity, eager_ref, options, propagates, compute_scope, result_scope)
         else
             new(f, xs, syncdeps_set, id, get_result, meta, persist, cache,
                 cache_ref, affinity, eager_ref, Sch.ThunkOptions(;kwargs...),
-                propagates, data_scope)
+                propagates, compute_scope, result_scope)
         end
     end
 end
@@ -483,16 +484,8 @@ function spawn(f, args...; kwargs...)
     processor = haskey(options, :processor) ? options.processor : nothing
     scope = haskey(options, :scope) ? options.scope : nothing
     compute_scope = haskey(options, :compute_scope) ? options.compute_scope : nothing
-    data_scope = haskey(options, :data_scope) ? options.data_scope : nothing
+    result_scope = haskey(options, :result_scope) ? options.result_scope : nothing
 
-    # if compute_scope !== nothing && data_scope !== nothing
-    #     constrained = constrain(compute_scope, data_scope) 
-    #     if !(constrained isa Dagger.InvalidScope) 
-    #         compute_scope = constrained
-    #     end
-    # end
-
-    scope = scope === nothing ? compute_scope : scope
     if !isnothing(processor) || !isnothing(scope)
         f = tochunk(f,
                     something(processor, get_options(:processor, OSProc())),

From fc4409efdf4418baf1bf90203f293ede26670675 Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Tue, 13 May 2025 20:58:36 +0530
Subject: [PATCH 35/36] Update darray.jl

---
 src/array/darray.jl | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/array/darray.jl b/src/array/darray.jl
index 613dcfc59..6e82af3f6 100644
--- a/src/array/darray.jl
+++ b/src/array/darray.jl
@@ -476,7 +476,7 @@ function stage(ctx::Context, d::Distribute)
             else
                 proc =  d.procgrid[CartesianIndex(mod1.(Tuple(I), size(d.procgrid))...)]
                 scope = ExactScope(proc)
-                Dagger.@spawn scope=scope Dagger.tochunk(d.data[c], proc, scope)
+                Dagger.@spawn compute_scope=scope identity(d.data[c])
             end
         end
     end
@@ -506,23 +506,35 @@ auto_blocks(A::AbstractArray{T,N}) where {T,N} = auto_blocks(size(A))
 distribute(A::AbstractArray, assignment::Union{Symbol, AbstractArray{<:Int}, AbstractArray{<:Processor}} = :arbitrary) = distribute(A, AutoBlocks(), assignment)
 function distribute(A::AbstractArray{T,N}, dist::Blocks{N}, assignment::Union{Symbol, AbstractArray{<:Int, N}, AbstractArray{<:Processor, N}} = :arbitrary) where {T,N} 
     procgrid = nothing
+    availprocs = [proc for i in procs() for proc in get_processors(OSProc(i))]
+    sort!(availprocs, by = x -> (x.owner, x.tid))
     if assignment isa Symbol
         if assignment == :arbitrary
             procgrid = nothing
-        elseif assignment == :blockcyclic
+        elseif assignment == :blockrow
+            p = ntuple(i -> i == 1 ? Int(ceil(size(A,1) / dist.blocksize[1])) : 1, N)
+            rows_per_proc, extra = divrem(Int(ceil(size(A,1) / dist.blocksize[1])), num_processors())
+            counts = [rows_per_proc + (i <= extra ? 1 : 0) for i in 1:num_processors()]
+            procgrid = reshape(vcat(fill.(availprocs, counts)...), p)   
+        elseif assignment == :blockcol
+            p = ntuple(i -> i == N ? Int(ceil(size(A,N) / dist.blocksize[N])) : 1, N)
+            cols_per_proc, extra = divrem(Int(ceil(size(A,N) / dist.blocksize[N])), num_processors())
+            counts = [cols_per_proc + (i <= extra ? 1 : 0) for i in 1:num_processors()]
+            procgrid = reshape(vcat(fill.(availprocs, counts)...), p)
+        elseif assignment == :cyclicrow
+            p = ntuple(i -> i == 1 ? num_processors() : 1, N)
+            procgrid = reshape(availprocs, p)
+        elseif assignment == :cycliccol
             p = ntuple(i -> i == N ? num_processors() : 1, N)
-            availprocs = [proc for i in procs() for proc in get_processors(OSProc(i))]
-            sortedavailprocs = sort!(availprocs, by = x -> (x.owner, x.tid)) 
-            procgrid = reshape(sortedavailprocs, p)
+            procgrid = reshape(availprocs, p)
         else
-            error("Unsupported assignment symbol: $assignment, use :arbitrary or :blockcyclic")
+            error("Unsupported assignment symbol: $assignment, use :arbitrary, :blockrow, :blockcol, :cyclicrow or :cycliccol")
         end
     elseif assignment isa AbstractArray{<:Int, N}
         missingprocs = filter(p -> p ∉ procs(), assignment)
         isempty(missingprocs) || error("Missing processors: $missingprocs")
         procgrid = [Dagger.ThreadProc(proc, 1) for proc in assignment]
     elseif assignment isa AbstractArray{<:Processor, N}
-        availprocs = [proc for i in procs() for proc in get_processors(OSProc(i))]
         missingprocs = filter(p -> p ∉ availprocs, assignment)
         isempty(missingprocs) || error("Missing processors: $missingprocs")
         procgrid = assignment

From 41bbe7e2b1c660b15cd414c0295dc1eeeb1c84af Mon Sep 17 00:00:00 2001
From: Akhil Akkapelli <41839847+AkhilAkkapelli@users.noreply.github.com>
Date: Tue, 13 May 2025 20:59:29 +0530
Subject: [PATCH 36/36] Create task-affinity.md

---
 docs/src/task-affinity.md | 47 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 docs/src/task-affinity.md

diff --git a/docs/src/task-affinity.md b/docs/src/task-affinity.md
new file mode 100644
index 000000000..16ecb8e9b
--- /dev/null
+++ b/docs/src/task-affinity.md
@@ -0,0 +1,47 @@
+```@meta
+CurrentModule = Dagger
+```
+
+# Task Affinity
+
+
+Dagger.jl's `@spawn` macro accepts the `compute_scope` and `result_scope` options, which give fine-grained control over where a task runs and where its result can be accessed.
+
+## Compute Scope
+
+`compute_scope` defines exactly where a task's computation must occur. This option overrides the standard `scope` option if both are provided.
+
+```julia
+g = Dagger.@spawn compute_scope=ExactScope(Dagger.ThreadProc(3, 1)) f(x,y)
+```
+
+In this example, task `f(x,y)` is scheduled to run specifically on thread 1 of worker process 3.
+
+
+## Result Scope
+
+`result_scope` restricts the locations from which a task's result can be fetched. This is useful for managing data locality and access patterns.
+
+```julia
+g = Dagger.@spawn result_scope=ExactScope(Dagger.OSProc(2)) f(x,y)
+```
+
+Here, the result of `f(x,y)` (referenced by `g`) will be primarily accessible from worker process 2. Fetching from other locations might require data movement.
+
+## Interaction of compute_scope and result_scope
+
+When both `compute_scope` and `result_scope` are specified for a task, the scheduler determines the execution location based on their intersection:
+
+- **Intersection Exists:** If there is an intersection between the `compute_scope` and `result_scope`, the task's computation will be scheduled to occur within this intersection. This is the preferred scenario.
+
+- **No Intersection:** If there is no intersection, the task's computation will occur in the `compute_scope`. However, the `result_scope` will still be respected for accessing the result.
+ 
+### Syntax:
+```julia
+g = Dagger.@spawn compute_scope=ExactScope(Dagger.ThreadProc(3, 1)) result_scope=ExactScope(Dagger.ThreadProc(2, 2)) f(x,y)
+```
+
+In this case, the two scopes do not intersect, so the task computes on `Dagger.ThreadProc(3, 1)`. Result access is restricted to `Dagger.ThreadProc(2, 2)`.
+
+!!! note "Chunk Inputs"
+    If the input to `Dagger.@spawn` is already a `Dagger.Chunk` (for example, one created with `Dagger.tochunk`), the `compute_scope` and `result_scope` options will have no effect on the task's execution or result accessibility.