diff --git a/src/DualTVDD.jl b/src/DualTVDD.jl
index e4671fd832c4be7af6e307b77b208befac76c127..5cfbd964a104c1a4e5ea81ca1754a1adb19030b3 100644
--- a/src/DualTVDD.jl
+++ b/src/DualTVDD.jl
@@ -1,5 +1,6 @@
 module DualTVDD
 
+export DualTVDDAlgorithm
 export init, step!, fetch, run
 
 include("common.jl")
diff --git a/src/chambolle.jl b/src/chambolle.jl
index 7b89486f9171a89c72bb8ea877ee2aecb825e913..93de809d463ec61b25249c6f1187ee10f8163f29 100644
--- a/src/chambolle.jl
+++ b/src/chambolle.jl
@@ -49,22 +49,27 @@ struct ChambolleState{A,T,R,S,Sv,K1,K2} <: State
 end
 
 projnorm(v) = projnorm(v, false)
-projnorm(v, anisotropic) = anisotropic ? abs.(v) : norm(vec(v))
-const SVectorNest = SArray{<:Any,<:SArray{<:Any,Float64,1},1}
+projnorm(v::SVector, anisotropic) = anisotropic ? abs.(v) : norm(vec(v))
+#const SVectorNest = SArray{<:Any,<:SArray{<:Any,Float64,1},1}
 # TODO: respect anisotropic
-#@inline projnorm(v::SVectorNest, _) = norm(norm.(v))
-@inline projnorm(v::SVectorNest, _) = sqrt(sum(sum(v[i] .^ 2) for i in eachindex(v)))
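+# fallback for nested static arrays (e.g. SMatrix-valued duals): norm(::SArray) is the Frobenius
+# norm of all entries; the anisotropic flag is not yet respected here (see TODO above)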
+@inline projnorm(v, _) = norm(v)
+#@inline projnorm(v::SVectorNest, _) = sqrt(sum(sum(v[i] .^ 2) for i in eachindex(v)))
+
 
 function init(alg::ChambolleAlgorithm{<:DualTVL1ROFOpProblem})
     g = alg.problem.g
     λ = alg.problem.λ
     d = ndims(g)
+    ax = axes(g)
+
+    # TODO: generalize to SArray
+    p1type(T::Type{<:Real}) = SVector{d,T}
+    p1type(::Type{SVector{m,T}}) where {m, T} = SMatrix{m,d,T,m*d}
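+    # e.g. a Float64-valued image gets SVector{d,Float64} dual entries, while an
+    # SVector{m,Float64}-valued image gets SMatrix{m,d,Float64,m*d} entries per pixel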
 
-    pv = zeros(d * length(reinterpret(Float64, g)))
     rv = zeros(eltype(g), length(g))
     sv = zero(rv)
 
-    p = extend(reshape(reinterpret(SVector{d,eltype(g)}, pv), size(g)), StaticKernels.ExtensionNothing())
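+    # ExtensionNothing makes out-of-domain reads of p return nothing, which the
+    # divergence kernels in common.jl treat as a zero contribution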
+    p = extend(zeros(p1type(eltype(g)), ax), StaticKernels.ExtensionNothing())
     r = reshape(rv, size(g))
     s = extend(reshape(sv, size(g)), StaticKernels.ExtensionReplicate())
 
@@ -94,6 +99,11 @@ end
 
 function step!(ctx::ChambolleState)
     alg = ctx.algorithm
+    d = ndims(ctx.r)
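+    # forward-difference gradient kernel over offsets 0:1 in each of the d dimensions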
+    kgrad = Kernel{ntuple(_->0:1, d)}(gradient)
+
+    λ = alg.problem.λ
+    p = ctx.p
 
     sv = vec(parent(ctx.s))
     rv = vec(ctx.r)
@@ -103,7 +113,12 @@ function step!(ctx::ChambolleState)
     # s = B * r
     mul!(sv, alg.problem.B, rv)
     # p = (p + τ*grad(s)) / (1 + τ/λ|grad(s)|)
-    map!(ctx.k2, ctx.p, ctx.s)
+
+    sgrad = map(kgrad, ctx.s)
+    f(p, sgrad, λ) = iszero(λ) ? zero(p) :
+        (p + alg.τ * sgrad) ./ (1. .+ projnorm(alg.τ * sgrad, alg.anisotropic) ./ λ)
+
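+    # λ may be a scalar or a per-pixel array; broadcasting f applies the update
+    # pointwise, and λ == 0 forces the corresponding dual entry to zero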
+    ctx.p .= f.(ctx.p, sgrad, λ)
 
     return ctx
 end
diff --git a/src/common.jl b/src/common.jl
index 7293ac0ce952ef927e288a40d52cf1d986bc7ff8..4d5f6eb93d191ee9b273942b79a1586d8b3056c3 100644
--- a/src/common.jl
+++ b/src/common.jl
@@ -56,7 +56,7 @@ fetch_u(st) = recover_u(fetch(st), st.algorithm.problem)
 Base.intersect(a::CartesianIndices{d}, b::CartesianIndices{d}) where d =
     CartesianIndices(intersect.(a.indices, b.indices))
 
-@generated function gradient(w::StaticKernels.Window{<:Any,N}) where N
+@generated function gradient(w::StaticKernels.Window{<:Real,N}) where N
     i0 = ntuple(_->0, N)
     i1(k) = ntuple(i->Int(k==i), N)
 
@@ -67,6 +67,18 @@ Base.intersect(a::CartesianIndices{d}, b::CartesianIndices{d}) where d =
     end
 end
 
+@generated function gradient(w::StaticKernels.Window{S,N}) where {S<:SArray, N}
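+    # forward differences of an SArray-valued window; the result appends a trailing
+    # dimension of length N indexing the derivative direction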
+    i0 = ntuple(_->0, N)
+    i1(k) = ntuple(i->Int(k==i), N)
+
+    wi = (:( w[$(i1(k)...)][$j] - w[$(i0...)][$j] ) for k in 1:N for j in 1:length(S))
+    return quote
+        Base.@_inline_meta
+        return @inbounds SArray{Tuple{$(size(S)...),N}}($(wi...))::
+            SArray{Tuple{$(size(S)...),N},$(eltype(S)),$(ndims(S)+1),$(length(S)*N)}
+    end
+end
+
 @generated function divergence(w::StaticKernels.Window{SVector{N,T},N}) where {N,T}
     i0 = ntuple(_->0, N)
     i1(k) = ntuple(i->Int(k==i), N)
@@ -79,6 +91,24 @@ end
     end
 end
 
+@generated function divergence(w::StaticKernels.Window{S,N}) where {S<:SArray,N}
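+    # backward-difference divergence contracting the trailing (derivative) dimension of S;
+    # neighbours outside the domain (ExtensionNothing) contribute zero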
+    T = eltype(S)
+    sz = size(S)
+    sz[end] == N || throw(ArgumentError("last eltype dimension does not match array dimensionality"))
+
+    i0 = ntuple(_->0, N)
+    i1(k) = ntuple(i->Int(k==i), N)
+
+    slice(k) = (ntuple(_->:, ndims(S)-1)..., k)
+
+    # the zero contribution must match the slice type (an SArray of size sz[1:end-1]), not the scalar eltype
+    Z = SArray{Tuple{sz[1:end-1]...},T,ndims(S)-1,prod(sz[1:end-1])}
+    wi = (:((isnothing(w[$(i0...)]) ? zero($Z) : w[$(i0...)][$(slice(k)...)]) -
+            (isnothing(w[$((.-i1(k))...)]) ? zero($Z) : w[$((.-i1(k))...)][$(slice(k)...)])) for k in 1:N)
+    return quote
+        Base.@_inline_meta
+        return @inbounds +($(wi...))
+    end
+end
+
 function div_op(a::AbstractArray{<:StaticVector{N},N}) where N
     k = Kernel{ntuple(_->-1:1, ndims(a))}(divergence)
     ae = extend(a, StaticKernels.ExtensionNothing())
diff --git a/src/dualtvdd.jl b/src/dualtvdd.jl
index 6836fa1d137d8d7bad649acc262ea1485c7334af..bea9bf33d6f87767efae582b5c7326284b2a3c8b 100644
--- a/src/dualtvdd.jl
+++ b/src/dualtvdd.jl
@@ -1,4 +1,4 @@
-using Distributed: nworkers, workers
+using Distributed: workers
 using Outsource: Connector, outsource
 
 #import Serialization
@@ -27,11 +27,13 @@ struct DualTVDDAlgorithm{P,d} <: Algorithm{P}
     ninner::Int
     "prob -> Algorithm(::Problem, ...)"
     subalg::Function
-    function DualTVDDAlgorithm(problem; M, overlap, parallel=true, σ=parallel ? 1/4 : 1., ninner=10, subalg=x->ProjGradAlgorithm(x))
+    "worker ids used for distributed execution"
+    workers::Vector{Int}
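+    # illustrative: e.g. workers = workers()[1:2] restricts subproblem execution to the first two worker processes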
+    function DualTVDDAlgorithm(problem; M, overlap, parallel=true, σ=parallel ? 1/4 : 1., ninner=10, subalg=x->ProjGradAlgorithm(x), workers=workers())
         if parallel == true && σ > 1/4
             @warn "parallel domain decomposition needs σ <= 1/4 for theoretical convergence"
         end
-        return new{typeof(problem), length(M)}(problem, M, overlap, parallel, σ, ninner, subalg)
+        return new{typeof(problem), length(M)}(problem, M, overlap, parallel, σ, ninner, subalg, workers)
     end
 end
 
@@ -59,10 +61,14 @@ function init(alg::DualTVDDAlgorithm{<:DualTVL1ROFOpProblem})
     subλ = [alg.problem.λ[subax[i]...] .* theta.(Ref(ax), Ref(subax[i]), Ref(alg.overlap), CartesianIndices(subax[i]))
         for i in CartesianIndices(subax)]
 
+    # TODO: generalize to SArray
+    p1type(T::Type{<:Real}) = SVector{d,T}
+    p1type(::Type{SVector{m,T}}) where {m, T} = SMatrix{m,d,T,m*d}
+
     # global dual variable
-    p = zeros(SVector{d,eltype(g)}, size(g))
+    p = zeros(p1type(eltype(g)), ax)
     # local dual variable
-    subp = [collect(reinterpret(Float64, zeros(SVector{d,eltype(g)}, prod(length.(x))))) for x in subax]
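+    # one local dual array per subdomain, with the same per-pixel type as the global p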
+    subp = [zeros(p1type(eltype(g)), sax) for sax in subax]
 
     # create subproblem contexts
     cids = chessboard_coloring(size(subax))
@@ -73,7 +79,7 @@ function init(alg::DualTVDDAlgorithm{<:DualTVL1ROFOpProblem})
 
             subprob = DualTVL1ROFOpProblem(subg[sidx], op_restrict(alg.problem.B, ax, subax[sidx]), subλ[sidx])
             wf = subworker(alg, alg.subalg(subprob))
-            wid = workers()[mod1(i, nworkers())]
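+            # round-robin assignment of subdomains to the configured worker ids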
+            wid = alg.workers[mod1(i, length(alg.workers))]
             cons[sidx] = outsource(wf, wid)
         end
     end
@@ -111,7 +117,7 @@ function subworker(alg, subalg)
             subg = take!(con)
             subalg.problem.g .= subg
             # run algorithm
-            for _ in 1:ninner
+            for _ in 1:alg.ninner
                 step!(subst)
             end
             # write result