diff --git a/src/DualTVDD.jl b/src/DualTVDD.jl
index 4c46b793a72ba923f099ca8d89eabaed64dc2141..aaa447ecdc662be95f79f64f485c63787c92b320 100644
--- a/src/DualTVDD.jl
+++ b/src/DualTVDD.jl
@@ -11,16 +11,22 @@ include("projgrad.jl")
 using Makie: heatmap
 
 function run()
-    g = ones(20,20)
+    #g = [0. 2; 1 0.]
+    g = rand(10,10)
     #g[4:17,4:17] .= 1
     #g[:size(g, 1)÷2,:] .= 1
     #g = [0. 2; 1 0.]
-    B = diagm(fill(100, length(g)))
-    α = 0.1
+    display(g)
+
+    B = 0.1 * rand(length(g), length(g))
+    B .+= diagm(ones(length(g)))
+    α = 0.025
+
+    display(norm(B))
 
     md = DualTVDD.OpROFModel(g, B, α)
-    alg = DualTVDD.ChambolleAlgorithm()
-    ctx = DualTVDD.init(md, alg)
+    ctx = DualTVDD.init(md, DualTVDD.ChambolleAlgorithm())
+    ctx2 = DualTVDD.init(md, DualTVDD.ProjGradAlgorithm(λ=1/sqrt(8)/norm(B)))
 
     #scene = vbox(
     #    heatmap(ctx.s, colorrange=(0,1), colormap=:gray, scale_plot=false, show_axis=false),
@@ -31,41 +37,51 @@ function run()
     #hm = last(scene)
     for i in 1:10000
         step!(ctx)
+        step!(ctx2)
         #hm[1] = ctx.s
         #yield()
         #sleep(0.2)
     end
-    display(ctx.p)
-    display(recover_u!(ctx))
+    display(copy(ctx.p))
+    display(copy(recover_u!(ctx)))
+    println(energy(ctx))
+
+    println()
 
-    ctx
+    display(copy(ctx2.p))
+    display(copy(recover_u!(ctx2)))
+    println(energy(ctx2))
+
+    ctx, ctx2
     #hm[1] = ctx.s
     #yield()
 end
 
 function rundd()
     β = 0
-    f = zeros(2,2)
-    f[1,:] .= 1
-    #g = [0. 2; 1 0.]
+    f = zeros(4,4)
+    f[1,1] = 1
+    #f = [0. 2; 1 0.]
+
+
     #A = diagm(vcat(fill(1, length(f)÷2), fill(1/10, length(f)÷2)))
-    A = rand(length(f), length(f))
-    display(A)
-    println(cond(A))
-    display(eigen(A))
-    #A = diagm(fill(1/2, length(f)))
+    #A = rand(length(f), length(f))
+    A = 0. * rand(length(f), length(f))
+    A .+= diagm(ones(length(f)))
+
+    #display(A)
     B = inv(A'*A + β*I)
-    println(norm(sqrt(B)))
+    println(norm(A))
 
     #println(norm(sqrt(B)))
 
     g = similar(f)
     vec(g) .= A' * vec(f)
 
-    α = .025
+    α = 1/4
 
     md = DualTVDD.DualTVDDModel(f, A, α, 0., 0.)
-    alg = DualTVDD.DualTVDDAlgorithm(M=(1,1), overlap=(1,1), σ=1)
+    alg = DualTVDD.DualTVDDAlgorithm(M=(2,2), overlap=(2,2), σ=1)
     ctx = DualTVDD.init(md, alg)
 
     md2 = DualTVDD.OpROFModel(g, B, α)
@@ -73,11 +89,13 @@ function rundd()
     ctx2 = DualTVDD.init(md2, alg2)
 
 
-    for i in 1:1
+    for i in 1:1000
         step!(ctx)
+        #println(energy(ctx))
     end
-    for i in 1:1000000
+    for i in 1:10000
         step!(ctx2)
+        #println(energy(ctx2))
     end
 
     #scene = heatmap(ctx.s,
@@ -95,11 +113,11 @@ function rundd()
     #hm[1] = ctx.s
     #yield()
 
-    println("p result")
+    println("\np result")
     display(ctx.p)
     display(ctx2.p)
 
-    println("u result")
+    println("\nu result")
     display(recover_u!(ctx))
     display(recover_u!(ctx2))
 
@@ -111,7 +129,7 @@ end
 
 function run3()
     f = rand(20,20)
-    A = rand(length(f), length(f))
+    A = 0.1*rand(length(f), length(f))
     A .+= diagm(ones(length(f)))
 
     g = reshape(A'*vec(f), size(f))
@@ -160,9 +178,10 @@ function energy(ctx::Union{DualTVDDContext,ProjGradContext})
     # v = div(p) + A'*f
     map!(kΛ, v, extend(ctx.p, StaticKernels.ExtensionNothing()))
     v .+= ctx.g
+    #display(v)
 
+    # |v|_B^2 / 2
     u = ctx.B * vec(v)
-
     return sum(u .* vec(v)) / 2
 end
 
@@ -177,9 +196,10 @@ function energy(ctx::ChambolleContext)
     # v = div(p) + g
     map!(kΛ, v, extend(ctx.p, StaticKernels.ExtensionNothing()))
     v .+= ctx.model.g
+    #display(v)
 
+    # |v|_B^2 / 2
     u = ctx.model.B * vec(v)
-
     return sum(u .* vec(v)) / 2
 end
 
diff --git a/src/chambolle.jl b/src/chambolle.jl
index afbf8841a06e94ecbc7fc0c9697a48edcd069475..62d838fa9bbef57a7c39c3c6ea3084d3c8acfc93 100644
--- a/src/chambolle.jl
+++ b/src/chambolle.jl
@@ -16,7 +16,7 @@ using LinearAlgebra
 struct ChambolleAlgorithm <: Algorithm
     "fixed point inertia parameter"
     τ::Float64
-    function ChambolleAlgorithm(; τ=1/4)
+    function ChambolleAlgorithm(; τ=1/8)
         return new(τ)
     end
 end
diff --git a/src/dualtvdd.jl b/src/dualtvdd.jl
index 588b3ea0658c5e4d4f8b378ad1e20bd0cda7c09c..7136f22b9314ab17ff4fec2e495d32cd8b2e2815 100644
--- a/src/dualtvdd.jl
+++ b/src/dualtvdd.jl
@@ -34,11 +34,11 @@ function init(md::DualTVDDModel, alg::DualTVDDAlgorithm)
     ax = axes(md.f)
 
     # subdomain axes
-    subax = subaxes(md.f, alg.M, alg.overlap)
+    subax = subaxes(size(md.f), alg.M, alg.overlap)
     # preallocated data for subproblems
-    subg = [Array{Float64, d}(undef, length.(subax[i])) for i in CartesianIndices(subax)]
+    subg = [Array{Float64, d}(undef, length.(ax)) for i in CartesianIndices(subax)]
     # locally dependent tv parameter
-    subα = [md.α .* theta.(Ref(ax), Ref(subax[i]), Ref(alg.overlap), CartesianIndices(subax[i]))
+    subα = [md.α .* theta.(Ref(ax), Ref(subax[i]), Ref(alg.overlap), CartesianIndices(ax))
         for i in CartesianIndices(subax)]
 
     # this is the global g, the local gs are getting initialized in step!()
@@ -53,9 +53,8 @@ function init(md::DualTVDDModel, alg::DualTVDDAlgorithm)
     B = diagm(ones(length(md.f))) + md.β * I
 
     # create subproblem contexts
-    # TODO: extraction of B subparts only makes sense for blockdiagonal B (i.e. A too)
     li = LinearIndices(size(md.f))
-    submds = [OpROFModel(subg[i], B[vec(li[subax[i]...]), vec(li[subax[i]...])], subα[i])
+    submds = [OpROFModel(subg[i], B, subα[i])
         for i in CartesianIndices(subax)]
     subalg = ChambolleAlgorithm()
     subctx = [init(submds[i], subalg) for i in CartesianIndices(subax)]
@@ -67,62 +66,32 @@ function init(md::DualTVDDModel, alg::DualTVDDAlgorithm)
 end
 
 function step!(ctx::DualTVDDContext)
+    σ = ctx.algorithm.σ
     d = ndims(ctx.p)
     ax = axes(ctx.p)
     overlap = ctx.algorithm.overlap
-    li = LinearIndices(size(ctx.model.f))
 
-    @inline kfΛ(w) = @inbounds -divergence_global(w)
+    @inline kfΛ(w) = @inbounds divergence_global(w)
     kΛ = Kernel{ntuple(_->-1:1, d)}(kfΛ)
 
-    λ = 2*norm(sqrt(ctx.B))^2 # TODO: algorithm parameter
-
     # call run! on each cell (this can be threaded)
-    for i in eachindex(ctx.subctx)
-        sax = ctx.subax[i]
-        ci = CartesianIndices(sax)
+    for (i, sax) in pairs(ctx.subax)
+        sg = ctx.subg[i] # julia-bug workaround
 
         # TODO: make p computation local!
-        # g_i = (A*f - Λ(1-theta_i)p^n)|_{\Omega_i}
-        # subctx[i].p is used as a buffer
-
-        ctx.ptmp .= .-(1 .- theta.(Ref(ax), Ref(sax), Ref(overlap), CartesianIndices(ctx.p))) .* ctx.p
-        #tmp3 = .-(1 .- theta.(Ref(ax), Ref(sax), Ref(overlap), CartesianIndices(ctx.p)))
-        #ctx.subctx[i].p .= .-(1 .- theta.(Ref(ax), Ref(sax), Ref(overlap), ci)) .* ctx.p[ctx.subax[i]...]
-
-        ctx.subg[i] .= map(kΛ, ctx.ptmp)[sax...]
-        #map!(kΛ, ctx.subg[i], ctx.subctx[i].p)
-
-        ctx.subg[i] .+= ctx.g[sax...]
-        # set sensible starting value
-        reset!(ctx.subctx[i])
-
-        # precomputed: B/λ * (A'f - Λ(1-θ_i)p^n)
-        gloc = copy(ctx.subg[i])
-
-        # v_0
-        ctx.ptmp .= theta.(Ref(ax), Ref(sax), Ref(overlap), CartesianIndices(ctx.p)) .* ctx.p
-        ctx.subctx[i].p .= ctx.ptmp[sax...]
+        ctx.ptmp .= (1 .- theta.(Ref(ax), Ref(sax), Ref(overlap), CartesianIndices(ctx.p))) .* ctx.p
+        map!(kΛ, sg, ctx.ptmp)
+        sg .+= ctx.g
 
-        # subcontext B is identity!
-        subIB = I - ctx.B[vec(li[sax...]), vec(li[sax...])]./λ
-        subB = ctx.B[vec(li[sax...]), vec(li[sax...])]./λ
-
-        for j in 1:10000
-            subΛp = map(kΛ, ctx.subctx[i].p)
-            vec(ctx.subg[i]) .= subIB * vec(subΛp) .+ subB * vec(gloc)
-
-            for k in 1:1000
-                step!(ctx.subctx[i])
-            end
+        for j in 1:1000
+            step!(ctx.subctx[i])
         end
     end
 
     # aggregate (not thread-safe!)
-    σ = ctx.algorithm.σ
     ctx.p .*= 1 - σ
-    for i in CartesianIndices(ctx.subax)
-        ctx.p[ctx.subax[i]...] .+= σ .* ctx.subctx[i].p
+    for (i, sax) in pairs(ctx.subax)
+        ctx.p .+= σ .* ctx.subctx[i].p
     end
 end
 
@@ -167,8 +136,8 @@ This assumes that subdomains have size at least 2 .* overlap.
 theta(ax, sax, overlap, i::CartesianIndex) = prod(theta.(ax, sax, overlap, Tuple(i)))
 theta(ax, sax, overlap::Int, i::Int) =
     max(0., min(1.,
-        first(ax) == first(sax) && i < first(ax) + overlap ? 1. : (i - first(sax)) / overlap,
-        last(ax) == last(sax) && i > last(ax) - overlap ? 1. : (last(sax) - i) / overlap))
+        first(ax) == first(sax) && i < first(ax) + overlap ? 1. : (i - first(sax) + 1) / (overlap + 1),
+        last(ax) == last(sax) && i > last(ax) - overlap ? 1. : (last(sax) - i + 1) / (overlap + 1)))
 
 
 """
@@ -177,21 +146,21 @@ theta(ax, sax, overlap::Int, i::Int) =
 Determine axes for all subdomains, given per dimension number of domains
 `pnum` and overlap `overlap`
 """
-function subaxes(domain, pnum, overlap)
-    overlap = 1 .+ overlap
-    d = ndims(domain)
-    tsize = size(domain) .+ (pnum .- 1) .* overlap
-
-    psize = tsize .÷ pnum
-    osize = tsize .- pnum .* psize
-
-    overhang(I, j) = I[j] == pnum[j] ? osize[j] : 0
+function subaxes(sizes, pnum, overlap)
+    d = Val(length(sizes))
+    ax = partition.(sizes, pnum, overlap)
+    return [ntuple(i -> ax[i][I[i]], d) for I in CartesianIndices(pnum)]
+end
 
-    indices = Array{NTuple{d, UnitRange{Int}}, d}(undef, pnum)
-    for I in CartesianIndices(pnum)
-        indices[I] = ntuple(j -> ((I[j] - 1) * psize[j] - (I[j] - 1) * overlap[j] + 1) :
-                                 (      I[j] * psize[j] - (I[j] - 1) * overlap[j] + overhang(I, j)), d)
+function partition(n, k, o)
+    part = UnitRange[]
+    i = 1
+    while k > 1
+        s = (n + (k-1)*o) ÷ k
+        push!(part, i:i+s-1)
+        i = i + s - o
+        n -= s - o
+        k -= 1
     end
-    @assert all(length.(sax) >= 1 .* overlap for sax in indices)
-    return indices
+    push!(part, i:i+n-1)
 end
diff --git a/src/types.jl b/src/types.jl
index f80834cdd41fcc0a017d906319ae522b6642804b..c5248b30f7f56731beac13b87edba74e52d26dc0 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -29,7 +29,7 @@ struct DualTVDDModel{U,VV} <: Model
     γ::Float64
 end
 
-"min_p  1/2 * |div(p) - g|_B^2 + χ_{|p|<=λ}"
+"min_p  1/2 * |-div(p) - g|_B^2 + χ_{|p|<=λ}"
 struct OpROFModel{U,VV,Λ} <: Model
     "given data"
     g::U