diff --git a/scripts/Manifest.toml b/scripts/Manifest.toml
index 0ad0147ff4f18b2616de879323d722c2ffe46422..73c8781a5cee7114a879ccb83b7a1e65c83447dd 100644
--- a/scripts/Manifest.toml
+++ b/scripts/Manifest.toml
@@ -430,6 +430,18 @@ git-tree-sha1 = "a2951c93684551467265e0e32b577914f69532be"
 uuid = "82e4d734-157c-48bb-816b-45c225c6df19"
 version = "0.5.9"
 
+[[deps.ImageMagick]]
+deps = ["FileIO", "ImageCore", "ImageMagick_jll", "InteractiveUtils"]
+git-tree-sha1 = "ca8d917903e7a1126b6583a097c5cb7a0bedeac1"
+uuid = "6218d12a-5da1-5696-b52f-db25d2ecc6d1"
+version = "1.2.2"
+
+[[deps.ImageMagick_jll]]
+deps = ["JpegTurbo_jll", "Libdl", "Libtiff_jll", "Pkg", "Zlib_jll", "libpng_jll"]
+git-tree-sha1 = "1c0a2295cca535fabaf2029062912591e9b61987"
+uuid = "c73af94c-d91f-53ed-93a7-00f77d67a9d7"
+version = "6.9.10-12+3"
+
 [[deps.ImageMorphology]]
 deps = ["ImageCore", "LinearAlgebra", "Requires", "TiledIteration"]
 git-tree-sha1 = "5581e18a74a5838bd919294a7138c2663d065238"
diff --git a/scripts/Project.toml b/scripts/Project.toml
index 8647ef31dd2d1c5ad6c0d50782fdcd2118540656..ef4af9d6ab57f0366d2cebd285a5cfac65054962 100644
--- a/scripts/Project.toml
+++ b/scripts/Project.toml
@@ -5,6 +5,7 @@ Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
 ImageIO = "82e4d734-157c-48bb-816b-45c225c6df19"
+ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1"
 ImageQualityIndexes = "2996bd0c-7a13-11e9-2da2-2f5ce47296a9"
 OpticalFlowUtils = "ab0dad50-ab19-448c-b796-13553ec8b2d3"
 PGFPlotsX = "8314cec4-20b6-5062-9cdb-752b83310925"
diff --git a/scripts/run.jl b/scripts/run.jl
index f32f1538cbe81b1f7c2beac5fd47ce7dd498d29c..8d8114bc8875aad6b3c5286fc0c0cdf6438f6673 100644
--- a/scripts/run.jl
+++ b/scripts/run.jl
@@ -5,3 +5,4 @@ const datapath = joinpath(@__DIR__, "..", "data")
 ctx = Util.Context(datapath)
 
 ctx(experiment_pd-comparison, "pd-comparison")
+ctx(experiment_optflow_middlebury_all, "fem/optflow/middlebury")
diff --git a/scripts/run_experiments.jl b/scripts/run_experiments.jl
index f2e80e944ca90580f11bc3bef3225ea388ca9dc6..0152e0451923df88494b169b90354513941dd1ad 100644
--- a/scripts/run_experiments.jl
+++ b/scripts/run_experiments.jl
@@ -29,7 +29,8 @@ using .Util
 
 grayclamp(value) = Gray(clamp(value, 0., 1.))
 loadimg(x) = Float64.(FileIO.load(x))
-saveimg(io, x) = FileIO.save(io, grayclamp.(x))
+saveimg(io, x::Array{<:Gray}) = FileIO.save(io, grayclamp.(x))
+saveimg(io, x) = FileIO.save(io, x)
 
 # convert image to/from image coordinate system
 from_img(arr) = permutedims(reverse(arr; dims = 1))
@@ -50,10 +51,8 @@ equidistant datapoints on a log-scale. `a` controls density.
 """
 logfilter(dt; a=20) = filter(:k => k -> round(a*log(k+1)) - round(a*log(k)) > 0, dt)
 
-struct L1L2TVState{M, Ttype, Stype}
+struct L1L2TVState{M, m, Ttype, Stype}
     mesh::M
-    d::Int # = ndims_domain(mesh)
-    m::Int
 
     T::Ttype
     tdata
@@ -76,8 +75,23 @@ struct L1L2TVState{M, Ttype, Stype}
     dp2::FeFunction
 end
 
-function L1L2TVState(mesh, m; T, tdata, S,
-	alpha1, alpha2, beta, lambda, gamma1, gamma2)
+# for swapping out mesh and discrete functions
+# TODO: any of state, mesh or functions should probably be mutable instead
+function L1L2TVState(st::L1L2TVState; mesh, tdata, est, g, u, p1, p2, du, dp1, dp2)
+    mesh === tdata.space.mesh == est.space.mesh == g.space.mesh ==
+            u.space.mesh == p1.space.mesh == p2.space.mesh ==
+            du.space.mesh == dp1.space.mesh == dp2.space.mesh ||
+        throw(ArgumentError("functions have different meshes"))
+
+    return L1L2TVState{typeof(mesh),ndims_u_codomain(st),typeof(st.T),typeof(st.S)}(
+        mesh, st.T, tdata, st.S,
+	st.alpha1, st.alpha2, st.beta, st.lambda, st.gamma1, st.gamma2,
+	est, g, u, p1, p2, du, dp1, dp2)
+end
+
+# usual constructor
+function L1L2TVState{m}(mesh; T, tdata, S,
+	alpha1, alpha2, beta, lambda, gamma1, gamma2) where m
     alpha2 > 0 || beta > 0 ||
         throw(ArgumentError("operator B is singular with these parameters"))
 
@@ -107,7 +121,8 @@ function L1L2TVState(mesh, m; T, tdata, S,
     dp1.data .= 0
     dp2.data .= 0
 
-    return L1L2TVState(mesh, d, m, T, tdata, S,
+    return L1L2TVState{typeof(mesh),m,typeof(T),typeof(S)}(
+        mesh, T, tdata, S,
 	alpha1, alpha2, beta, lambda, gamma1, gamma2,
 	est, g, u, p1, p2, du, dp1, dp2)
 end
@@ -152,11 +167,15 @@ function OptFlowState(mesh;
     dp1.data .= 0
     dp2.data .= 0
 
-    return L1L2TVState(mesh, d, m, T, tdata, S,
+    return L1L2TVState{typeof(mesh),m,typeof(T),typeof(S)}(
+        mesh, T, tdata, S,
 	alpha1, alpha2, beta, lambda, gamma1, gamma2,
 	est, g, u, p1, p2, du, dp1, dp2)
 end
 
+ndims_u_codomain(st::L1L2TVState{<:Any,m}) where m = m
+
+
 "calculate l1 norm on reference triangle from langrange dofs"
 function p1_ref_l1_expr(ldofs)
     u0, u1, u2 = ldofs
@@ -200,15 +219,15 @@ function p2_project!(p2, lambda)
     end
 end
 
-function step!(ctx::L1L2TVState)
-    T = ctx.T
-    S = ctx.S
-    alpha1 = ctx.alpha1
-    alpha2 = ctx.alpha2
-    beta = ctx.beta
-    lambda = ctx.lambda
-    gamma1 = ctx.gamma1
-    gamma2 = ctx.gamma2
+function step!(st::L1L2TVState)
+    T = st.T
+    S = st.S
+    alpha1 = st.alpha1
+    alpha2 = st.alpha2
+    beta = st.beta
+    lambda = st.lambda
+    gamma1 = st.gamma1
+    gamma2 = st.gamma2
 
     function du_a(x_, du, nabladu, phi, nablaphi; g, u, nablau, p1, p2, tdata)
 	m1 = max(gamma1, norm(T(tdata, u) - g))
@@ -245,10 +264,10 @@ function step!(ctx::L1L2TVState)
 
     # solve du
     print("assemble ... ")
-    A, b = assemble(ctx.du.space, du_a, du_l;
-        ctx.g, ctx.u, nablau = nabla(ctx.u), ctx.p1, ctx.p2, ctx.tdata)
+    A, b = assemble(st.du.space, du_a, du_l;
+        st.g, st.u, nablau = nabla(st.u), st.p1, st.p2, st.tdata)
     print("solve ... ")
-    ctx.du.data .= A \ b
+    st.du.data .= A \ b
 
 
     # solve dp1
@@ -259,8 +278,8 @@ function step!(ctx::L1L2TVState)
 	    zero(p1)
 	return -p1 + alpha1 / m1 * (T(tdata, u) + T(tdata, du) - g) - cond
     end
-    interpolate!(ctx.dp1, dp1_update;
-	ctx.g, ctx.u, ctx.p1, ctx.du, ctx.tdata)
+    interpolate!(st.dp1, dp1_update;
+	st.g, st.u, st.p1, st.du, st.tdata)
 
     # solve dp2
     function dp2_update(x_; u, nablau, p2, du, nabladu)
@@ -270,65 +289,65 @@ function step!(ctx::L1L2TVState)
 	    zero(p2)
 	return -p2 + lambda / m2 * (nablau + nabladu) - cond
     end
-    interpolate!(ctx.dp2, dp2_update;
-        ctx.u, nablau = nabla(ctx.u), ctx.p2, ctx.du, nabladu = nabla(ctx.du))
+    interpolate!(st.dp2, dp2_update;
+        st.u, nablau = nabla(st.u), st.p2, st.du, nabladu = nabla(st.du))
 
     # newton update
     theta = 1.
-    ctx.u.data .+= theta * ctx.du.data
-    ctx.p1.data .+= theta * ctx.dp1.data
-    ctx.p2.data .+= theta * ctx.dp2.data
+    st.u.data .+= theta * st.du.data
+    st.p1.data .+= theta * st.dp1.data
+    st.p2.data .+= theta * st.dp2.data
 
     # reproject p1, p2
     # FIXME: the p1 projection destroys the primal reconstruction
-    p1_project!(ctx.p1, ctx.alpha1)
-    p2_project!(ctx.p2, ctx.lambda)
+    p1_project!(st.p1, st.alpha1)
+    p2_project!(st.p2, st.lambda)
 end
 
 "
 2010, Chambolle and Pock: primal-dual semi-implicit algorithm
 2017, Alkämper and Langer: fem dualisation
 "
-function step_pd!(ctx::L1L2TVState; sigma, tau, theta = 1.)
+function step_pd!(st::L1L2TVState; sigma, tau, theta = 1.)
     # note: ignores gamma1, gamma2, beta and uses T = I, lambda = 1, m = 1!
     # changed alpha2 -> alpha2 / 2
     # chambolle-pock require: sigma * tau * L^2 <= 1, L = |grad|
-    if ctx.m != 1 || ctx.lambda != 1. || ctx.beta != 0.
+    if ndims_u_codomain(st) != 1 || st.lambda != 1. || st.beta != 0.
 	error("unsupported parameters")
     end
-    beta = tau * ctx.alpha1 / (1 + tau * ctx.alpha2)
+    beta = tau * st.alpha1 / (1 + tau * st.alpha2)
 
     # u is P1
     # p2 is essentially DP0 (technically may be DP1)
 
     # 1. y update
-    u_new = FeFunction(ctx.u.space) # x_bar only used here
-    u_new.data .= ctx.u.data .+ theta .* ctx.du.data
+    u_new = FeFunction(st.u.space) # x_bar only used here
+    u_new.data .= st.u.data .+ theta .* st.du.data
 
-    p2_next = FeFunction(ctx.p2.space)
+    p2_next = FeFunction(st.p2.space)
 
     function p2_update(x_; p2, nablau)
 	return p2 + sigma * nablau
     end
-    interpolate!(p2_next, p2_update; ctx.p2, nablau = nabla(u_new))
+    interpolate!(p2_next, p2_update; st.p2, nablau = nabla(u_new))
 
-    p2_project!(p2_next, ctx.lambda)
-    ctx.dp2.data .= p2_next.data .- ctx.p2.data
+    p2_project!(p2_next, st.lambda)
+    st.dp2.data .= p2_next.data .- st.p2.data
 
-    ctx.p2.data .+= ctx.dp2.data
+    st.p2.data .+= st.dp2.data
 
     # 2.
     u_a(x, z, nablaz, phi, nablaphi; g, u, p2) =
 	dot(z, phi)
 
     u_l(x, phi, nablaphi; u, g, p2) =
-	(dot(u + tau * ctx.alpha2 * g, phi) + tau * dot(p2, -nablaphi)) /
-	    (1 + tau * ctx.alpha2)
+	(dot(u + tau * st.alpha2 * g, phi) + tau * dot(p2, -nablaphi)) /
+	    (1 + tau * st.alpha2)
 
     # z = 1 / (1 + tau * alpha2) *
     #   (u + tau * alpha2 * g + tau * div(p))
-    z = FeFunction(ctx.u.space)
-    A, b = assemble(z.space, u_a, u_l; ctx.g, ctx.u, ctx.p2)
+    z = FeFunction(st.u.space)
+    A, b = assemble(z.space, u_a, u_l; st.g, st.u, st.p2)
     z.data .= A \ b
 
     function u_update!(u, z, g, beta)
@@ -344,109 +363,109 @@ function step_pd!(ctx::L1L2TVState; sigma, tau, theta = 1.)
 	    end
 	end
     end
-    u_update!(u_new, z, ctx.g, beta)
+    u_update!(u_new, z, st.g, beta)
 
     # 3.
     # note: step-size control not implemented, since \nabla G^* is not 1/gamma continuous
-    ctx.du.data .= u_new.data .- ctx.u.data
-    ctx.u.data .= u_new.data
+    st.du.data .= u_new.data .- st.u.data
+    st.u.data .= u_new.data
 
-    #ctx.du.data .= z.data
-    any(isnan, ctx.u.data) && error("encountered nan data")
-    any(isnan, ctx.p2.data) && error("encountered nan data")
+    #st.du.data .= z.data
+    any(isnan, st.u.data) && error("encountered nan data")
+    any(isnan, st.p2.data) && error("encountered nan data")
 
-    return ctx
+    return st
 end
 
 "
 2010, Chambolle and Pock: accelerated primal-dual semi-implicit algorithm
 "
-function step_pd2!(ctx::L1L2TVState; sigma, tau, theta = 1.)
+function step_pd2!(st::L1L2TVState; sigma, tau, theta = 1.)
     # chambolle-pock require: sigma * tau * L^2 <= 1, L = |grad|
 
     # u is P1
     # p2 is essentially DP0 (technically may be DP1)
 
     # 1. y update
-    u_new = FeFunction(ctx.u.space) # x_bar only used here
-    u_new.data .= ctx.u.data .+ theta .* ctx.du.data
+    u_new = FeFunction(st.u.space) # x_bar only used here
+    u_new.data .= st.u.data .+ theta .* st.du.data
 
-    p1_next = FeFunction(ctx.p1.space)
-    p2_next = FeFunction(ctx.p2.space)
+    p1_next = FeFunction(st.p1.space)
+    p2_next = FeFunction(st.p2.space)
 
     function p1_update(x_; p1, g, u, tdata)
-	ctx.alpha1 == 0 && return zero(p1)
-	return (p1 + sigma * (ctx.T(tdata, u) - g)) /
-	    (1 + ctx.gamma1 * sigma / ctx.alpha1)
+	st.alpha1 == 0 && return zero(p1)
+	return (p1 + sigma * (st.T(tdata, u) - g)) /
+	    (1 + st.gamma1 * sigma / st.alpha1)
     end
-    interpolate!(p1_next, p1_update; ctx.p1, ctx.g, ctx.u, ctx.tdata)
+    interpolate!(p1_next, p1_update; st.p1, st.g, st.u, st.tdata)
 
     function p2_update(x_; p2, nablau)
-	ctx.lambda == 0 && return zero(p2)
+	st.lambda == 0 && return zero(p2)
 	return (p2 + sigma * nablau) /
-	    (1 + ctx.gamma2 * sigma / ctx.lambda)
+	    (1 + st.gamma2 * sigma / st.lambda)
     end
-    interpolate!(p2_next, p2_update; ctx.p2, nablau = nabla(u_new))
+    interpolate!(p2_next, p2_update; st.p2, nablau = nabla(u_new))
 
     # reproject p1, p2
-    p1_project!(p1_next, ctx.alpha1)
-    p2_project!(p2_next, ctx.lambda)
+    p1_project!(p1_next, st.alpha1)
+    p2_project!(p2_next, st.lambda)
 
-    ctx.dp1.data .= p1_next.data .- ctx.p1.data
-    ctx.dp2.data .= p2_next.data .- ctx.p2.data
-    ctx.p1.data .+= ctx.dp1.data
-    ctx.p2.data .+= ctx.dp2.data
+    st.dp1.data .= p1_next.data .- st.p1.data
+    st.dp2.data .= p2_next.data .- st.p2.data
+    st.p1.data .+= st.dp1.data
+    st.p2.data .+= st.dp2.data
 
     # 2. x update
     u_a(x, w, nablaw, phi, nablaphi; g, u, p1, p2, tdata) =
 	dot(w, phi) +
-	tau * ctx.alpha2 * dot(ctx.T(tdata, w), ctx.T(tdata, phi)) +
-	tau * ctx.beta * dot(ctx.S(w, nablaw), ctx.S(phi, nablaphi))
+	tau * st.alpha2 * dot(st.T(tdata, w), st.T(tdata, phi)) +
+	tau * st.beta * dot(st.S(w, nablaw), st.S(phi, nablaphi))
 
     u_l(x, phi, nablaphi; g, u, p1, p2, tdata) =
 	dot(u, phi) - tau * (
-	    dot(p1, ctx.T(tdata, phi)) +
+	    dot(p1, st.T(tdata, phi)) +
 	    dot(p2, nablaphi) -
-	    ctx.alpha2 * dot(g, ctx.T(tdata, phi)))
+	    st.alpha2 * dot(g, st.T(tdata, phi)))
 
-    A, b = assemble(u_new.space, u_a, u_l; ctx.g, ctx.u, ctx.p1, ctx.p2, ctx.tdata)
+    A, b = assemble(u_new.space, u_a, u_l; st.g, st.u, st.p1, st.p2, st.tdata)
     u_new.data .= A \ b
-    ctx.du.data .= u_new.data .- ctx.u.data
-    ctx.u.data .= u_new.data
+    st.du.data .= u_new.data .- st.u.data
+    st.u.data .= u_new.data
 
-    #ctx.du.data .= z.data
-    any(isnan, ctx.u.data) && error("encountered nan data")
-    any(isnan, ctx.p1.data) && error("encountered nan data")
-    any(isnan, ctx.p2.data) && error("encountered nan data")
+    #st.du.data .= z.data
+    any(isnan, st.u.data) && error("encountered nan data")
+    any(isnan, st.p1.data) && error("encountered nan data")
+    any(isnan, st.p2.data) && error("encountered nan data")
 
-    return ctx
+    return st
 end
 
 "
 2004, Chambolle: dual semi-implicit algorithm
 "
-function step_d!(ctx::L1L2TVState; tau)
+function step_d!(st::L1L2TVState; tau)
     # u is P1
     # p2 is essentially DP0 (technically may be DP1)
 
     # TODO: this might not be implementable without higher order elements
     # need grad(div(p))
-    return ctx
+    return st
 end
 
-function solve_primal!(u::FeFunction, ctx::L1L2TVState)
+function solve_primal!(u::FeFunction, st::L1L2TVState)
     u_a(x, u, nablau, phi, nablaphi; g, p1, p2, tdata) =
-	ctx.alpha2 * dot(ctx.T(tdata, u), ctx.T(tdata, phi)) +
-	    ctx.beta * dot(ctx.S(u, nablau), ctx.S(phi, nablaphi))
+	st.alpha2 * dot(st.T(tdata, u), st.T(tdata, phi)) +
+	    st.beta * dot(st.S(u, nablau), st.S(phi, nablaphi))
 
     u_l(x, phi, nablaphi; g, p1, p2, tdata) =
-	-dot(p1, ctx.T(tdata, phi)) - dot(p2, nablaphi) +
-	    ctx.alpha2 * dot(g, ctx.T(tdata, phi))
+	-dot(p1, st.T(tdata, phi)) - dot(p2, nablaphi) +
+	    st.alpha2 * dot(g, st.T(tdata, phi))
 
     # u = -B^{-1} * (T^* p_1 + nabla^* p_2 - alpha2 * T^* g)
     # solve:
     # <B u, phi> = -<p_1, T phi> - <p_2, nablaphi> + alpha_2 * <g, T phi>
-    A, b = assemble(u.space, u_a, u_l; ctx.g, ctx.p1, ctx.p2, ctx.tdata)
+    A, b = assemble(u.space, u_a, u_l; st.g, st.p1, st.p2, st.tdata)
     u.data .= A \ b
 end
 
@@ -458,9 +477,7 @@ function refine_and_estimate_pd(st::L1L2TVState)
     marked_cells = Set(axes(st.mesh.cells, 2))
     mesh_new, fs_new = refine(st.mesh, marked_cells;
         st.est, st.g, st.u, st.p1, st.p2, st.du, st.dp1, st.dp2)
-    st_new = L1L2TVState(mesh_new, st.d, st.m, T, nothing, S,
-        st.alpha1, st.alpha2, st.beta, st.lambda,
-        st.gamma1, st.gamma2,
+    st_new = L1L2TVState(st; mesh = mesh_new,
         fs_new.est, fs_new.g, fs_new.u, fs_new.p1, fs_new.p2,
         fs_new.du, fs_new.dp1, fs_new.dp2)
 
@@ -531,7 +548,7 @@ function estimate_res!(st::L1L2TVState)
     fs = (;st.g, st.u, nablau = nabla(st.u), st.p1, st.p2, st.tdata)
     space = st.est.space
 
-    facetV = Dict{Tuple{Int, Int}, MVector{st.m, Float64}}()
+    facetV = Dict{Tuple{Int, Int}, MVector{ndims_u_codomain(st), Float64}}()
 
     for cell in cells(mesh)
         A = SArray{Tuple{ndims_space(mesh), nvertices_cell(mesh)}}(
@@ -601,11 +618,11 @@ estimate_error(st::L1L2TVState) =
     sqrt(sum(x -> x^2, st.est.data) / area(st.mesh))
 
 # minimal Dörfler marking
-function mark(ctx::L1L2TVState; theta=0.5)
-    n = ncells(ctx.mesh)
-    esttotal = sum(ctx.est.data)
+function mark(st::L1L2TVState; theta=0.5)
+    n = ncells(st.mesh)
+    esttotal = sum(st.est.data)
 
-    cellerrors = collect(pairs(ctx.est.data))
+    cellerrors = collect(pairs(st.est.data))
     cellerrors_sorted = sort(cellerrors; lt = (x, y) -> x.second > y.second)
 
     marked_cells = Int[]
@@ -619,23 +636,23 @@ function mark(ctx::L1L2TVState; theta=0.5)
 end
 
 
-function output(ctx::L1L2TVState, filename, fs...)
-    print("save \"$filename\" ... ")
-    vtk = vtk_mesh(filename, ctx.mesh)
+function output(st::L1L2TVState, filename, fs...)
+    println("save \"$filename\" ... ")
+    vtk = vtk_mesh(filename, st.mesh)
     vtk_append!(vtk, fs...)
     vtk_save(vtk)
     return vtk
 end
 
-function primal_energy(ctx::L1L2TVState)
+function primal_energy(st::L1L2TVState)
     function integrand(x; g, u, nablau, tdata)
-	return ctx.alpha1 * huber(norm(ctx.T(tdata, u) - g), ctx.gamma1) +
-	    ctx.alpha2 / 2 * norm(ctx.T(tdata, u) - g)^2 +
-	    ctx.beta / 2 * norm(ctx.S(u, nablau))^2 +
-	    ctx.lambda * huber(norm(nablau), ctx.gamma2)
+	return st.alpha1 * huber(norm(st.T(tdata, u) - g), st.gamma1) +
+	    st.alpha2 / 2 * norm(st.T(tdata, u) - g)^2 +
+	    st.beta / 2 * norm(st.S(u, nablau))^2 +
+	    st.lambda * huber(norm(nablau), st.gamma2)
     end
-    return integrate(ctx.mesh, integrand; ctx.g, ctx.u,
-        nablau = nabla(ctx.u), ctx.tdata)
+    return integrate(st.mesh, integrand; st.g, st.u,
+        nablau = nabla(st.u), st.tdata)
 end
 
 function dual_energy(st::L1L2TVState)
@@ -658,26 +675,34 @@ function dual_energy(st::L1L2TVState)
         w, nablaw = nabla(w), st.p1, st.p2)
 end
 
-norm_l2(f) = sqrt(integrate(f.space.mesh, (x; f) -> dot(f, f); f))
+norm_l2(f) = norm_l2((x; f) -> f, f.space.mesh; f)
+function norm_l2(f::Function, mesh; params...)
+    f_l2 = function(x; params...)
+        fx = f(x; params...)
+        return dot(fx, fx)
+    end
+    res = integrate(mesh, f_l2; params...)
+    return sqrt(res)
+end
 
-norm_step(ctx::L1L2TVState) =
-    sqrt((norm_l2(ctx.du)^2 + norm_l2(ctx.dp1)^2 + norm_l2(ctx.dp2)^2) / area(ctx.mesh))
+norm_step(st::L1L2TVState) =
+    sqrt((norm_l2(st.du)^2 + norm_l2(st.dp1)^2 + norm_l2(st.dp2)^2) / area(st.mesh))
 
-function norm_residual(ctx::L1L2TVState)
-    w = FeFunction(ctx.u.space)
-    solve_primal!(w, ctx)
-    w.data .-= ctx.u.data
+function norm_residual(st::L1L2TVState)
+    w = FeFunction(st.u.space)
+    solve_primal!(w, st)
+    w.data .-= st.u.data
     upart2 = norm_l2(w)^2
 
     function integrand(x; g, u, nablau, p1, p2, tdata)
-	p1part = p1 * max(ctx.gamma1, norm(ctx.T(tdata, u) - g)) -
-	    ctx.alpha1 * (ctx.T(tdata, u) - g)
-	p2part = p2 * max(ctx.gamma2, norm(nablau)) -
-	    ctx.lambda * nablau
+	p1part = p1 * max(st.gamma1, norm(st.T(tdata, u) - g)) -
+	    st.alpha1 * (st.T(tdata, u) - g)
+	p2part = p2 * max(st.gamma2, norm(nablau)) -
+	    st.lambda * nablau
 	return norm(p1part)^2 + norm(p2part)^2
     end
-    ppart2 = integrate(ctx.mesh, integrand; ctx.g, ctx.u,
-        nablau = nabla(ctx.u), ctx.p1, ctx.p2, ctx.tdata)
+    ppart2 = integrate(st.mesh, integrand; st.g, st.u,
+        nablau = nabla(st.u), st.p1, st.p2, st.tdata)
 
     return sqrt(upart2 + ppart2)
 end
@@ -691,51 +716,51 @@ function denoise(img; params...)
     T(tdata, u) = u
     S(u, nablau) = u
 
-    ctx = L1L2TVState(mesh, m; T, tdata = nothing, S, params...)
+    st = L1L2TVState{m}(mesh; T, tdata = nothing, S, params...)
 
-    project_l2_lagrange!(ctx.g, img)
-    #interpolate!(ctx.g, x -> evaluate_bilinear(img, x))
+    project_l2_lagrange!(st.g, img)
+    #interpolate!(st.g, x -> evaluate_bilinear(img, x))
     #m = (size(img) .- 1) ./ 2 .+ 1
-    #interpolate!(ctx.g, x -> norm(x .- m) < norm(m .- 1) / 3)
+    #interpolate!(st.g, x -> norm(x .- m) < norm(m .- 1) / 3)
 
-    save_denoise(ctx, i) =
-	output(ctx, "output/$(ctx.name)_$(lpad(i, 5, '0')).vtu",
-	    ctx.g, ctx.u, ctx.p1, ctx.p2, ctx.est)
+    save_denoise(st, i) =
+	output(st, "output/$(st.name)_$(lpad(i, 5, '0')).vtu",
+	    st.g, st.u, st.p1, st.p2, st.est)
 
-    pvd = paraview_collection("output/$(ctx.name).pvd")
-    pvd[0] = save_denoise(ctx, 0)
+    pvd = paraview_collection("output/$(st.name).pvd")
+    pvd[0] = save_denoise(st, 0)
 
     k = 0
-    println("primal energy: $(primal_energy(ctx))")
+    println("primal energy: $(primal_energy(st))")
     while true
 	while true
 	    k += 1
-	    step!(ctx)
-	    estimate_pd!(ctx)
-	    pvd[k] = save_denoise(ctx, k)
+	    step!(st)
+	    estimate_pd!(st)
+	    pvd[k] = save_denoise(st, k)
 	    println()
 
-	    norm_step_ = norm_step(ctx)
-	    norm_residual_ = norm_residual(ctx)
+	    norm_step_ = norm_step(st)
+	    norm_residual_ = norm_residual(st)
 
-	    println("ndofs: $(ndofs(ctx.u.space)), est: $(norm_l2(ctx.est)))")
-	    println("primal energy: $(primal_energy(ctx))")
+	    println("ndofs: $(ndofs(st.u.space)), est: $(norm_l2(st.est)))")
+	    println("primal energy: $(primal_energy(st))")
 	    println("norm_step: $(norm_step_)")
 	    println("norm_residual: $(norm_residual_)")
 
             norm_step_ <= 1e-1 && break
 	end
-	marked_cells = mark(ctx; theta = 0.5)
+	marked_cells = mark(st; theta = 0.5)
 	println("refining ...")
-	ctx, _ = refine(ctx, marked_cells)
-	test_mesh(ctx.mesh)
+	st, _ = refine(st, marked_cells)
+	test_mesh(st.mesh)
 
-	project_l2_lagrange!(ctx.g, img)
+	project_l2_lagrange!(st.g, img)
 
 	k >= 100 && break
     end
     vtk_save(pvd)
-    return ctx
+    return st
 end
 
 
@@ -749,7 +774,7 @@ function denoise_pd(st, img; df=nothing, algorithm, params_...)
     T(tdata, u) = u
     S(u, nablau) = u
 
-    st = L1L2TVState(mesh, m;
+    st = L1L2TVState{m}(mesh;
         T, tdata = nothing, S,
         params.alpha1, params.alpha2, params.lambda, params.beta,
         params.gamma1, params.gamma2)
@@ -870,13 +895,15 @@ end
 function denoise_approximation(ctx)
     domain_factor = 1 / sqrt(area(ctx.params.mesh))
 
+    m = 1
     T(tdata, u) = u
     S(u, nablau) = u
     #S(u, nablau) = nablau
 
-    st = L1L2TVState(ctx.params.mesh, 1;
+    st = L1L2TVState{m}(ctx.params.mesh;
         T, tdata = nothing, S,
-        ctx.params.alpha1, ctx.params.alpha2, ctx.params.lambda, ctx.params.beta,
+        ctx.params.alpha1, ctx.params.alpha2,
+        ctx.params.lambda, ctx.params.beta,
         ctx.params.gamma1, ctx.params.gamma2)
 
     # primal reconstruction
@@ -912,18 +939,14 @@ function denoise_approximation(ctx)
             marked_cells = Set(axes(st.mesh.cells, 2))
             mesh, fs = refine(st.mesh, marked_cells;
                 st.est, st.g, st.u, st.p1, st.p2, st.du, st.dp1, st.dp2, w)
-            st = L1L2TVState(mesh, st.d, st.m, T, nothing, S,
-                st.alpha1, st.alpha2, st.beta, st.lambda,
-                st.gamma1, st.gamma2,
+            st = L1L2TVState(st; mesh,
                 fs.est, fs.g, fs.u, fs.p1, fs.p2, fs.du, fs.dp1, fs.dp2)
             w = fs.w
         end
         #marked_cells = Set(axes(st.mesh.cells, 2))
         #mesh2, fs = refine(st.mesh, marked_cells;
         #    st.est, st.g, st.u, st.p1, st.p2, st.du, st.dp1, st.dp2)
-        #st2 = L1L2TVState(mesh2, st.d, st.m, T, nothing, S,
-        #    st.alpha1, st.alpha2, st.beta, st.lambda,
-        #    st.gamma1, st.gamma2,
+        #st2 = L1L2TVState(st; mesh = mesh2,
         #    fs.est, fs.g, fs.u, fs.p1, fs.p2, fs.du, fs.dp1, fs.dp2)
 
         k = 0
@@ -966,9 +989,7 @@ function denoise_approximation(ctx)
                     Set(axes(st.mesh.cells, 2))
                 mesh, fs = refine(st.mesh, marked_cells;
                     st.est, st.g, st.u, st.p1, st.p2, st.du, st.dp1, st.dp2, w)
-                st = L1L2TVState(mesh, st.d, st.m, T, nothing, S,
-                    st.alpha1, st.alpha2, st.beta, st.lambda,
-                    st.gamma1, st.gamma2,
+                st = L1L2TVState(st; mesh,
                     fs.est, fs.g, fs.u, fs.p1, fs.p2, fs.du, fs.dp1, fs.dp2)
                 w = fs.w
 
@@ -1023,28 +1044,28 @@ function inpaint(img, imgmask; params...)
     T(tdata, u) = isone(tdata[begin]) ? u : zero(u)
     S(u, nablau) = u
 
-    ctx = L1L2TVState(mesh, m; T, tdata = mask, S, params...)
+    st = L1L2TVState{m}(mesh; T, tdata = mask, S, params...)
 
     # FIXME: currently dual grid only
     interpolate!(mask, x -> imgmask[round.(Int, x)...])
     #interpolate!(mask, x -> abs(x[2] - 0.5) > 0.1)
-    interpolate!(ctx.g, x -> imgmask[round.(Int, x)...] ? img[round.(Int, x)...] :  0.)
+    interpolate!(st.g, x -> imgmask[round.(Int, x)...] ? img[round.(Int, x)...] :  0.)
     m = (size(img) .- 1) ./ 2 .+ 1
-    interpolate!(ctx.g, x -> norm(x .- m) < norm(m) / 3)
+    interpolate!(st.g, x -> norm(x .- m) < norm(m) / 3)
 
     save_inpaint(i) =
-	output(ctx, "output/$(ctx.name)_$(lpad(i, 5, '0')).vtu",
-	    ctx.g, ctx.u, ctx.p1, ctx.p2, ctx.est, mask)
+	output(st, "output/$(st.name)_$(lpad(i, 5, '0')).vtu",
+	    st.g, st.u, st.p1, st.p2, st.est, mask)
 
-    pvd = paraview_collection("output/$(ctx.name).pvd")
+    pvd = paraview_collection("output/$(st.name).pvd")
     pvd[0] = save_inpaint(0)
     for i in 1:3
-	step!(ctx)
-	estimate_pd!(ctx)
+	step!(st)
+	estimate_pd!(st)
 	pvd[i] = save_inpaint(i)
 	println()
     end
-    return ctx
+    return st
 end
 
 function optflow(ctx)
@@ -1052,15 +1073,15 @@ function optflow(ctx)
 	throw(ArgumentError("non-matching image sizes"))
 
     project_image! = project_l2_lagrange!
-    tol_newton = 1e-5 # cauchy criterion for inner newton loop
-
-    m = 2 # range dim of vector field
+    eps_newton = 1e-3 # cauchy criterion for inner newton loop
+    eps_warp = 0.05
+    n_refine = 6
 
     # convert to cartesian coordinates
     imgf0 = from_img(ctx.params.imgf0)
     imgf1 = from_img(ctx.params.imgf1)
 
-    mesh = init_grid(imgf0, (size(imgf0) .÷ 2^2)...)
+    mesh = init_grid(imgf0, floor.(Int, size(imgf0) ./ 2^(n_refine / 2))...)
     mesh_area = area(mesh)
 
     # optflow specific stuff
@@ -1070,7 +1091,8 @@ function optflow(ctx)
     fw = FeFunction(Vg, name = "fw")
 
     st = OptFlowState(mesh;
-        ctx.params.alpha1, ctx.params.alpha2, ctx.params.lambda, ctx.params.beta,
+        ctx.params.alpha1, ctx.params.alpha2,
+        ctx.params.lambda, ctx.params.beta,
         ctx.params.gamma1, ctx.params.gamma2)
 
     function warp!()
@@ -1103,6 +1125,7 @@ function optflow(ctx)
         project_image!(f1, imgf1)
     end
 
+    calc_fdist() = norm_l2((x; f0, fw) -> f0 - fw, mesh; f0, fw)
 
     save_step(i) =
         output(st, joinpath(ctx.outdir, "output_$(lpad(i, 5, '0')).vtu"),
@@ -1115,60 +1138,66 @@ function optflow(ctx)
         pvd[0] = save_step(0)
 
         i = 0
-        k = 0
+        k_newton = 0
+        k_warp = 0
+        k_refine = 0
         while true
+            # interior newton
+            k_newton += 1
             step!(st)
-            k += 1
-            #estimate_pd!(st)
-
             norm_step_ = norm_step(st) / sqrt(mesh_area)
             println("norm_step = $norm_step_")
 
-            # residual is basically unusable: takes long to compute and
-            # oscillates
-            #norm_residual_ = norm_residual(st) / sqrt(mesh_area)
-            #println("norm_residual = $norm_residual_")
-
-            #display(plot(colorflow(to_img(sample(st.u)); ctx.params.maxflow)))
-
             # interior newton stop criterion
-            norm_step_ > tol_newton && k < 10 && continue
-            k = 0
+            norm_step_ > eps_newton && k_newton < 10 && continue
+            k_newton = 0
 
-            display(plot(colorflow(to_img(sample(st.u)); ctx.params.maxflow)))
 
+            # plot
             i += 1
+            display(plot(colorflow(to_img(sample(st.u)); ctx.params.maxflow)))
             pvd[i] = save_step(i)
 
-            # warping stop criterion
+
+            # warp
+            k_warp += 1
+            fdist0 = calc_fdist()
             warp!()
-            if i > 10
-                break
-            end
+            fdist1 = calc_fdist()
+            rel_datachange = (fdist1 - fdist0) / fdist0
+            println("rel data change: $(rel_datachange)")
+
+            # warping stop criterion
+            rel_datachange < -eps_warp && continue
 
             # refinement stop criterion
-            if false
-                println("refine ...")
-                estimate_res!(st)
-                marked_cells = mark(st; theta = 0.5)
-                #marked_cells = Set(axes(mesh.cells, 2))
-                mesh, fs = refine(mesh, marked_cells;
-                    st.est, st.g, st.u, st.p1, st.p2, st.du, st.dp1, st.dp2,
-                    st.tdata, f0, f1, fw)
-                st = L1L2TVState(mesh, st.d, st.m, st.T, fs.tdata, st.S,
-                    st.alpha1, st.alpha2, st.beta, st.lambda, st.gamma1, st.gamma2,
-                    fs.est, fs.g, fs.u, fs.p1, fs.p2, fs.du, fs.dp1, fs.dp2)
-                f0, f1, fw = (fs.f0, fs.f1, fs.fw)
-                interpolate_image_data!()
-            end
+            k_refine += 1
+            k_refine > n_refine && break
+            println("refine ...")
+            estimate_res!(st)
+            marked_cells = mark(st; theta = 0.5)
+            #marked_cells = Set(axes(mesh.cells, 2))
+            mesh, fs = refine(mesh, marked_cells;
+                st.est, st.g, st.u, st.p1, st.p2, st.du, st.dp1, st.dp2,
+                st.tdata, f0, f1, fw)
+            st = L1L2TVState(st; mesh, fs.tdata,
+                fs.est, fs.g, fs.u, fs.p1, fs.p2, fs.du, fs.dp1, fs.dp2)
+            f0, f1, fw = (fs.f0, fs.f1, fs.fw)
+            interpolate_image_data!()
         end
     end
-    #display(plot(colorflow(to_img(sample(st.u)); ctx.params.maxflow)))
-
     #CSV.write(joinpath(ctx.outdir, "energies.csv"), df)
-    #saveimg(joinpath(ctx.outdir, "output_glob.png"), fetch_u(states.glob))
-    #savedata(ctx, "data.tex"; lambda=λ, beta=β, tau=τ, maxiters, energymin,
-    #    width=size(fo, 2), height=size(fo, 1))
+
+    u_sampled = sample(st.u)
+    saveimg(joinpath(ctx.outdir, "f0.png"), to_img(imgf0))
+    saveimg(joinpath(ctx.outdir, "f1.png"), to_img(imgf1))
+    saveimg(joinpath(ctx.outdir, "output.png"), colorflow(to_img(u_sampled); ctx.params.maxflow))
+    imgfw = warp_backwards(imgf1, u_sampled)
+    saveimg(joinpath(ctx.outdir, "fw.png"), to_img(imgfw))
+    savedata(joinpath(ctx.outdir, "data.tex");
+        eps_newton, eps_warp, n_refine,
+        st.alpha1, st.alpha2, st.lambda, st.beta, st.gamma1, st.gamma2,
+        width=size(u_sampled, 1), height=size(u_sampled, 2))
     return st
 end
 
@@ -1180,9 +1209,19 @@ function experiment_optflow_middlebury(ctx)
 
     ctx = Util.Context(ctx; imgf0, imgf1, maxflow)
     mkpath(ctx.outdir)
+    saveimg(joinpath(ctx.outdir, "ground_truth.png"), colorflow(gtflow; maxflow))
     return optflow(ctx)
 end
 
+function experiment_optflow_middlebury_all(ctx)
+    for example in ["Dimetrodon", "Grove2", "Grove3", "Hydrangea",
+            "RubberWhale", "Urban2", "Urban3", "Venus"]
+        ctx(experiment_optflow_middlebury, example;
+            alpha1 = 10., alpha2 = 0., lambda = 1., beta = 1e-5,
+            gamma1 = 1e-3, gamma2 = 1e-3)
+    end
+end
+
 
 function test_image(n = 2^6; supersample_factor = 16)
     q = supersample_factor
diff --git a/scripts/util.jl b/scripts/util.jl
index 5eff8241e6029897c1047b0f7433f35518a616d8..2cc4a5b53c78a836320505f6aaa67f58d24194f5 100644
--- a/scripts/util.jl
+++ b/scripts/util.jl
@@ -76,24 +76,8 @@ end
 
 Base.show(io, ::MIME"text/latex", x) = print(io, x)
 
-function Base.show(io::IO, ::MIME"text/latex", x::AbstractFloat)
-    exponent = x == 0. ? 0 : floor(Int, log10(x))
-    if abs(exponent) <= 3
-        print(io, "\\ensuremath{", x, "}")
-    else
-        base = x / 10. ^ exponent
-        print(io, "\\ensuremath{", base, "\\cdot 10^{", exponent, "}}")
-    end
-end
-
-function Base.show(io::IO, ::MIME"text/latex", x::Integer)
-    p = log10(x)
-    pr = round(Int, p)
-    if x == 10^pr
-        print(io, "\\ensuremath{10^{", pr, "}}")
-    else
-        print(io, "\\ensuremath{", x, "}")
-    end
+function Base.show(io::IO, ::MIME"text/latex", x::Union{Integer,AbstractFloat})
+    print(io, "\\pgfmathprintnumber{", x ,"}")
 end
 
 end
diff --git a/src/function.jl b/src/function.jl
index 4b1a820901baa94f65182cd29d680a9da1a4fb59..6e1235494b7220eb0ac1f4a891932ac27791f172 100644
--- a/src/function.jl
+++ b/src/function.jl
@@ -215,6 +215,11 @@ function integrate(mesh::Mesh, expr; params...)
     return val
 end
 
+"""
+    bind!(f::FeFunction, cell)
+
+Prepare `f` for local evaluation on `cell` by e.g. `evaluate`.
+"""
 function bind!(f::FeFunction, cell)
     @boundscheck cell in axes(f.space.dofmap, 3)
     gdofs = SArray{Tuple{prod(f.space.size) * ndofs(f.space.element)}}(
diff --git a/src/mesh.jl b/src/mesh.jl
index d882c262a772fb59091d094c1363a5d0b726e87b..99683e3e4b1cc0e82c2bd7e88fbb8cb5e199f628 100644
--- a/src/mesh.jl
+++ b/src/mesh.jl
@@ -335,15 +335,13 @@ function refine!(hmesh::HMesh, marked_cells::Set; fs...)
 	    new_f.data[new_gdofs] .= f.data[gdofs]
 	end
     end
-    return new_fs
+    return new_mesh, new_fs
 end
 
 "refine by creating temporary hierarchical mesh on the fly"
 function refine(mesh::Mesh, marked_cells; fs...)
     hmesh = HMesh(mesh)
-    fs_new = refine!(hmesh, Set(marked_cells); fs...)
-    mesh_new = sub_mesh(hmesh)
-    return mesh_new, fs_new
+    return refine!(hmesh, Set(marked_cells); fs...)
 end
 
 function geo_tolocal(A, v)