diff --git a/section_4.1_compute_visualize.py b/section_4.1_compute_visualize.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f6dcfd2ea462efd588586c91d129fadd59cac27
--- /dev/null
+++ b/section_4.1_compute_visualize.py
@@ -0,0 +1,105 @@
+# DATA-DRIVEN KERNEL DESIGNS FOR OPTIMIZED GREEDY SCHEMES: A MACHINE LEARNING PERSPECTIVE by
+# TIZIAN WENZEL, FRANCESCO MARCHETTI, EMMA PERRACCHIONE
+# Code related to the numerical experiment in Section 4.1, "Function approximation
+# on the unit cube", used to produce the plots in Figure 2.
+
+
+import torch
+from P36_Francesco_Emma.utilities.main_function import run_everything, run_cross_validation
+from P36_Francesco_Emma.utilities.hyperparameters import dic_hyperparams
+
+import numpy as np
+from matplotlib import pyplot as plt
+from scipy import io
+import os
+
+
+## Some settings
+# name_dataset = 'example_5d_faster_conv'
+name_dataset = 'example_6d_kink'
+# name_dataset = 'example_7d_semiactive'
+
+
+## Load the hyperparameters
+hyperparameter = dic_hyperparams[name_dataset]
+
+
+## Run everything
+A_start, A_optimized, model, model_vkoga1, model_vkoga2, data, \
+    array_concatenate, _, dic_timings_2L = run_everything(
+    name_dataset,
+    hyperparameter.maxIter_vkoga, hyperparameter.N_points,
+    hyperparameter.noise_level, hyperparameter.reg_para_optim, hyperparameter.reg_para_vkoga,
+    hyperparameter.learning_rate, hyperparameter.n_epochs,
+    hyperparameter.batch_size, hyperparameter.n_folds,
+    flag_initialize_diagonal=hyperparameter.flag_initialize_diagonal,
+    flag_symmetric_A=hyperparameter.flag_symmetric,
+    flag_gaussian=hyperparameter.flag_gaussian)
+
+
+## Run cross-validation
+array_eps, array_cv_f, array_cv_f_val, _, list_timings_1L = run_cross_validation(
+    name_dataset, # required only for hyperparameters
+    data, hyperparameter.maxIter_vkoga,
+    flag_gaussian=hyperparameter.flag_gaussian, n_cross_val=hyperparameter.n_cross_val)
+
+
+
+## Store in MATLAB format for beautiful TikZ figure plots
+path_for_results = os.getcwd() + '/P36_Francesco_Emma/paper_experiments/results/'
+os.makedirs(path_for_results, exist_ok=True)
+io.savemat(path_for_results + name_dataset + '.mat',
+           dict(array_concatenate=array_concatenate,
+                list_objectives=model.list_obj,
+                train_hist_deep_f=model_vkoga1.train_hist['f'],
+                train_hist_deep_f_val=model_vkoga1.train_hist['f val'],
+                train_hist_std_f=model_vkoga2.train_hist['f'],
+                train_hist_std_f_val=model_vkoga2.train_hist['f val'],
+                array_eps=array_eps,
+                array_cv_f=array_cv_f,
+                array_cv_f_val=array_cv_f_val,
+                dic_timings_2L=dic_timings_2L,
+                list_timings_1L=list_timings_1L))
+
+# In MATLAB, load the stored results via:
+# results = load('<name_dataset>.mat')
+
+
+## Some plots
+R = np.linspace(0, 1, int(1.5 * array_cv_f.shape[1]))
+array_color = plt.cm.hsv(R)
+
+# Visualization of the decay of the L-infty error
+plt.figure(1)
+plt.clf()
+plt.plot(model_vkoga1.train_hist['f'], 'k')
+# plt.plot(model_vkoga1.train_hist['f val'], 'k--')
+for idx_para, para_ep in enumerate(array_eps):
+    plt.plot(array_cv_f[:, idx_para], color=array_color[idx_para, :])
+    # plt.plot(array_cv_f[:, idx_para], '--', color=array_color[idx_para, :])
+plt.title('training error')
+plt.xscale('log')
+plt.yscale('log')
+plt.legend(['2layered'] + [str(para_ep) for para_ep in np.round(array_eps, 3)])
+plt.draw()
+
+
+# Visualization of the evolution of some matrix entries
+n_epochs = array_concatenate.shape[-1]
+
+plt.figure(2)
+plt.clf()
+list_legend = []
+for k in range(array_concatenate.shape[0]):
+    plt.plot(np.arange(1, n_epochs + 1), array_concatenate[k, k, :])
+    list_legend.append('A[{},{}]'.format(k, k))
+plt.plot(np.arange(1, n_epochs + 1), array_concatenate[0, 1, :], '--')
+plt.plot(np.arange(1, n_epochs + 1), array_concatenate[1, 0, :], '--')
+plt.legend(list_legend + ['A[0,1]', 'A[1,0]'])
+plt.title('matrix entries')
+plt.xscale('log')
+plt.xlabel('iterations')
+plt.draw()
+
+
+
diff --git a/section_4.2_compute.py b/section_4.2_compute.py
new file mode 100644
index 0000000000000000000000000000000000000000..150836e0c5d5269a0b738ca14a38f5db5b4385f3
--- /dev/null
+++ b/section_4.2_compute.py
@@ -0,0 +1,75 @@
+# DATA-DRIVEN KERNEL DESIGNS FOR OPTIMIZED GREEDY SCHEMES: A MACHINE LEARNING PERSPECTIVE by
+# TIZIAN WENZEL, FRANCESCO MARCHETTI, EMMA PERRACCHIONE
+# Code related to the numerical experiment in Section 4.2, "Machine learning data
+# sets", used to produce the results for Figures 3-5.
+
+
+import torch
+from P36_Francesco_Emma.utilities.main_function import run_everything, run_cross_validation
+from P36_Francesco_Emma.utilities.hyperparameters import dic_hyperparams
+
+import numpy as np
+from matplotlib import pyplot as plt
+from scipy import io
+import os
+
+
+list_datasets = ['fried', 'sarcos', 'protein', 'ct', 'diamonds',
+                 'stock', 'kegg_undir_uci', 'online_video',
+                 'wecs', 'mlr_knn_rng', 'query_agg_count',
+                 'sgemm', 'road_network', 'methane', 'poker'] #, 'susy', 'higgs']
+
+
+## Loop over reruns and datasets
+for idx_indices in [0, 1, 2, 3, 4]:
+    for idx_dataset, name_dataset in enumerate(list_datasets):
+
+        ## Load the hyperparameters
+        hyperparameter = dic_hyperparams[name_dataset]
+
+
+        ## Run everything
+        A_start, A_optimized, model, model_vkoga1, model_vkoga2, data, \
+            array_concatenate, array_test_rmse_deep, dic_timings_2L = run_everything(
+            name_dataset,
+            hyperparameter.maxIter_vkoga, hyperparameter.N_points,
+            hyperparameter.noise_level, hyperparameter.reg_para_optim, hyperparameter.reg_para_vkoga,
+            hyperparameter.learning_rate, hyperparameter.n_epochs,
+            hyperparameter.batch_size, hyperparameter.n_folds,
+            flag_initialize_diagonal=hyperparameter.flag_initialize_diagonal,
+            flag_symmetric_A=hyperparameter.flag_symmetric,
+            flag_gaussian=hyperparameter.flag_gaussian,
+            flag_optim_verbose=False,
+            flag_std_vkoga=False,
+            idx_rerun=idx_indices)
+
+
+        ## Run cross-validation
+        array_eps, array_cv_f, array_cv_f_val, array_test_rmse_cv, list_timings_1L = \
+            run_cross_validation(
+                name_dataset, # required only for hyperparameters
+                data, hyperparameter.maxIter_vkoga,
+                flag_gaussian=hyperparameter.flag_gaussian,
+                n_cross_val=hyperparameter.n_cross_val)
+
+
+        ## Store in MATLAB format for beautiful TikZ figure plots
+        path_for_results = os.getcwd() + '/P36_Francesco_Emma/paper_experiments/results/'
+        os.makedirs(path_for_results, exist_ok=True)
+        io.savemat(path_for_results + name_dataset + '_{}'.format(idx_indices) + '.mat',
+                   dict(array_concatenate=array_concatenate,
+                        list_objectives=model.list_obj,
+                        train_hist_deep_f=model_vkoga1.train_hist['f'],
+                        train_hist_deep_f_val=model_vkoga1.train_hist['f val'],
+                        train_hist_std_f=model_vkoga2.train_hist['f'],
+                        train_hist_std_f_val=model_vkoga2.train_hist['f val'],
+                        array_eps=array_eps,
+                        array_cv_f=array_cv_f,
+                        array_cv_f_val=array_cv_f_val,
+                        array_test_rmse_deep=array_test_rmse_deep,
+                        array_test_rmse_cv=array_test_rmse_cv,
+                        dic_timings_2L=dic_timings_2L,
+                        list_timings_1L=list_timings_1L))
+
+        # In MATLAB, load the stored results via:
+        # results = load('<name_dataset>_<idx_indices>.mat')
diff --git a/section_4.2_visualize.py b/section_4.2_visualize.py
new file mode 100644
index 0000000000000000000000000000000000000000..d286d581f736bc5208317f6116b92894a3f1ada9
--- /dev/null
+++ b/section_4.2_visualize.py
@@ -0,0 +1,115 @@
+# DATA-DRIVEN KERNEL DESIGNS FOR OPTIMIZED GREEDY SCHEMES: A MACHINE LEARNING PERSPECTIVE by
+# TIZIAN WENZEL, FRANCESCO MARCHETTI, EMMA PERRACCHIONE
+# Code related to the numerical experiment in Section 4.2, "Machine learning data
+# sets", used to produce Figure 5.
+
+
+import numpy as np
+from matplotlib import pyplot as plt
+from scipy import io
+import os
+import scipy
+import utilities
+
+
+## Some settings
+list_datasets = ['fried', 'sarcos', 'protein', 'ct', 'diamonds',
+                 'stock', 'kegg_undir_uci', 'online_video',
+                 'wecs', 'mlr_knn_rng', 'query_agg_count',
+                 'sgemm', 'road_network', 'methane', 'poker'] #, 'susy', 'higgs']
+
+basepath = utilities.get_basepath()
+path_for_results = basepath + 'P36_Francesco_Emma/paper_experiments/results_5reruns/'
+
+
+## Initialize dictionaries to store several quantities
+# Write all results into dictionaries:
+#   dic_singular_vals: ratios of the cumulative sums of absolute singular values
+#   dic_accuracy_deep: test rmse of the 2L model
+#   dic_accuracy_beststd: test rmse of the best standard model
+dic_singular_vals = {}
+dic_accuracy_deep = {}
+dic_accuracy_beststd = {}
+for name_dataset in list_datasets:
+    dic_singular_vals[name_dataset] = {}
+    dic_accuracy_deep[name_dataset] = {}
+    dic_accuracy_beststd[name_dataset] = {}
+
+
+## Read all files, perform computations, save results
+flag_first_npy_file = True
+for idx_file, file in enumerate(os.listdir(path_for_results)):
+    if file == 'README.md' or file == 'save':
+        continue
+
+    name_dataset = '_'.join(file.split('_')[:-1])
+    idx_rerun = int(file[-5])  # filenames end with '_<idx_rerun>.mat'
+
+    # Compute and store singular value ratios - enforce that they are ordered!
+    results = scipy.io.loadmat(path_for_results + file)
+    A_final = results['array_concatenate'][:, :, -1]
+
+    singular_vals = np.linalg.svd(A_final)[1]
+    abs_sing_vals = np.abs(singular_vals)
+    ordered_abs_sing_vals = np.sort(abs_sing_vals)[::-1]
+
+    ratio = np.cumsum(ordered_abs_sing_vals) / np.sum(ordered_abs_sing_vals)
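+    # ratio[k] is the fraction of the total singular value sum captured by the k+1 largest singular values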
+
+    dic_singular_vals[name_dataset][idx_rerun] = ratio
+
+    # Compute and store accuracies
+    dic_accuracy_deep[name_dataset][idx_rerun] = results['array_test_rmse_deep']
+    dic_accuracy_beststd[name_dataset][idx_rerun] = np.min(results['array_test_rmse_cv'], axis=0)
+
+
+## Calculate a ranking from where 2L performs best to worst: use the mean improvement ratio for this
+array_ratio = np.zeros(len(list_datasets))
+
+for idx_dataset, name_dataset in enumerate(list_datasets):
+
+    array_deep = np.stack([dic_accuracy_deep[name_dataset][idx_rerun] for idx_rerun in range(5)])
+    array_beststd = np.stack([dic_accuracy_beststd[name_dataset][idx_rerun] for idx_rerun in range(5)])
+
+    ratio = np.mean(np.mean(array_deep, axis=0) / np.mean(array_beststd, axis=0))
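+    # ratio < 1 means the 2L (deep) model outperforms the best standard model on average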
+
+    array_ratio[idx_dataset] = ratio
+
+indices_sorted = np.argsort(array_ratio)
+
+
+## Print the calculated ratios: this shows where 2L is superior
+for idx_sorted in indices_sorted:
+    print('{:20}'.format(list_datasets[idx_sorted]), np.round(array_ratio[idx_sorted], 5))
+indices_sorted = indices_sorted[:-3]     # cut off last three datasets
+
+
+## Visualization of the ratios of cumulative singular value sums to the total sum of all singular values
+R = np.linspace(0, 1, len(list_datasets))
+array_color = plt.cm.hsv(R)
+
+dic_to_store = {}
+
+list_legend = []
+plt.figure(2)
+plt.clf()
+idx_color = len(indices_sorted)
+for idx_sorted in indices_sorted[::-1]:
+    name_dataset = list_datasets[idx_sorted]
+    array = np.stack([dic_singular_vals[name_dataset][idx_rerun] for idx_rerun in range(5)])
+    plt.errorbar(np.linspace(0, 1, array.shape[1]), np.mean(array, axis=0), yerr=np.std(array, axis=0),
+                 color=array_color[idx_color, :])
+    # list_legend.append('{} '.format(idx_color) + name_dataset)
+    list_legend.append(name_dataset)
+    idx_color -= 1
+
+    dic_to_store[name_dataset] = array
+plt.legend(list_legend)
+
+
+## Store in MATLAB format for beautiful TikZ figure plots
+# io.savemat('dic_singular_value_ratio.mat', dic_to_store)
+
+
+
+
+
diff --git a/section_4.3_compute.py b/section_4.3_compute.py
new file mode 100644
index 0000000000000000000000000000000000000000..74e6a3bfcc492b8b2fabb9bb16e895032d22911a
--- /dev/null
+++ b/section_4.3_compute.py
@@ -0,0 +1,109 @@
+# DATA-DRIVEN KERNEL DESIGNS FOR OPTIMIZED GREEDY SCHEMES: A MACHINE LEARNING PERSPECTIVE by
+# TIZIAN WENZEL, FRANCESCO MARCHETTI, EMMA PERRACCHIONE
+# Code related to the numerical experiment in Section 4.3, "Stability of the kernel
+# optimization", used to produce the results for Figure 6.
+
+# Similar to 03_4_stability_investigations.py, but now the VKOGA is also run after the kernel optimization.
+# I ran this file on ic6, ic7 and ic8 to compute the results, which are collected in the folder results_stability.
+
+import os
+import numpy as np
+from scipy.stats import ortho_group  # Requires version 0.18 of scipy
+import pickle
+import utilities
+
+from P36_Francesco_Emma.utilities.dataset_collection import Dataset
+from P36_Francesco_Emma.utilities.hyperparameters import dic_hyperparams
+from P36_Francesco_Emma.utilities.main_function import run_everything
+
+
+## Some settings
+list_nctrs = [int(np.round(nr)) for nr in np.logspace(np.log10(10), np.log10(1000), 10)]
+
+basepath = utilities.get_basepath()
+path_for_results = basepath + 'P36_Francesco_Emma/paper_experiments/results_stability/'
+
+
+flag_gaussian = False
+n_reruns = 5
+
+list_datasets = ['fried', 'sarcos', 'protein', 'ct', 'diamonds',
+                 'stock', 'kegg_undir_uci', 'online_video',
+                 'wecs', 'mlr_knn_rng', 'query_agg_count',
+                 'sgemm', 'road_network', 'methane', 'poker'] #, 'susy', 'higgs']
+
+
+## Loop over all datasets
+for idx_dataset, name_dataset in enumerate(list_datasets):
+
+    # Load dataset
+    dataset = Dataset()
+    X, y = dataset.get_data(name_dataset)
+    hyperparameter = dic_hyperparams[name_dataset]
+
+    # list_A_start = [np.eye(X.shape[1])] + [ortho_group.rvs(dim=X.shape[1]) for _ in range(n_reruns - 1)]
+    list_A_start = [np.eye(X.shape[1]) for _ in range(n_reruns)]
+
+    # Use only a fixed set of indices - we do not aim at an error bar plot!
+    list_idx = [1, 2, 3, 4]
+
+    ## Loop to compute and store all initial and final matrices for the different nfolds optimizations
+    dic_results = {}
+    for idx_index in list_idx:
+        dic_results[idx_index] = {}
+
+        list_nfolds = [64, 32, 16, 8, 4]
+        assert len(list_nfolds) == len(list_A_start), 'list_nfolds is not of same length as list_A_start'
+
+        ## Load the hyperparameters
+        hyperparameter = dic_hyperparams[name_dataset]
+
+        ## Run everything for 2L
+        for nfold in list_nfolds:
+
+            A_start, A_optimized, model, model_vkoga1, model_vkoga2, data, _, array_test_rmse_deep, _ = run_everything(
+                name_dataset,
+                hyperparameter.maxIter_vkoga, hyperparameter.N_points,
+                hyperparameter.noise_level, hyperparameter.reg_para_optim, hyperparameter.reg_para_vkoga,
+                hyperparameter.learning_rate, hyperparameter.n_epochs,
+                hyperparameter.batch_size,
+                n_folds=nfold,
+                flag_initialize_diagonal=hyperparameter.flag_initialize_diagonal,
+                flag_symmetric_A=hyperparameter.flag_symmetric,
+                flag_gaussian=hyperparameter.flag_gaussian,
+                flag_optim_verbose=False,
+                flag_std_vkoga=False,
+                idx_rerun=idx_index)
+
+            # Store the initial and the final matrix
+            dic_results[idx_index][nfold] = [A_start, A_optimized, model_vkoga1.train_hist['f'],
+                                             array_test_rmse_deep]
+
+
+        ## Run everything for 1L (there is only one 1L model) --> stored under nfold = 0
+        A_start, A_optimized, _, model_vkoga1, _, _, _, array_test_rmse_deep, _ = run_everything(
+            name_dataset,
+            hyperparameter.maxIter_vkoga, hyperparameter.N_points,
+            hyperparameter.noise_level, hyperparameter.reg_para_optim, hyperparameter.reg_para_vkoga,
+            hyperparameter.learning_rate, n_epochs=0,
+            batch_size=hyperparameter.batch_size,
+            n_folds=list_nfolds[0], # 64 <-- does not matter as long as we use n_epochs=0
+            flag_initialize_diagonal=hyperparameter.flag_initialize_diagonal,
+            flag_symmetric_A=hyperparameter.flag_symmetric,
+            flag_gaussian=hyperparameter.flag_gaussian,
+            flag_optim_verbose=False,
+            flag_std_vkoga=False,
+            idx_rerun=idx_index)
+
+        # Store the initial and the final matrix
+        dic_results[idx_index][0] = [A_start, A_optimized, model_vkoga1.train_hist['f'],
+                                     array_test_rmse_deep]
+
+
+        ## Pickle store dic results
+        os.makedirs(path_for_results, exist_ok=True)
+        filename = 'dic_results_{}_'.format(idx_index) + name_dataset + '.pkl'
+        file_pickle = os.path.join(path_for_results, filename)
+        with open(file_pickle, 'wb') as f:
+            pickle.dump(dic_results, f)
+
diff --git a/section_4.3_visualize.py b/section_4.3_visualize.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d2ce7d26ed5c6bda9807e1e45b0a79f7e12264c
--- /dev/null
+++ b/section_4.3_visualize.py
@@ -0,0 +1,247 @@
+# DATA-DRIVEN KERNEL DESIGNS FOR OPTIMIZED GREEDY SCHEMES: A MACHINE LEARNING PERSPECTIVE by
+# TIZIAN WENZEL, FRANCESCO MARCHETTI, EMMA PERRACCHIONE
+# Code related to the numerical experiment in Section 4.3, "Stability of the kernel
+# optimization", used to produce the plots for Figure 6.
+
+# Evaluation and analysis of the results obtained from 04_1_stability_investigations.py.
+# Only the wecs dataset is a bit weird; here I do not understand what is going on:
+# only the first singular vectors are aligned, the other ones are not!
+
+import os
+from scipy import io
+import numpy as np
+from scipy.stats import ortho_group  # Requires version 0.18 of scipy
+from matplotlib import pyplot as plt
+import pickle
+import scipy
+import utilities
+
+
+## Some settings
+list_nctrs = [int(np.round(nr)) for nr in np.logspace(np.log10(10), np.log10(1000), 10)]
+
+basepath = utilities.get_basepath()
+path_for_results = basepath + 'P36_Francesco_Emma/paper_experiments/results_stability/'
+
+
+## First, collect results from all files in a common dictionary
+dic_results = {}        # dic_results --> name_dataset --> idx_index --> nfold --> [A_start, A_optimized, model_vkoga1.train_hist['f'], array_test_rmse_deep]
+for idx_file, file in enumerate(os.listdir(path_for_results)):
+    idx_index = int(file.split('_')[2])
+    name_dataset = os.path.splitext('_'.join(file.split('_')[3:]))[0]
+    print(name_dataset)
+
+    # Load file
+    with open(path_for_results + file, 'rb') as f:
+        dic_test_preds = pickle.load(f)
+
+    # Key name_dataset already available?
+    if name_dataset not in dic_results:
+        dic_results[name_dataset] = {}
+
+    # Enter all the data
+    for idx_index in dic_test_preds:
+        dic_results[name_dataset][idx_index] = dic_test_preds[idx_index]
+
+
+
+## Second, loop over name_dataset and compute singular value decompositions
+dic_singvals = {}               # dic_singvals --> name_dataset --> idx_index --> nfold --> ordered singular values
+dic_singvecs = {}               # dic_singvecs --> name_dataset --> idx_index --> nfold --> correspondingly ordered singular vectors
+
+for idx_dataset, name_dataset in enumerate(dic_results):
+
+    # if name_dataset != 'wecs' and name_dataset != 'ct' and name_dataset != 'sgemm':
+    #     continue
+
+    dic_singvals[name_dataset] = {}
+    dic_singvecs[name_dataset] = {}
+
+    for idx_index in dic_results[name_dataset]:
+
+        dic_singvals[name_dataset][idx_index] = {}
+        dic_singvecs[name_dataset][idx_index] = {}
+
+        for nfold in dic_results[name_dataset][idx_index]:
+
+            # Compute singular values of finally optimized matrix
+            A_start = dic_results[name_dataset][idx_index][nfold][0]
+            A_optimized = dic_results[name_dataset][idx_index][nfold][1]
+
+            # Use singular value decomposition
+            u, singular_vals, vh = np.linalg.svd(A_optimized)  # A == (u @ np.diag(s)) @ vh
+            indices = np.argsort(np.abs(singular_vals))
+            indices = indices[::-1]
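+            # note: np.linalg.svd already returns non-negative singular values in descending order; the sort is kept as a safeguard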
+
+            singular_vals = singular_vals[indices]
+            singular_vecs = u[:, indices]
+
+            dic_singvals[name_dataset][idx_index][nfold] = singular_vals
+            dic_singvecs[name_dataset][idx_index][nfold] = singular_vecs
+
+
+
+## Third, loop over name_dataset and compute quantities for plots and visualize them (average over the reruns)
+max_dim = 15  # maximum dimension to which we want to check similarity
+
+nfold_ref = 64  # we will use this as a reference model!
+list_idx = [0, 1, 2, 3, 4]
+
+for idx_dataset, name_dataset in enumerate(list(dic_results.keys())):
+
+    # Skip several datasets
+    if name_dataset in ['methane', 'poker', 'protein']:
+        continue
+    if name_dataset not in ['kegg_undir_uci', 'fried', 'ct', 'mlr_knn_rng']:
+        continue
+
+    max_dim1 = min(max_dim, dic_singvals[name_dataset][idx_index][nfold].shape[0])  # idx_index, nfold remain from the loops above
+
+
+    ## Compute quantities for the subsequent plots
+    dic_alignment = {}
+    dic_singvalsarray = {}
+
+    for idx_nfold, nfold in enumerate(list(dic_results[name_dataset][0].keys())):
+
+        # if nfold == nfold_ref:
+        #     continue  # this is our reference model
+
+
+        # Average results over the different runs (i.e. the different training test splits)
+        list_list_similarities = []
+        for idx_index in list_idx:
+
+            list_similarity = []
+            for dim in range(max_dim1):
+                array = scipy.linalg.subspace_angles(dic_singvecs[name_dataset][idx_index][nfold][:, :(dim + 1)],
+                                                            dic_singvecs[name_dataset][idx_index][nfold_ref][:, :(dim + 1)])
+
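+                # scipy.linalg.subspace_angles returns the principal angles in descending order, so array[0] is the largest angle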
+                # Maybe modify this measure here? 90 degrees means orthogonality!!
+                similarity = array[0]
+                list_similarity.append(np.rad2deg(similarity))
+
+            list_list_similarities.append(list_similarity)
+
+        array_angles = np.stack(list_list_similarities)
+
+        # Write values to dic for storing: prepend "A" because MATLAB struct field names must not start with a digit
+        dic_alignment['A{}'.format(nfold)] = array_angles
+        dic_singvalsarray['A{}'.format(nfold)] = np.stack([dic_singvals[name_dataset][idx_index][nfold] for idx_index in dic_singvals[name_dataset]])
+
+
+    ## Errorbar plot of subspace angles
+    plt.figure(800 + idx_dataset)
+    plt.clf()
+    list_legend = []
+    for idx_nfold, nfold in enumerate(list(dic_results[name_dataset][0].keys())):
+        plt.errorbar(x=np.arange(1, 1 + max_dim1) + (-2 + idx_nfold) * .07,  # shift of 0.07 otherwise errorbars overlap
+                     y=np.mean(dic_alignment['A{}'.format(nfold)], axis=0),
+                     yerr=np.std(dic_alignment['A{}'.format(nfold)], axis=0))
+        list_legend.append('{} vs {}'.format(nfold_ref, nfold if not nfold == 0 else '1L'))
+
+    plt.title(name_dataset + ' (dim={}): Alignment of subspaces'.format(dic_singvals[name_dataset][idx_index][nfold].shape[0]))
+    plt.legend(list_legend)
+
+
+    ## Errorbar plot of singular values
+    plt.figure(900 + idx_dataset)
+    plt.clf()
+    list_legend = []
+    for idx_nfold, nfold in enumerate(list(dic_results[name_dataset][0].keys())):
+        plt.errorbar(x=np.arange(1, 1 + dic_singvalsarray['A0'].shape[1]) + (-2 + idx_nfold) * .07,
+                     y=np.mean(dic_singvalsarray['A{}'.format(nfold)], axis=0),
+                     yerr=np.std(dic_singvalsarray['A{}'.format(nfold)], axis=0))
+        list_legend.append('{} vs {}'.format(nfold_ref, nfold if not nfold == 0 else '1L'))
+
+    plt.title(name_dataset + ' (dim={}): Singular values'.format(dic_singvals[name_dataset][idx_index][nfold].shape[0]))
+    plt.legend(list_legend)
+    plt.yscale('log')
+
+
+    ## Store in matlab for beautiful tikzfigure plots
+    io.savemat('dic_stabilityalignment_' + name_dataset + '_nfolds.mat', dic_alignment)
+    io.savemat('dic_singvals_' + name_dataset + '_nfolds.mat', dic_singvalsarray)
+
+    if idx_dataset > 20:
+        break
+
+
+
+# =============================================================================
+# ============ The following code is meant for some investigations ============
+# =============================================================================
+
+
+
+## Single plot: Compute the angles between the largest singular subspaces. We use nfolds = 64 as reference!
+max_dim = 15  # maximum dimension to which we want to check similarity
+name_dataset = 'wecs'
+
+nfold_ref = 64  # we will use this as a reference model!
+list_idx = [0, 1, 2, 3, 4]
+
+for idx_index in list_idx:
+
+    plt.figure(110 + idx_index)
+    plt.clf()
+    list_legend = []
+
+    for nfold in dic_results[name_dataset][idx_index]:
+        if nfold == nfold_ref:
+            continue  # this is our reference model
+
+        list_similarity = []
+        for dim in range(max_dim):
+            array_angles = scipy.linalg.subspace_angles(dic_singvecs[name_dataset][idx_index][nfold][:, :(dim + 1)],
+                                                        dic_singvecs[name_dataset][idx_index][nfold_ref][:, :(dim + 1)])
+
+            # Maybe modify this measure here? 90 degrees means orthogonality!!
+            similarity = array_angles[0]
+
+            list_similarity.append(np.rad2deg(similarity))
+
+        plt.plot(list(range(max_dim)), list_similarity)
+        list_legend.append('{} vs {}'.format(nfold_ref, nfold))
+
+    # energy_captured = np.sum(np.abs(dic_singvals[idx_index][idx_nfold][-10:])) / np.sum(np.abs(dic_singvals[idx_index][idx_nfold]))
+    # plt.title(name_dataset + ': energy captured within {} dims: {:.3f}%'.format(max_dim, 100*energy_captured))
+    plt.title(name_dataset + ' (dim={}): Alignment of subspaces'.format(dic_singvals[name_dataset][idx_index][nfold].shape[0]))
+    plt.legend(list_legend)
+
+
+## Single plot: Plot the first few singular vectors
+dic_colors = {0: 'k', 4: 'm', 8: 'g', 16: 'r', 32: 'b', 64: 'y'}
+max_vectors = 3  # number of leading singular vectors to plot
+name_dataset = 'wecs'
+
+list_idx = [0, 1, 2, 3, 4]
+
+for idx_vector in range(max_vectors):
+
+    plt.figure(1000 + idx_vector)
+    plt.clf()
+    list_legend = []
+
+    flag_legend = True
+    for idx_index in list_idx:
+
+        for nfold in dic_results[name_dataset][idx_index]:
+
+            if flag_legend:
+                list_legend.append(nfold)
+
+            # ToDo: Flip if necessary
+            plt.plot(dic_singvecs[name_dataset][idx_index][nfold][:, idx_vector], color=dic_colors[nfold])
+
+
+        flag_legend = False
+
+
+    # energy_captured = np.sum(np.abs(dic_singvals[idx_index][idx_nfold][-10:])) / np.sum(np.abs(dic_singvals[idx_index][idx_nfold]))
+    # plt.title(name_dataset + ': energy captured within {} dims: {:.3f}%'.format(max_dim, 100*energy_captured))
+    plt.title(name_dataset + ' (dim={}): Plot of singular vectors'.format(dic_singvals[name_dataset][idx_index][nfold].shape[0]))
+    plt.legend(list_legend)
+
+
+
diff --git a/utils/cv_rippa_ext.py b/utils/cv_rippa_ext.py
new file mode 100644
index 0000000000000000000000000000000000000000..d053604e4172f83bd77749b4b3d3b7e519a2bf99
--- /dev/null
+++ b/utils/cv_rippa_ext.py
@@ -0,0 +1,83 @@
+# Code copied from P07_ML_MORE/Utilities/cv_rippa_ext.py and slightly modified.
+
+import torch
+
+
+def compute_cv_loss_via_rippa_ext(kernel, X, y, n_folds, reg_for_matrix_inversion):
+    """
+    Implementation that requires a kernel and the points to be provided.
+    """
+
+    # Some precomputations
+    kernel_matrix = kernel(X, X) + reg_for_matrix_inversion * torch.eye(X.shape[0])
+    inv_kernel_matrix = torch.inverse(kernel_matrix)
+    coeffs = torch.linalg.solve(kernel_matrix, y)
+
+    # Some initializations and preparations: It is required that n_folds divides y.shape[0] without remainder
+    array_error = torch.zeros(y.shape[0], 1)
+    n_per_fold = int(y.shape[0] / n_folds)
+    indices = torch.arange(0, y.shape[0]).view(n_per_fold, n_folds)
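+    # column j of 'indices' holds the sample indices of fold j (interleaved assignment)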
+
+    # Standard Rippa's scheme
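+    # (leave-one-out case: Rippa's identity e_k = c_k / (K^{-1})_{kk}, with coefficients c and regularized kernel matrix K)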
+    if n_folds == y.shape[0]:
+        array_error = coeffs / torch.diag(inv_kernel_matrix)
+
+    # Extended Rippa's scheme
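+    # (fold-wise version: solve the diagonal block of K^{-1} belonging to each fold against the corresponding coefficients)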
+    else:
+        for j in range(n_folds):
+            # inv_kernel_matrix_loc = inv_kernel_matrix[indices[:, j], indices[:, j]]
+            inv_kernel_matrix_loc1 = inv_kernel_matrix[indices[:, j], :]
+            inv_kernel_matrix_loc = inv_kernel_matrix_loc1[:, indices[:, j]]
+
+            array_error[j * n_per_fold: (j+1) * n_per_fold, 0] = \
+                (torch.linalg.solve(inv_kernel_matrix_loc, coeffs[indices[:, j]])).view(-1)
+
+    cv_error_sq = torch.sum(array_error ** 2) / array_error.shape[0]
+
+    return cv_error_sq, array_error
+
+
+def compute_cv_loss_via_rippa_ext_2(kernel_matrix, y, n_folds, reg_for_matrix_inversion):
+    """
+    Implementation without the need to provide a kernel and points: simply provide the kernel matrix.
+    """
+
+    # Some precomputations
+    kernel_matrix_reg = kernel_matrix + reg_for_matrix_inversion * torch.eye(kernel_matrix.shape[0])
+    inv_kernel_matrix = torch.inverse(kernel_matrix_reg)
+    coeffs = torch.linalg.solve(kernel_matrix_reg, y)
+
+    # Some initializations and preparations: It is required that n_folds divides y.shape[0] without remainder
+    array_error = torch.zeros(y.shape[0], 1)
+    n_per_fold = int(y.shape[0] / n_folds)
+    indices = torch.arange(0, y.shape[0]).view(n_per_fold, n_folds)
+
+    # Standard Rippa's scheme
+    if n_folds == y.shape[0]:
+        array_error = coeffs / torch.diag(inv_kernel_matrix).view(-1,1)
+
+    # Extended Rippa's scheme
+    else:
+        for j in range(n_folds):
+            # inv_kernel_matrix_loc = inv_kernel_matrix[indices[:, j], indices[:, j]]
+            inv_kernel_matrix_loc1 = inv_kernel_matrix[indices[:, j], :]
+            inv_kernel_matrix_loc = inv_kernel_matrix_loc1[:, indices[:, j]]
+
+            array_error[j * n_per_fold: (j+1) * n_per_fold, 0] = \
+                (torch.linalg.solve(inv_kernel_matrix_loc, coeffs[indices[:, j]])).view(-1)
+
+        # list_for_block = []
+        # for j in range(n_folds):
+        #     inv_kernel_matrix_loc1 = inv_kernel_matrix[j*n_per_fold : (j+1)*n_per_fold, :]
+        #     inv_kernel_matrix_loc = inv_kernel_matrix_loc1[:, j*n_per_fold : (j+1)*n_per_fold]
+        #     list_for_block.append(inv_kernel_matrix_loc)
+        #
+        # from scipy.linalg import block_diag
+        # block_matrix = block_diag(list_for_block) # blockmatrix, each block is of size n_per_fold x n_per_fold!
+        # array_error2 = torch.linalg.solve(block_matrix, coeffs)
+
+    cv_error_sq = torch.sum(array_error ** 2) / array_error.numel()
+
+    return cv_error_sq, array_error
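+
+
+# A minimal self-test sketch (illustrative Gaussian kernel matrix and sizes, not the
+# paper's setup; n_folds must divide the number of points without remainder):
+if __name__ == '__main__':
+    torch.manual_seed(0)
+    X = torch.rand(60, 3)
+    y = torch.rand(60, 1)
+    K = torch.exp(-torch.cdist(X, X) ** 2)
+    cv_err, _ = compute_cv_loss_via_rippa_ext_2(K, y, n_folds=6, reg_for_matrix_inversion=1e-8)
+    print('CV error estimate (6 folds):', cv_err.item())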
diff --git a/utils/dataset_collection.py b/utils/dataset_collection.py
new file mode 100644
index 0000000000000000000000000000000000000000..58a1fab991874b512e135ae841e90d794d0f1361
--- /dev/null
+++ b/utils/dataset_collection.py
@@ -0,0 +1,291 @@
+import numpy as np
+import pandas as pd
+import os
+import h5py
+import math
+import scipy.io as spio
+from P32_just_interpolate.utilities.utils import load_mnist_pair, get_basepath
+import utilities
+
+class Dataset():
+    """
+    Implementation of several toy and real world data sets
+    """
+
+    def __init__(self, N_points=10000):
+
+        self.N_points = N_points
+        self.X = None
+        self.y = None
+
+        self.dic_dataset = {
+            'example_2d': (lambda x: .02 * (x[:, [0]] + x[:, [1]]) ** 2 + np.sin(2 * math.pi * (x[:, [0]] - x[:, [1]])), 2),
+            'example_2d_tiz': (lambda x: x[:, [0]] + (x[:, [0]] + .1 * x[:, [1]]) ** 2, 2),
+            'example_2d_radial': (lambda x: np.exp(-4 * np.sum((x - .5 * np.ones_like(x)) ** 2, axis=1, keepdims=True)), 2),
+            'example_2d_active_1': (lambda x: np.abs(x[:, [0]] - 2*x[:, [1]]), 2),
+            'example_2d_franke': (lambda x: 0.75 * np.exp(-(9 * x[:, [0]] - 2) ** 2 / 4 - (9 * x[:, [1]]- 2) ** 2 / 4)
+                                            + 0.75 * np.exp(-(9 * x[:, [0]] + 1) ** 2 / 49 - (9 * x[:, [1]] + 1) / 10)
+                                            + 0.5 * np.exp(-(9 * x[:, [0]] - 7) ** 2 / 4 - (9 * x[:, [1]] - 3) ** 2 / 4)
+                                            - 0.2 * np.exp(-(9 * x[:, [0]] - 4) ** 2 - (9 * x[:, [1]] - 7) ** 2), 2),
+            'example_3d_active_1': (lambda x: x[:, [0]] + 2 * x[:, [1]], 3),
+            'example_3d_active_2': (lambda x: (x[:, [0]] + x[:, [1]]) ** 2 + np.sin(2 * math.pi * (x[:, [0]] - x[:, [1]])), 3),
+            'example_10d_active': (lambda x: x[:, [0]] * (x[:, [1]] - x[:, [2]]) ** 3 + 2 * np.sin(math.pi * (x[:, [1]] - x[:, [2]])) - np.exp(-2 * x[:, [2]]), 10),
+            'example_10d_vanishing': (lambda x: x @ ((np.arange(10) + 1.0) ** (-2)).reshape(-1, 1), 10),
+            'example_10d_radial': (lambda x: np.exp(-4 * np.sum((x[:, :5] - .5 * np.ones_like(x[:, :5])) ** 2, axis=1, keepdims=True)), 10),
+            'example_5d_radial': (lambda x: np.exp(-4 * np.sum((x[:, :] - .5 * np.ones_like(x[:, :])) ** 2, axis=1, keepdims=True)), 5),
+            'example_5d_active': (lambda x: (x[:, [1]] - x[:, [2]]), 5),
+            'example_5d_faster_conv': (lambda x: np.exp(-4 * np.sum(x[:, :] - .5 * np.ones_like(x[:, :]), axis=1, keepdims=True) ** 2), 5),
+            'example_6d_kink': (lambda x: (np.exp(-4 * np.sum((x[:, :] - .5 * np.ones_like(x[:, :])) ** 2, axis=1, keepdims=True)) + 2 * np.abs(x[:, [0]] - .5)), 6),
+            'example_7d_semiactive': (lambda x: np.exp(-4 * np.sum((x[:, :] - .5 * np.ones_like(x[:, :])) ** 2, axis=1, keepdims=True))
+                                      + np.exp(-9 * np.sum((x[:, :2] - .3 * np.ones_like(x[:, :2])) ** 2, axis=1, keepdims=True)), 7),
+            # 'example_winkle': (lambda x: ((np.sin(x[:, [0]]) * np.exp(x[:, [1]]) + x[:, [2]]) ** ((x[:, [3]] + 1) ** 2))
+            #                              / ((x[:, [3]] + 1) ** 2), 5),
+        }
+
+
+    def get_data(self, name_dataset):
+
+        X, y = None, None
+
+        if name_dataset in self.dic_dataset.keys():
+            fcn, dim = self.dic_dataset[name_dataset]
+
+            X = np.random.rand(self.N_points, dim)
+            y = fcn(X)
+
+        elif name_dataset == 'example_2d_clusters':
+            X, y = self.example_2d_clusters()
+        elif name_dataset[:24] == 'example_highdim_clusters':
+            dim = int(name_dataset.split('_')[-1])
+            X, y = self.example_highdim_clusters(dim)
+        elif name_dataset[:10] == 'toyexample':
+            X, y = self.toyexample(name_dataset)
+        else:
+            if name_dataset[:13] == 'example_mnist':
+                X, y = self.example_mnist(name_dataset)
+            elif name_dataset == 'example_TUD':
+                X, y = self.example_TUD()
+            elif name_dataset in ['ct', 'diamonds', 'fried', 'kegg_undir_uci',
+                                  'methane', 'mlr_knn_rng', 'online_video', 'poker',
+                                  'protein', 'query_agg_count', 'road_network', 'sarcos', 'sgemm',
+                                  'stock', 'wecs']:
+                X, y = self.example_holzmuller(name_dataset)
+            elif name_dataset in ['flights', 'higgs', 'susy', 'taxi', 'timit']:
+                X, y = self.falkon_datasets(name_dataset)
+            elif name_dataset in ['uci_airfoil_self_noise', 'uci_CCPP']:
+                X, y = self.example_uci(name_dataset)
+
+        assert X is not None, 'Unknown dataset: {}!'.format(name_dataset)
+        assert y is not None, 'Unknown dataset: {}!'.format(name_dataset)
+
+        return X, y
+
+    def example_2d_clusters(self):
+
+        X1 = np.random.randn(100, 2)
+        X2 = .1 * np.random.randn(500, 2) + np.array([.5, .5])
+        X3 = .2 * np.random.randn(500, 2) + np.array([-.5, -.5])
+
+        f_func = lambda x: 1 / (1 + np.abs(x[:, 0] - .5))
+
+        X = np.concatenate([X1, X2, X3], axis=0)
+        y = f_func(X)
+
+        return X, y
+
+    def example_highdim_clusters(self, dim):
+        # In the meeting with Antoine and Giacomo we came up with a hypothesis about when greedy works better.
+
+        # X1 = np.random.rand(1000, dim)
+        # X2 = .1 * np.random.rand(5000, dim) + .5 * np.zeros((1, dim))
+        # X3 = .2 * np.random.rand(5000, dim) - .5 * np.zeros((1, dim))
+        # X = np.concatenate([X1, X2, X3], axis=0)
+        #
+        # y = np.ones((X.shape[0], 1))
+        # y[X[:, 0] > .5] = -1
+
+        N_points = self.N_points
+
+        # Example 1: Peak at center, then decay. randn distribution.
+        # X = np.random.randn(N_points, dim)
+        # center = .5 * np.ones((1, dim))
+        # y = 1 / (1 + 5 * np.linalg.norm(X - center, axis=1, keepdims=True))
+        # y = y + 1e-3 * np.random.randn(y.shape[0], 1)
+
+
+        # Example2: Peak at center, then decay. sparse randn distribution and two clusters.
+        X1 = np.random.randn(int(.1*N_points), dim)
+        X2 = .1 * np.random.rand(int(.45*N_points), dim) + .5 * np.ones((1, dim))
+        X3 = .2 * np.random.rand(int(.45*N_points), dim) - .5 * np.ones((1, dim))
+        X = np.concatenate([X1, X2, X3], axis=0)
+
+        center = .5 * np.ones((1, dim))
+        y = 1 / (1 + 5 * np.linalg.norm(X - center, axis=1, keepdims=True))
+        y = y + 1e-3 * np.random.randn(y.shape[0], 1)
+
+        return X, y
+
+
+    def example_mnist(self, name_dataset):
+        """MNIST dataset"""
+
+        X_train, X_test, y_train, y_test = \
+            load_mnist_pair(get_basepath(),
+                            [int(name_dataset[-2]), int(name_dataset[-1])])
+
+        X = np.concatenate([X_train, X_test], axis=0)
+        y = np.concatenate([y_train, y_test], axis=0).reshape(-1, 1)
+
+        return X, y
+
+    def example_uci(self, name_dataset):
+        """Some UCI datasets"""
+
+        # path = '/usr/local/home/wenzeltn/deepkernel-pytorch2/data/UCI/'
+        path = '/home/wenzeltn/local_home/deepkernel-pytorch2/data/UCI/'
+
+        if name_dataset == 'uci_airfoil_self_noise':
+            data = np.loadtxt(path + 'airfoil_self_noise.dat', unpack=True).transpose()
+
+            X = data[:, :5]
+            y = data[:, [5]]
+
+            X = (X - X.mean(axis=0, keepdims=True)) / (X.std(axis=0, keepdims=True) + 1e-30)
+            y = (y - y.mean(axis=0, keepdims=True)) / (y.std(axis=0, keepdims=True) + 1e-30)
+
+        if name_dataset == 'uci_CCPP':
+            data = pd.read_excel(path + 'CCPP/' + 'Folds5x2_pp.xlsx').to_numpy()
+
+            X = data[:, :4]
+            y = data[:, [4]]
+
+            X = (X - X.mean(axis=0, keepdims=True)) / (X.std(axis=0, keepdims=True) + 1e-30)
+            y = (y - y.mean(axis=0, keepdims=True)) / (y.std(axis=0, keepdims=True) + 1e-30)
+
+        return X, y
+
+    def example_TUD(self):
+        """Felix Döppel data"""
+
+        path = '/home/wenzeltn/local_home/deepkernel-pytorch2/P01_DK_optimization/data/'
+        # path = '/usr/local/home/wenzeltn/deepkernel-pytorch2/P01_DK_optimization/data/'
+        mat_train = spio.loadmat(path + 'DataTUDKinetics.mat', squeeze_me=True)
+        # mat_test = spio.loadmat(path + 'TestTUDKinetics.mat', squeeze_me=True)
+
+        X = mat_train['input']
+        y = mat_train['output'].reshape(-1, 1)
+
+        return X, y
+
+    def example_holzmuller(self, ds_name):
+        """Data set from David Holzmüller paper of batch active learning."""
+
+        if os.path.exists('/usr/local/home/wenzeltn'):   # anm03
+            path = '/usr/local/home/wenzeltn/deepkernel-pytorch2/data/data_holzmuller/'
+        elif os.path.exists('/usr/local/storage/wenzeltn'):  # ianscluster
+            path = '/usr/local/storage/wenzeltn/deepkernel-pytorch2/data/data_holzmuller/'
+        elif os.path.exists('/home/wenzeltn/'):     # laptop
+            path = '/home/wenzeltn/local_home/deepkernel-pytorch2/data/data_holzmuller/'
+
+        X = np.load(path + ds_name + '/X.npy')
+        y = np.load(path + ds_name + '/y.npy')
+
+        # Standardization of input according to David H (reduce effect of outliers)
+        X = (X - X.mean(axis=0, keepdims=True)) / (X.std(axis=0, keepdims=True) + 1e-30)
+        X = 5 * np.tanh(0.2 * X)
+
+        y = (y - y.mean(axis=0, keepdims=True)) / (y.std(axis=0, keepdims=True) + 1e-30)
+
+        return X, y
+
+    def toyexample(self, ds_name):
+        """Toy datasets from Tizian created on 04.01.23."""
+
+        basepath = utilities.get_basepath()
+        path = basepath + 'data/TOY_selfcreated/'
+
+        X = np.load(path + ds_name + 'X.npy')
+        y = np.load(path + ds_name + 'y.npy')
+
+        # Standardization of input according to David H (reduce effect of outliers)
+        X = (X - X.mean(axis=0, keepdims=True)) / (X.std(axis=0, keepdims=True) + 1e-30)
+        X = 5 * np.tanh(0.2 * X)
+
+        y = (y - y.mean(axis=0, keepdims=True)) / (y.std(axis=0, keepdims=True) + 1e-30)
+
+        return X, y
+
+
+    def falkon_datasets(self, ds_name):
+        """Datasets from FALKON paper."""
+
+        path = None
+        if os.path.exists('/usr/local/home/wenzeltn'):   # anm03
+            path = '/usr/local/home/wenzeltn/deepkernel-pytorch2/data/FALKON_data/'
+        elif os.path.exists('/usr/local/storage/wenzeltn'):  # ianscluster
+            path = '/usr/local/storage/wenzeltn/deepkernel-pytorch2/data/FALKON_data/'
+        elif os.path.exists('/home/wenzeltn/'):
+            pass
+
+        filename = None
+        if ds_name == 'susy' or ds_name == 'higgs':
+            if ds_name == 'susy':
+                filename = 'Susy.mat'
+            else:
+                filename = 'Higgs.mat'
+
+            with h5py.File(path + filename, "r") as h5py_file:
+                arr = np.asarray(h5py_file['X'], dtype=np.float32).T
+            X = arr[:, 1:]
+            y = arr[:, 0].reshape(-1, 1)
+
+            # Preprocess input
+            mtr = np.mean(X, axis=0, dtype=np.float64, keepdims=True).astype(X.dtype)
+            vtr = np.var(X, axis=0, dtype=np.float64, ddof=1, keepdims=True).astype(X.dtype)
+
+            X -= mtr
+            X /= vtr
+
+            # Preprocess outputs
+            y = y * 2 - 1
+
+
+        elif ds_name == 'timit':    # num_train_samples = 1124823
+            pass
+            # ToDo: Does not work, dataset probably damaged
+
+            # filename = 'TIMIT.mat'
+            #
+            # with h5py.File(path + filename, 'r') as h5py_file:
+            #     dtype = np.float32
+            #     Xtr = np.array(h5py_file['Xtr'], dtype=dtype)
+            #     Xts = np.array(h5py_file['Xts'], dtype=dtype)
+            #     Ytr = np.array(h5py_file['Ytr'], dtype=dtype).reshape((-1,))
+            #     Yts = np.array(h5py_file['Yts'], dtype=dtype).reshape((-1,))
+            #     X = np.concatenate((Xtr, Xts), axis=0)
+            #     y = np.concatenate((Ytr, Yts), axis=0)
+            #
+            #
+            # #
+            # # f = spio.loadmat(path + filename)
+            # # dtype = np.float32
+            # # Xtr = np.array(f['Xtr'], dtype=dtype)
+            # # Xts = np.array(f['Xts'], dtype=dtype)
+            # # Ytr = np.array(f['Ytr'], dtype=dtype).reshape((-1,))
+            # # Yts = np.array(f['Yts'], dtype=dtype).reshape((-1,))
+            # # X = np.concatenate((Xtr, Xts), axis=0)
+            # # y = np.concatenate((Ytr, Yts), axis=0)
+
+        # elif ds_name == 'flights':
+        #     filename = 'flights.csv'
+        #     # Not implemented, because I have a .csv file, Giacomo uses an .hdf5 file
+        #
+        # elif ds_name == 'taxi':
+        #     pass
+        #     # Not implemented because super large
+
+        # ToDo: Not implemented: Taxi since very large, flights only in .csv, timit is broken
+
+        return X, y
+
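+# A minimal usage sketch for the synthetic datasets (assumes the module-level imports
+# resolve; the real-world loaders above depend on machine-specific data paths):
+if __name__ == '__main__':
+    dataset = Dataset(N_points=1000)
+    X_demo, y_demo = dataset.get_data('example_6d_kink')
+    print(X_demo.shape, y_demo.shape)  # expected: (1000, 6) (1000, 1)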
diff --git a/utils/hyperparameters.py b/utils/hyperparameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..02469eea921fc1cbb321ea65b2946e26ffc01f2e
--- /dev/null
+++ b/utils/hyperparameters.py
@@ -0,0 +1,163 @@
+
+class example_func_approx():
+    # No noise will be added, because we truly want to approximate the underlying function.
+
+    maxIter_vkoga = 250
+    N_points = 50000
+    noise_level = 0
+
+    reg_para_optim = 1e-5       # for kernel optimization
+    reg_para_vkoga = 0          # for running VKOGA
+    learning_rate = 5e-3
+    n_epochs = 25
+    batch_size = 64
+    n_folds = None
+
+    flag_initialize_diagonal = True
+    flag_symmetric = False
+    flag_gaussian = False
+
+    n_cross_val = 10
+    shape_para = 1
+    k_matern = 0
+
+
+class example_2d(example_func_approx):
+    pass
+
+class example_10d_vanishing(example_func_approx):
+    flag_gaussian = True
+
+# class example_toy():
+#     maxIter_vkoga = 250
+#     N_points = 10000
+#     noise_level = 1e-3
+#
+#     reg_para_optim = 1e-5       # for kernel optimization
+#     reg_para_vkoga = 0          # for running VKOGA
+#     learning_rate = 5e-3
+#     n_epochs = 25
+#     batch_size = 64
+#     n_folds = None
+#
+#     flag_initialize_diagonal = True
+#     flag_symmetric = False
+#     flag_gaussian = False
+#
+#     n_cross_val = 10
+#     shape_para = 1
+#     k_matern = 0
+
+
+class example_TUD():
+    maxIter_vkoga = 500
+    N_points = None
+    noise_level = 0
+
+    reg_para_optim = 1e-5       # for kernel optimization
+    reg_para_vkoga = 0          # for running VKOGA
+    learning_rate = 5e-3
+    n_epochs = 50
+    batch_size = 64
+    n_folds = None
+
+    flag_initialize_diagonal = True
+    flag_symmetric = False
+    flag_gaussian = True
+
+    n_cross_val = 10
+    shape_para = 1
+    k_matern = 0
+
+
+class example_holzmuller():
+    maxIter_vkoga = 1000
+    N_points = None
+    noise_level = 0
+
+    reg_para_optim = 1e-3       # for kernel optimization
+    reg_para_vkoga = 1e-4          # for running VKOGA
+    learning_rate = 5e-3
+    n_epochs = 25
+    batch_size = 64
+    n_folds = None
+
+    flag_initialize_diagonal = True
+    flag_symmetric = False
+    flag_gaussian = False
+
+    n_cross_val = 10
+    shape_para = 1 / 5
+    k_matern = 0
+
+class example_holzmuller_few_epochs(example_holzmuller):
+    n_epochs = 10
+
+
+
+class example_mnist(example_holzmuller):
+    n_epochs = 5
+    maxIter_vkoga = 200
+
+class example_airfoil(example_holzmuller):
+    n_epochs = 25
+    learning_rate = 2e-2
+    maxIter_vkoga = 500
+    k_matern = 0
+
+class example_CCPP(example_holzmuller):
+    n_epochs = 25
+    learning_rate = 2e-2
+    maxIter_vkoga = 500
+    k_matern = 0
+
+
+
+dic_hyperparams = {
+    'example_2d': example_func_approx(),
+    'example_2d_tiz': example_func_approx(),
+    'example_2d_radial': example_func_approx(),
+    'example_2d_active_1': example_func_approx(),
+    'example_2d_franke': example_func_approx(),
+    'example_3d_active_1': example_func_approx(),
+    'example_3d_active_2': example_func_approx(),
+    'example_10d_active': example_func_approx(),
+    'example_10d_vanishing': example_func_approx(),
+    'example_10d_radial': example_func_approx(),
+    'example_5d_radial': example_func_approx(),
+    'example_5d_active': example_func_approx(),
+    'example_5d_faster_conv': example_func_approx(),
+    'example_6d_kink': example_func_approx(),
+    'example_7d_semiactive': example_func_approx(),
+    'example_winkle': example_func_approx(),
+    'example_TUD': example_TUD(),
+
+
+    'ct': example_holzmuller_few_epochs(),
+    'diamonds': example_holzmuller_few_epochs(),
+    'fried': example_holzmuller_few_epochs(),
+    'kegg_undir_uci': example_holzmuller_few_epochs(),
+    'methane': example_holzmuller_few_epochs(),
+    'mlr_knn_rng': example_holzmuller_few_epochs(),
+    'online_video': example_holzmuller_few_epochs(),
+    'poker': example_holzmuller_few_epochs(),
+    'protein': example_holzmuller_few_epochs(),
+    'query_agg_count': example_holzmuller_few_epochs(),
+    'road_network': example_holzmuller_few_epochs(),
+    'sarcos': example_holzmuller_few_epochs(),
+    'sgemm': example_holzmuller_few_epochs(),
+    'stock': example_holzmuller_few_epochs(),
+    'wecs': example_holzmuller_few_epochs(),
+
+    'susy': example_holzmuller_few_epochs(),
+    'higgs': example_holzmuller_few_epochs(),
+
+
+    'example_mnist_01': example_mnist(),
+    'example_mnist_34': example_mnist(),
+
+    'uci_airfoil_self_noise': example_airfoil(),
+    'uci_CCPP': example_CCPP(),
+
+}
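+
+# Minimal usage sketch: settings are looked up by dataset name, e.g.
+#   hyperparameter = dic_hyperparams['fried']
+#   print(hyperparameter.maxIter_vkoga, hyperparameter.n_epochs)  # 1000, 10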
+
diff --git a/utils/kernels.py b/utils/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d223479acc1118563f9635d7d15a996605645f6
--- /dev/null
+++ b/utils/kernels.py
@@ -0,0 +1,283 @@
+#!/usr/bin/env python3
+
+from abc import ABC, abstractmethod
+from scipy.spatial import distance_matrix
+import math
+import numpy as np
+import matplotlib.pyplot as plt
+
+# Abstract kernel
+class Kernel(ABC):
+    @abstractmethod    
+    def __init__(self):
+        super().__init__()
+    
+    @abstractmethod
+    def eval(self, x, y):
+        pass
+
+    def eval_prod(self, x, y, v, batch_size=100):
+        N = x.shape[0]
+        num_batches = int(np.ceil(N / batch_size))
+        mat_vec_prod = np.zeros((N, 1)) 
+        for idx in range(num_batches):
+            idx_begin = idx * batch_size
+            idx_end = (idx + 1) * batch_size
+            A = self.eval(x[idx_begin:idx_end, :], y)
+            mat_vec_prod[idx_begin:idx_end] = A @ v
+        return mat_vec_prod
+
+    @abstractmethod
+    def diagonal(self, X):
+        pass
+
+    @abstractmethod
+    def __str__(self):
+        pass
+
+    @abstractmethod
+    def set_params(self, params):
+        pass
+
+# Abstract RBF
+class RBF(Kernel):
+    @abstractmethod    
+    def __init__(self):
+        super(RBF, self).__init__()
+        
+    def eval(self, x, y):
+        return self.rbf(self.ep, distance_matrix(np.atleast_2d(x), np.atleast_2d(y)))
+
+    def diagonal(self, X):
+        return np.ones(X.shape[0]) * self.rbf(self.ep, 0)
+    
+    def __str__(self):
+        return self.name + ' [gamma = %2.2e]' % self.ep
+
+    def set_params(self, par):
+        self.ep = par
+
+# Implementation of concrete RBFs
+class Gaussian(RBF):
+    def __init__(self, ep=1):
+        self.ep = ep
+        self.name = 'gauss'
+        self.rbf = lambda ep, r: np.exp(-(ep * r) ** 2)
+
+class GaussianTanh(RBF):
+    def __init__(self, ep=1):
+        self.ep = ep
+        self.name = 'gauss_tanh'
+        self.rbf = lambda ep, r: np.exp(-(ep * np.tanh(r)) ** 2)
+
+class IMQ(RBF):
+    def __init__(self, ep=1):
+        self.ep = ep
+        self.name = 'imq'
+        self.rbf = lambda ep, r: 1. / np.sqrt(1 + (ep * r) ** 2)
+
+class Matern(RBF):
+    def __init__(self, ep=1, k=0):
+        self.ep = ep
+        if k == 0:
+            self.name = 'mat0'
+            self.rbf = lambda ep, r : np.exp(-ep * r)
+        elif k == -1:
+            self.name = 'derivative kernel of quadratic matern'
+            self.rbf = lambda ep, r: np.exp(-r) * (r**2 - (2 * 1 + 3) * r + 1 ** 2 + 2 * 1)
+        elif k == 1:
+            self.name = 'mat1'
+            self.rbf = lambda ep, r: np.exp(-ep * r) * (1 + ep * r)
+        elif k == 2:
+            self.name = 'mat2'
+            self.rbf = lambda ep, r: np.exp(-ep * r) * (3 + 3 * ep * r + 1 * (ep * r) ** 2)
+        elif k == 3:
+            self.name = 'mat3'
+            self.rbf = lambda ep, r: np.exp(-ep * r) * (15 + 15 * ep * r + 6 * (ep * r) ** 2 + 1 * (ep * r) ** 3)
+        elif k == 4:
+            self.name = 'mat4'
+            self.rbf = lambda ep, r: np.exp(-ep * r) * (105 + 105 * ep * r + 45 * (ep * r) ** 2 + 10 * (ep * r) ** 3 + 1 * (ep * r) ** 4)
+        elif k == 5:
+            self.name = 'mat5'
+            self.rbf = lambda ep, r: np.exp(-ep * r) * (945 + 945 * ep * r + 420 * (ep * r) ** 2 + 105 * (ep * r) ** 3 + 15 * (ep * r) ** 4 + 1 * (ep * r) ** 5)
+        elif k == 6:
+            self.name = 'mat6'
+            self.rbf = lambda ep, r: np.exp(-ep * r) * (10395 + 10395 * ep * r + 4725 * (ep * r) ** 2 + 1260 * (ep * r) ** 3 + 210 * (ep * r) ** 4 + 21 * (ep * r) ** 5 + 1 * (ep * r) ** 6)
+        elif k == 7:
+            self.name = 'mat7'
+            self.rbf = lambda ep, r: np.exp(-ep * r) * (135135 + 135135 * ep * r + 62370 * (ep * r) ** 2 + 17325 * (ep * r) ** 3 + 3150 * (ep * r) ** 4 + 378 * (ep * r) ** 5 + 28 * (ep * r) ** 6 + 1 * (ep * r) ** 7)
+        else:
+            self.name = None
+            self.rbf = None
+            raise Exception('This Matern kernel is not implemented')
+
+class Wendland(RBF):
+    def __init__(self, ep=1, k=0, d=1):
+        self.ep = ep
+        self.name = 'wen_' + str(d) + '_' + str(k)
+        l = int(np.floor(d / 2)) + k + 1
+        if k == 0:
+            p = lambda r: 1
+        elif k == 1:
+            p = lambda r: (l + 1) * r + 1
+        elif k == 2:
+            p = lambda r: (l + 3) * (l + 1) * r ** 2 + 3 * (l + 2) * r + 3
+        elif k == 3:
+            p = lambda r: (l + 5) * (l + 3) * (l + 1) * r ** 3 + (45 + 6 * l * (l + 6)) * r ** 2 + (15 * (l + 3)) * r + 15
+        elif k == 4:
+            p = lambda r: (l + 7) * (l + 5) * (l + 3) * (l + 1) * r ** 4 + (5 * (l + 4) * (21 + 2 * l * (8 + l))) * r ** 3 + (45 * (14 + l * (l + 8))) * r ** 2 + (105 * (l + 4)) * r + 105
+        else:
+            raise Exception('This Wendland kernel is not implemented')
+        c = math.factorial(l + 2 * k) / math.factorial(l)  # math.factorial instead of the removed np.math alias
+        e = l + k
+        self.rbf = lambda ep, r: np.maximum(1 - ep * r, 0) ** e * p(ep * r) / c
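+        # e.g. for d=1, k=0: l=1, p=1, c=1, so the rbf reduces to the hat function max(1 - ep*r, 0)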
+    
+
+# Polynomial kernels
+class Polynomial(Kernel):
+    def __init__(self, a=0, p=1):
+        self.a = a
+        self.p = p
+            
+    def eval(self, x, y):
+        return (np.atleast_2d(x) @ np.atleast_2d(y).transpose() + self.a) ** self.p
+    
+    def diagonal(self, X):
+        return ((np.linalg.norm(X, axis=1)**2 + self.a) ** self.p) #[:, None]
+
+    def __str__(self):
+        return 'polynomial' + ' [a = %2.2e, p = %2.2e]' % (self.a, self.p)
+
+    def set_params(self, par):
+        self.a = par[0]
+        self.p = par[1]
+
+
+# Brownian-type kernels
+class BrownianBridge(Kernel):
+    def __init__(self):
+        super().__init__()
+        self.name = 'Brownian Bridge'
+
+    def eval(self, x, y):
+        return np.minimum(np.atleast_2d(x), np.transpose(np.atleast_2d(y))) - np.atleast_2d(x) * np.transpose(np.atleast_2d(y))
+
+    def diagonal(self, X):
+        return X[:, 0] - X[:, 0] ** 2
+
+    def __str__(self):
+        return 'Brownian Bridge kernel'
+
+    def set_params(self, par):
+        pass
+
+class BrownianMotion(Kernel):
+    def __init__(self):
+        super().__init__()
+        self.name = 'Brownian Motion'
+
+    def eval(self, x, y):
+        return np.minimum(np.atleast_2d(x), np.transpose(np.atleast_2d(y)))
+
+    def diagonal(self, X):
+        return X.reshape(-1)
+
+    def __str__(self):
+        return 'Brownian Motion kernel'
+
+    def set_params(self, par):
+        pass
+
+# Convolution kernel for Wendland k=0 kernel max(0, 1-|x-y|) on [0,1]. For calculation see 11.02.23\A1.
+class ConvKernelWendland(Kernel):
+    def __init__(self):
+        super().__init__()
+        self.name = 'Convolution Kernel Wendland k=0'
+
+        self.func_util = lambda x, y: 1/3 * (-x**3 + 3*x**2*y - 3*x**2 - 3*x*y**2 + 3*x*y + 3/2*x + y**3 - 3*y**2 + 3/2*y + 1)
+
+    def eval(self, x, y):
+        x = np.atleast_2d(x)
+        y = np.atleast_2d(y)
+
+        A1 = self.func_util(x, y.transpose())
+        A2 = self.func_util(y, x.transpose()).transpose()
+
+        # Calculate matrix of logicals to check where x < y
+        diff = x - y.transpose()
+        M = diff <= 0
+
+        return A1 * M + A2 * ~M
+
+    def diagonal(self, X):
+        return 1/3 + X[:, 0] - X[:, 0] ** 2
+
+    def __str__(self):
+        return 'Convolution Kernel Wendland k=0'
+
+    def set_params(self, par):
+        pass
+
+
+# Tensor product kernels
+class TensorProductKernel(Kernel):
+    def __init__(self, kernel):
+        super().__init__()
+
+        self.kernel_1D = kernel
+        self.name = self.kernel_1D.name
+
+    def eval(self, x, y):
+
+        x = np.atleast_2d(x)
+        y = np.atleast_2d(y)
+
+        assert x.shape[1] == y.shape[1], 'Dimensions do not match'
+
+        array_matrix = np.ones((x.shape[0], y.shape[0]))
+
+        for idx_dim in range(x.shape[1]):
+            array_matrix = array_matrix * self.kernel_1D.eval(x[:, [idx_dim]], y[:, [idx_dim]])
+
+        return array_matrix
+
+    def diagonal(self, X):
+
+        X = np.atleast_2d(X)
+
+        array_diagonal = np.ones(X.shape[0])
+
+        for idx_dim in range(X.shape[1]):
+            array_diagonal *= self.kernel_1D.diagonal(X[:, [idx_dim]])
+
+        return array_diagonal
+
+    def __str__(self):
+        return 'Tensor product kernel for ' + self.name
+
+    def set_params(self, par):
+        pass
+
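+# Hypothetical usage sketch (not part of the experiments): build a multivariate
+# kernel as the tensor product of a 1D kernel over all input dimensions.
+def _demo_tensor_product():
+    ker = TensorProductKernel(BrownianBridge())
+    X = np.random.rand(5, 3)  # 5 points in [0, 1]^3
+    K = ker.eval(X, X)        # (5, 5) kernel matrix
+    assert np.allclose(np.diag(K), ker.diagonal(X))
+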
+# A demo usage
+def main():
+    ker = Gaussian()
+
+    x = np.linspace(-1, 1, 100)[:, None]
+    y = np.array([[0.0]])  # np.matrix is deprecated, use a 2D array instead
+    A = ker.eval(x, y)
+
+    fig = plt.figure(1)
+    fig.clf()
+    ax = fig.gca()
+    ax.plot(x, A)
+    ax.set_title('A kernel: ' + str(ker))
+    plt.show()
+
+
+if __name__ == '__main__':
+    main()
+
+
+
diff --git a/utils/main_function.py b/utils/main_function.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea871ee2a46929fee066d0d6c43bba30f3d75333
--- /dev/null
+++ b/utils/main_function.py
@@ -0,0 +1,285 @@
+
+
+from P36_Francesco_Emma.models.optimized_kernel import OptimizedKernel
+from P36_Francesco_Emma.utilities.dataset_collection import Dataset
+from P36_Francesco_Emma.utilities.hyperparameters import dic_hyperparams
+
+from vkoga import tkernels, kernels
+import torch
+from matplotlib import pyplot as plt
+import numpy as np
+import time
+
+from sklearn.model_selection import train_test_split
+from vkoga.kernels import Matern
+from vkoga.vkoga import VKOGA
+import os
+
+from datetime import datetime
+import utilities
+
+
+## Some settings
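+# Numbers of centers at which intermediate test errors are evaluated:
+# ten values, log-spaced between 10 and 1000.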
+list_nctrs = [int(np.round(nr)) for nr in np.logspace(np.log10(10), np.log10(1000), 10)]
+
+basepath = utilities.get_basepath()
+path_for_indices = basepath + 'data/'
+
+
+# Main function to run the kernel optimization with a subsequent VKOGA run
+def run_everything(name_dataset, maxIter_vkoga, N_points, noise_level, reg_para_optim, reg_para_vkoga,
+                   learning_rate, n_epochs, batch_size, n_folds,
+                   flag_initialize_diagonal=False, flag_symmetric_A=False, flag_gaussian=True,
+                   flag_vkoga_verbose=False, flag_plots=False, flag_optim_verbose=True,
+                   flag_std_vkoga=True, idx_rerun=None):
+
+    dic_timings = {}
+
+    ## Load dataset
+    dataset = Dataset(N_points=N_points)
+    X, y = dataset.get_data(name_dataset)
+    hyperparameter = dic_hyperparams[name_dataset]
+
+    # Preprocessing
+    if 'example' in name_dataset:
+        idx = np.arange(X.shape[0])
+    else:
+        assert idx_rerun is not None, 'idx_rerun is not set!'
+        idx = np.load(path_for_indices + '_indices_{}/indices_'.format(idx_rerun) + name_dataset + '.npy')
+
+    n_train = int(.8 * X.shape[0])
+
+    X_train = X[idx[:n_train]]
+    y_train = y[idx[:n_train]]
+    X_test = X[idx[n_train:]]
+    y_test = y[idx[n_train:]]
+    X_train_torch, y_train_torch = torch.from_numpy(X_train).type(torch.float), torch.from_numpy(y_train).type(torch.float)
+    # ToDo: noise_level is currently unused (it was always set to 0 so far)
+
+
+    ## Select kernel
+    if flag_gaussian:
+        kernel_t = tkernels.Gaussian(ep=hyperparameter.shape_para / np.sqrt(X_train.shape[1]))
+        kernel = kernels.Gaussian(ep=hyperparameter.shape_para / np.sqrt(X_train.shape[1]))
+    else:
+        kernel_t = tkernels.Matern(k=hyperparameter.k_matern, ep=hyperparameter.shape_para / np.sqrt(X_train.shape[1]))
+        kernel = kernels.Matern(k=hyperparameter.k_matern, ep=hyperparameter.shape_para / np.sqrt(X_train.shape[1]))
+
+
+
+    ## Optimize kernel model
+    model = OptimizedKernel(kernel=kernel_t, dim=X_train_torch.shape[1],
+                            reg_para=reg_para_optim,
+                            learning_rate=learning_rate,
+                            n_epochs=n_epochs,
+                            batch_size=batch_size,
+                            n_folds=n_folds,
+                            flag_initialize_diagonal=flag_initialize_diagonal,
+                            flag_symmetric_A=flag_symmetric_A)
+
+    A_start = torch.clone(model.A).detach().numpy()
+
+    t_optim_start = time.time()
+    print(datetime.now().strftime("%H:%M:%S"), name_dataset, '2layered kernel optimization started.')
+    model.optimize(X_train_torch, y_train_torch, flag_optim_verbose=flag_optim_verbose)
+    print(datetime.now().strftime("%H:%M:%S"), name_dataset, '2layered kernel optimization finished.')
+    t_optim_stop = time.time()
+
+    dic_timings['2L_optim'] = t_optim_stop - t_optim_start
+
+    ## Application of VKOGA
+    # VKOGA with the optimized kernel: simply pre-apply the learned linear transformation A
+    model_vkoga1 = VKOGA(kernel=kernel, greedy_type='f_greedy',
+                         verbose=flag_vkoga_verbose, reg_par=reg_para_vkoga)
+    t_vkoga1_start = time.time()
+    print(datetime.now().strftime("%H:%M:%S"), name_dataset, '2layered VKOGA started.')
+    _ = model_vkoga1.fit(X_train @ model.A.detach().numpy(), y_train,
+                         X_val=X_test @ model.A.detach().numpy(), y_val=y_test,
+                         maxIter=maxIter_vkoga)
+    print(datetime.now().strftime("%H:%M:%S"), name_dataset, '2layered VKOGA finished.')
+    t_vkoga1_stop = time.time()
+
+    dic_timings['2L_vkoga'] = t_vkoga1_stop - t_vkoga1_start
+
+    # VKOGA with standard kernel
+    model_vkoga2 = VKOGA(kernel=kernel, greedy_type='f_greedy',
+                         verbose=flag_vkoga_verbose, reg_par=reg_para_vkoga)
+    t_vkoga2_start = time.time()
+    print(datetime.now().strftime("%H:%M:%S"), name_dataset, 'Standard VKOGA started.')
+    if flag_std_vkoga:
+        _ = model_vkoga2.fit(X_train, y_train,
+                             X_val=X_test, y_val=y_test,
+                             maxIter=maxIter_vkoga)
+        print(datetime.now().strftime("%H:%M:%S"), name_dataset, 'Standard VKOGA finished.')
+    else:
+        print(datetime.now().strftime("%H:%M:%S"), name_dataset, 'Standard VKOGA was skipped.')
+
+    t_vkoga2_stop = time.time()
+
+
+
+    ## Some analysis: Print timings and error
+    mse_test_error_vkoga1 = np.linalg.norm(model_vkoga1.predict(X_test @ model.A.detach().numpy()) - y_test) ** 2 / y_test.shape[0]
+    if flag_std_vkoga:
+        mse_test_error_vkoga2 = np.linalg.norm(model_vkoga2.predict(X_test) - y_test) ** 2 / y_test.shape[0]
+    else:
+        # standard VKOGA was skipped, so report the error of the zero predictor
+        mse_test_error_vkoga2 = np.linalg.norm(y_test) ** 2 / y_test.shape[0]
+
+
+    print(name_dataset, 'Deep Kernel Optimization     {:.3f}s.'.format(t_optim_stop - t_optim_start))
+    print(name_dataset, 'Deep VKOGA runtime:          {:.3f}s.'.format(t_vkoga1_stop - t_vkoga1_start))
+    print(name_dataset, 'Standard VKOGA runtime:      {:.3f}s.'.format(t_vkoga2_stop - t_vkoga2_start))
+    print(' -----------------------------------------')
+    print(name_dataset, 'Deep kernel test MSE:        {:.3e}'.format(mse_test_error_vkoga1))
+    if flag_std_vkoga:
+        print(name_dataset, 'Standard kernel test MSE:    {:.3e}'.format(mse_test_error_vkoga2))
+    else:
+        print(name_dataset, 'Standard kernel test MSE:    skipped.')
+    print(' ')
+
+    ## Compute (intermediate) test errors (these are MSEs, despite the _rmse names)
+    list_nctrs_rmse = list(np.array(list_nctrs)[np.array(list_nctrs) <= maxIter_vkoga])
+    array_test_rmse = np.zeros((1, len(list_nctrs_rmse)))
+
+    print(datetime.now().strftime("%H:%M:%S"), name_dataset, 'Started computation of test errors.')
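+    # Evaluate the model truncated to its first nctrs centers by cutting the
+    # change-of-basis matrix Cut_ and the coefficients c (nested Newton basis).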
+    for idx_nctrs, nctrs in enumerate(list_nctrs_rmse):  # the stored centers are already in the transformed space, only X_test needs to be mapped by A
+        y_test_pred = kernel.eval(X_test @ model.A.detach().numpy(), model_vkoga1.ctrs_[:nctrs]) \
+                      @ model_vkoga1.Cut_[:nctrs, :nctrs].transpose() @ model_vkoga1.c[:nctrs]
+
+        array_test_rmse[0, idx_nctrs] = \
+            np.linalg.norm(y_test - y_test_pred) ** 2 / y_test.shape[0]
+    print(datetime.now().strftime("%H:%M:%S"), name_dataset, 'Finished computation of test errors.')
+
+
+    ## Some visualizations
+    array_concatenate = np.stack(model.list_parameters, 2)
+    if flag_plots:
+        # Change of parameters of linear layer
+        n_epochs = array_concatenate.shape[-1]
+
+        idx_figure_offset = 0
+        plt.figure(idx_figure_offset + 5)
+        plt.clf()
+        list_legend = []
+        for k in range(array_concatenate.shape[0]):
+            plt.plot(np.arange(1, n_epochs + 1), array_concatenate[k, k, :])
+            list_legend.append('A[{},{}]'.format(k, k))
+        plt.plot(np.arange(1, n_epochs + 1), array_concatenate[0, 1, :], '--')
+        plt.plot(np.arange(1, n_epochs + 1), array_concatenate[1, 0, :], '--')
+        plt.legend(list_legend + ['A[0,1]', 'A[1,0]'])
+        plt.title('matrix entries')
+        plt.xscale('log')
+        plt.xlabel('iterations')
+        plt.draw()
+
+        # Decay of optimization objective
+        plt.figure(idx_figure_offset + 1)
+        plt.clf()
+        plt.plot(np.arange(1, len(model.list_obj) + 1), model.list_obj)
+        plt.xscale('log')
+        plt.yscale('log')
+
+
+        # Decay of vkoga quantities
+        plt.figure(idx_figure_offset + 2)
+        plt.clf()
+        plt.plot(np.arange(1, len(model_vkoga1.train_hist['f']) + 1), model_vkoga1.train_hist['f'], 'm')
+        plt.plot(np.arange(1, len(model_vkoga2.train_hist['f']) + 1), model_vkoga2.train_hist['f'], 'b')
+        plt.plot(np.arange(1, len(model_vkoga1.train_hist['f val']) + 1), model_vkoga1.train_hist['f val'], 'm--')
+        plt.plot(np.arange(1, len(model_vkoga2.train_hist['f val']) + 1), model_vkoga2.train_hist['f val'], 'b--')
+        plt.xscale('log')
+        plt.yscale('log')
+        plt.legend(['2 layer kernel', 'standard'])
+
+        # Plot of predictions
+        plt.figure(idx_figure_offset + 3)
+        plt.clf()
+        plt.plot(y_train, 'k')
+        plt.plot(model_vkoga1.predict(X_train @ model.A.detach().numpy()), 'm')
+        plt.plot(model_vkoga2.predict(X_train), 'b')
+        plt.legend(['true', 'deep kernel', 'standard kernel'])
+        plt.title('Train set predictions')
+
+        plt.figure(idx_figure_offset + 4)
+        plt.clf()
+        plt.plot(y_test, 'k')
+        plt.plot(model_vkoga1.predict(X_test @ model.A.detach().numpy()), 'm')
+        plt.plot(model_vkoga2.predict(X_test), 'b')
+        plt.legend(['true', 'deep kernel', 'standard kernel'])
+        plt.title('Test set predictions')
+
+        plt.figure(idx_figure_offset + 40)
+        plt.clf()
+        # plt.plot(y_test, 'k')
+        plt.plot(model_vkoga1.predict(X_test @ model.A.detach().numpy()).reshape(-1) - y_test.reshape(-1), 'm')
+        plt.plot(model_vkoga2.predict(X_test).reshape(-1) - y_test.reshape(-1), 'b')
+        plt.legend(['deep kernel', 'standard kernel'])
+        plt.title('Test set residuals')
+
+
+
+    data = [X_train, X_test, y_train, y_test]
+
+    return A_start, model.A.detach().numpy(), model, model_vkoga1, model_vkoga2, \
+           data, array_concatenate, array_test_rmse, dic_timings
+
+
+
+def run_cross_validation(name_dataset, data, maxIter_vkoga, flag_gaussian=True, n_cross_val=5,
+                         flag_vkoga_verbose=False):
+
+    # Unpack data, some settings
+    [X_train, X_test, y_train, y_test] = data
+    hyperparameter = dic_hyperparams[name_dataset]
+
+    lower_bound = .05
+    upper_bound = 10
+
+    array_eps = np.logspace(np.log10(lower_bound), np.log10(upper_bound), n_cross_val)
+    list_nctrs_rmse = list(np.array(list_nctrs)[np.array(list_nctrs) <= maxIter_vkoga])
+
+    array_test_rmse = np.zeros((n_cross_val, len(list_nctrs_rmse)))
+
+    # Prepare for computation
+    list_timings = []
+    array_f = np.zeros((maxIter_vkoga, n_cross_val))
+    array_f_val = np.zeros((maxIter_vkoga, n_cross_val))
+
+    for idx_para, para_ep in enumerate(array_eps):
+        # Select kernel
+        if flag_gaussian:
+            kernel = kernels.Gaussian(ep=para_ep / np.sqrt(X_train.shape[1]))
+        else:
+            kernel = kernels.Matern(k=hyperparameter.k_matern, ep=para_ep / np.sqrt(X_train.shape[1]))
+
+        # VKOGA with standard kernel
+        model_vkoga = VKOGA(kernel=kernel, greedy_type='f_greedy', verbose=flag_vkoga_verbose)
+        t_vkoga_start = time.time()
+        _ = model_vkoga.fit(X_train, y_train,
+                            X_val=X_test, y_val=y_test,
+                            maxIter=maxIter_vkoga)
+        t_vkoga_stop = time.time()
+
+
+        array_f[:, idx_para] = model_vkoga.train_hist['f']
+        array_f_val[:, idx_para] = model_vkoga.train_hist['f val']
+
+
+        list_timings.append(t_vkoga_stop - t_vkoga_start)
+
+        print(datetime.now().strftime("%H:%M:%S"), name_dataset, 'kernel_par = {:.2e} run took {:.3f}s.'.format(para_ep, t_vkoga_stop - t_vkoga_start))
+
+        ## Compute (intermediate) test errors (MSE, despite the _rmse name)
+        print(datetime.now().strftime("%H:%M:%S"), name_dataset, 'Started computation of test errors.')
+        for idx_nctrs, nctrs in enumerate(list_nctrs_rmse):
+            y_test_pred = kernel.eval(X_test, model_vkoga.ctrs_[:nctrs]) \
+                          @ model_vkoga.Cut_[:nctrs, :nctrs].transpose() @ model_vkoga.c[:nctrs]
+
+            array_test_rmse[idx_para, idx_nctrs] = \
+                np.linalg.norm(y_test - y_test_pred)**2 / y_test.shape[0]
+        print(datetime.now().strftime("%H:%M:%S"), name_dataset, 'Finished computation of test errors.')
+
+    return array_eps, array_f, array_f_val, array_test_rmse, list_timings
+
diff --git a/utils/optimized_kernel.py b/utils/optimized_kernel.py
new file mode 100644
index 0000000000000000000000000000000000000000..423faf4ed875e5e78c1e508ec15c36d58d26f56d
--- /dev/null
+++ b/utils/optimized_kernel.py
@@ -0,0 +1,117 @@
+from torch import nn
+import torch
+from P36_Francesco_Emma.utilities.cv_rippa_ext import compute_cv_loss_via_rippa_ext_2
+import numpy as np
+
+
+
+
+class OptimizedKernel(torch.nn.Module):
+    '''
+    Class for optimizing a two-layered kernel, i.e. to optimize the first-layer matrix A.
+    '''
+
+    def __init__(self, kernel, dim,
+                 reg_para=1e-5, learning_rate=1e-3, n_epochs=100, batch_size=32, n_folds=None,
+                 flag_initialize_diagonal=False, flag_symmetric_A=False):
+        super().__init__()
+
+        # Some settings, mostly optimization related
+        self.kernel = kernel
+
+        self.dim = dim
+        self.reg_para = reg_para
+        self.learning_rate = learning_rate
+        self.n_epochs = n_epochs
+        self.batch_size = batch_size
+
+        self.flag_symmetric_A = flag_symmetric_A
+
+        # Define the linear map A via B; a tensor passed as flag_initialize_diagonal is used directly as the initialization
+        if torch.is_tensor(flag_initialize_diagonal):
+            self.B = nn.Parameter(flag_initialize_diagonal, requires_grad=True)
+        elif flag_initialize_diagonal:
+            self.B = nn.Parameter(torch.eye(self.dim, self.dim), requires_grad=True)
+        else:
+            self.B = nn.Parameter(torch.rand(self.dim, self.dim), requires_grad=True)
+
+        if self.flag_symmetric_A:
+            self.A = (self.B + self.B.t()) / 2
+        else:
+            self.A = self.B
+
+
+        if n_folds is None:
+            self.n_folds = self.batch_size
+        else:
+            self.n_folds = n_folds
+
+
+        # Set optimizer and scheduler
+        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
+        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=50, gamma=.7)
+
+        # Initialize lists for tracking
+        self.list_obj = []
+        self.list_parameters = []
+
+
+    def optimize(self, X, y, flag_optim_verbose=True):
+
+        assert X.shape[0] == y.shape[0], 'Data sizes do not match'
+        n_batches = X.shape[0] // self.batch_size
+
+        # Append initial parameters
+        if self.flag_symmetric_A:
+            self.list_parameters.append(torch.clone((self.B + self.B.t()) / 2).detach().numpy())
+        else:
+            self.list_parameters.append(torch.clone(self.A).detach().numpy())
+
+
+        for idx_epoch in range(self.n_epochs):
+            shuffle = np.random.permutation(X.shape[0])  # reshuffle the data set every epoch
+
+            list_obj_loc = []
+
+            for idx_batch in range(n_batches):
+
+                # Select minibatch from the data
+                ind = shuffle[idx_batch * self.batch_size : (idx_batch + 1) * self.batch_size]
+                Xb, yb = X[ind, :], y[ind, :]
+
+                # Compute kernel matrix for minibatch
+                kernel_matrix = self.kernel.eval(Xb @ self.A, Xb @ self.A)
+
+                # use cross validation loss via rippa to assess the error
+                optimization_objective, _ = compute_cv_loss_via_rippa_ext_2(
+                    kernel_matrix, yb, self.n_folds, self.reg_para)
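+                # (Rippa-type formulas recover the k-fold cross-validation
+                # residuals from a single solve with the regularized kernel
+                # matrix, without refitting each fold.)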
+
+                # Keep track of optimization quantity within epoch
+                list_obj_loc.append(optimization_objective.detach().item())
+                if idx_epoch == 0 and flag_optim_verbose:
+                    print('First epoch: Iteration {:5d}: Training objective: {:.3e}'.format(
+                        idx_batch, optimization_objective.detach().item()))
+
+                # Gradient step
+                optimization_objective.backward()
+                self.optimizer.step()       # one optimizer update
+                self.optimizer.zero_grad()  # reset the gradients
+
+                if self.flag_symmetric_A:
+                    self.A = (self.B + self.B.t()) / 2
+                else:
+                    self.A = self.B
+
+            # Keep track of some quantities and print the progress
+            mean_obj = np.mean(list_obj_loc)
+
+            if flag_optim_verbose:
+                print('Epoch {:5d} finished, mean training objective: {:.3e}.'.format(
+                    idx_epoch + 1, mean_obj))
+
+            self.list_obj.append(mean_obj)
+            self.list_parameters.append(torch.clone(self.A).detach().numpy())
+
+            # Advance the learning rate schedule once per epoch (the scheduler
+            # created in __init__ would otherwise never be stepped)
+            self.scheduler.step()
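+
+# Minimal usage sketch (illustration only; the kernel choice and data below are
+# assumptions, not the paper's setup):
+# from vkoga import tkernels
+# X, y = torch.rand(200, 3), torch.rand(200, 1)
+# model = OptimizedKernel(kernel=tkernels.Gaussian(ep=1.0), dim=3,
+#                         n_epochs=10, batch_size=32)
+# model.optimize(X, y, flag_optim_verbose=False)
+# A_learned = model.A.detach().numpy()  # optimized first-layer matrix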
+
+
+
diff --git a/utils/tkernels.py b/utils/tkernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..26e26602f18e4819b4ba4ff29f0345e83b26bf95
--- /dev/null
+++ b/utils/tkernels.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+
+# Torch implementation of the kernels.
+import torch
+from abc import ABC, abstractmethod
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+# Abstract kernel
+class Kernel(ABC):
+    @abstractmethod
+    def __init__(self):
+        super().__init__()
+
+    @abstractmethod
+    def eval(self):
+        pass
+
+    @abstractmethod
+    def diagonal(self, X):
+        pass
+
+    @abstractmethod
+    def __str__(self):
+        pass
+
+    @abstractmethod
+    def set_params(self, params):
+        pass
+
+
+# Abstract RBF
+class RBF(Kernel):
+    @abstractmethod
+    def __init__(self):
+        super(RBF, self).__init__()
+
+    def eval(self, x, y):
+        return self.rbf(self.ep,
+                        torch.cdist(x, y))
+
+    def diagonal(self, X):
+        return torch.ones(X.shape[0], 1) * self.rbf(self.ep, torch.tensor(0.0))
+
+    def __str__(self):
+        return self.name + ' [gamma = %2.2e]' % self.ep
+
+    def set_params(self, par):
+        self.ep = par
+
+
+# Implementation of concrete RBFs
+class Gaussian(RBF):
+    def __init__(self, ep=1):
+        self.ep = ep
+        self.name = 'gauss'
+        self.rbf = lambda ep, r: torch.exp(-(ep * r) ** 2)
+
+
+class GaussianTanh(RBF):
+    def __init__(self, ep=1):
+        self.ep = ep
+        self.name = 'gauss_tanh'
+        self.rbf = lambda ep, r: torch.exp(-(ep * torch.tanh(r)) ** 2)
+
+
+class IMQ(RBF):
+    def __init__(self, ep=1):
+        self.ep = ep
+        self.name = 'imq'
+        self.rbf = lambda ep, r: 1. / torch.sqrt(1 + (ep * r) ** 2)
+
+
+class Matern(RBF):
+    def __init__(self, ep=1, k=0):
+        self.ep = ep
+        if k == 0:
+            self.name = 'mat0'
+            self.rbf = lambda ep, r: torch.exp(-ep * r)
+        elif k == 1:
+            self.name = 'mat1'
+            self.rbf = lambda ep, r: torch.exp(-ep * r) * (1 + ep * r)
+        elif k == 2:
+            self.name = 'mat2'
+            self.rbf = lambda ep, r: torch.exp(-ep * r) * (3 + 3 * ep * r + (ep * r) ** 2)
+        elif k == 3:
+            self.name = 'mat3'
+            self.rbf = lambda ep, r: torch.exp(-ep * r) * (15 + 15 * ep * r + 6 * (ep * r) ** 2 + 1 * (ep * r) ** 3)
+        elif k == 4:
+            self.name = 'mat4'
+            self.rbf = lambda ep, r: torch.exp(-ep * r) * (
+                        105 + 105 * ep * r + 45 * (ep * r) ** 2 + 10 * (ep * r) ** 3 + 1 * (ep * r) ** 4)
+        elif k == 5:
+            self.name = 'mat5'
+            self.rbf = lambda ep, r: torch.exp(-ep * r) * (
+                        945 + 945 * ep * r + 420 * (ep * r) ** 2 + 105 * (ep * r) ** 3 + 15 * (ep * r) ** 4 + 1 * (
+                            ep * r) ** 5)
+        elif k == 6:
+            self.name = 'mat6'
+            self.rbf = lambda ep, r: torch.exp(-ep * r) * (
+                        10395 + 10395 * ep * r + 4725 * (ep * r) ** 2 + 1260 * (ep * r) ** 3 + 210 * (
+                            ep * r) ** 4 + 21 * (ep * r) ** 5 + 1 * (ep * r) ** 6)
+        elif k == 7:
+            self.name = 'mat7'
+            self.rbf = lambda ep, r: torch.exp(-ep * r) * (
+                        135135 + 135135 * ep * r + 62370 * (ep * r) ** 2 + 17325 * (ep * r) ** 3 + 3150 * (
+                            ep * r) ** 4 + 378 * (ep * r) ** 5 + 28 * (ep * r) ** 6 + 1 * (ep * r) ** 7)
+        else:
+            self.name = None
+            self.rbf = None
+            raise Exception('This Matern kernel is not implemented')
+
+
+class HatFunction(RBF):
+    def __init__(self, ep=1):
+        self.ep = ep
+        self.rbf = lambda ep, r: torch.clamp(1 - ep*r, min=0)
+        self.name = 'Hat function'
+
+
+class PowerKernels(RBF):
+    def __init__(self, ep=1):
+        self.ep = ep
+        self.rbf = lambda ep, r: torch.abs(r)
+        self.name = 'Power kernel'
+
+
+class Wendland(RBF):
+    def __init__(self, ep=1, k=0, d=1):
+        self.ep = ep
+        self.name = 'wen_' + str(d) + '_' + str(k)
+        l = int(np.floor(d / 2)) + k + 1  # cast to int so the factorials below are well-defined
+        if k == 0:
+            p = lambda r: 1
+        elif k == 1:
+            p = lambda r: (l + 1) * r + 1
+        elif k == 2:
+            p = lambda r: (l + 3) * (l + 1) * r ** 2 + 3 * (l + 2) * r + 3
+        elif k == 3:
+            p = lambda r: (l + 5) * (l + 3) * (l + 1) * r ** 3 + (45 + 6 * l * (l + 6)) * r ** 2 + (
+                        15 * (l + 3)) * r + 15
+        elif k == 4:
+            p = lambda r: (l + 7) * (l + 5) * (l + 3) * (l + 1) * r ** 4 + (
+                        5 * (l + 4) * (21 + 2 * l * (8 + l))) * r ** 3 + (45 * (14 + l * (l + 8))) * r ** 2 + (
+                                      105 * (l + 4)) * r + 105
+        else:
+            raise Exception('This Wendland kernel is not implemented')
+        c = np.prod(np.arange(l + 1, l + 2 * k + 1))  # = (l + 2k)! / l!, avoids np.math (removed in NumPy 2.0)
+        e = l + k
+        self.rbf = lambda ep, r: torch.clamp(1 - ep * r, min=0) ** e * p(ep * r) / c
+
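+# Spot check (illustration only): for d=2, k=1 the construction above gives
+# l=3, e=4, c=20 and p(r)=4r+1, i.e. the classical Wendland function
+# (1 - r)_+^4 (4r + 1), up to the 1/20 normalization.
+def _check_wendland_d2_k1():
+    ker = Wendland(ep=1.0, k=1, d=2)
+    r = torch.linspace(0, 2, 9).reshape(-1, 1)
+    expected = torch.clamp(1 - r, min=0) ** 4 * (4 * r + 1) / 20
+    assert torch.allclose(ker.eval(r, torch.zeros(1, 1)), expected)
+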
+
+
+# A demo usage
+def main():
+    ker = Gaussian()
+
+    x = torch.linspace(-5, 5, 10001)[:, None]
+    y = torch.zeros(1, 1)
+    A = ker.eval(x, y)
+    B = torch.exp(-x ** 2)
+
+    fig = plt.figure(1)
+    fig.clf()
+    ax = fig.gca()
+    ax.plot(x, A)
+    ax.plot(x, B)
+    ax.set_title('A kernel: ' + str(ker))
+    plt.show()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/utils/vkoga.py b/utils/vkoga.py
new file mode 100644
index 0000000000000000000000000000000000000000..1731605560da4189f8cb88ab5961f5bf51742a09
--- /dev/null
+++ b/utils/vkoga.py
@@ -0,0 +1,354 @@
+#!/usr/bin/env python3
+
+from vkoga.kernels import Gaussian
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
+from scipy.spatial import distance_matrix
+    
+# VKOGA implementation
+class VKOGA(BaseEstimator):
+
+    def __init__(self, kernel=Gaussian(), kernel_par=1,
+                 verbose=True, n_report=10,
+                 greedy_type='p_greedy', reg_par=0, restr_par=0, 
+                 tol_f=1e-10, tol_p=1e-10):
+        
+        # Set the verbosity on/off
+        self.verbose = verbose
+        
+        # Set the frequency of report
+        self.n_report = n_report
+        
+        # Set the params defining the method 
+        self.kernel = kernel
+        self.kernel_par = kernel_par
+        self.greedy_type = greedy_type
+        self.reg_par = reg_par
+        self.restr_par = restr_par
+
+        self.flag_val = None
+        
+        # Set the stopping values
+        self.tol_f = tol_f
+        self.tol_p = tol_p
+
+        # Some further settings
+        self.ctrs_ = None
+        self.Cut_ = None
+        self.c = None
+
+
+        # Initialize the convergence history
+        self.train_hist = {}
+        self.train_hist['n'] = []
+        self.train_hist['f'] = []
+        self.train_hist['p'] = []
+        self.train_hist['p selected'] = []              # list of selected power vals
+        self.train_hist['f val'] = []
+        self.train_hist['p val'] = []
+        
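+    # Greedy selection rules: f_greedy picks the point with the largest squared
+    # residual, fp_greedy the largest ratio residual^2 / power value, p_greedy
+    # the largest power value, and 'rand' a uniformly random point.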
+    def selection_rule(self, f, p):
+        if self.restr_par > 0:
+            p_ = np.max(p)
+            restr_idx = np.nonzero(p >= self.restr_par * p_)[0]
+        else:
+            restr_idx = np.arange(len(p))
+
+        f = np.sum(f ** 2, axis=1)
+        if self.greedy_type == 'f_greedy':
+            idx = np.argmax(f[restr_idx])
+            idx = restr_idx[idx]
+            f_max = np.max(f)
+            p_max = np.max(p)
+        elif self.greedy_type == 'fp_greedy':
+            idx = np.argmax(f[restr_idx] / p[restr_idx])
+            idx = restr_idx[idx]
+            f_max = np.max(f)
+            p_max = np.max(p)
+        elif self.greedy_type == 'p_greedy':
+            f_max = np.max(f)
+            idx = np.argmax(p)
+            p_max = p[idx]
+        elif self.greedy_type == 'rand':    # pick a random point (a baseline David Holzmüller asked about)
+            f_max = np.max(f)
+            p_max = np.max(p)
+            idx = np.random.randint(len(p))
+        else:
+            raise ValueError('Unknown greedy_type: {}'.format(self.greedy_type))
+        return idx, f_max, p_max
+
+    def fit(self, X, y, X_val=None, y_val=None, maxIter=None):
+        # Check the datasets
+        X, y = check_X_y(X, y, multi_output=True)
+        if len(y.shape) == 1:
+            y = y[:, None]
+
+        if X_val is None or y_val is None:
+            X_val = None
+            y_val = None
+            self.flag_val = False
+        else:
+            self.flag_val = True
+            X_val, y_val = check_X_y(X_val, y_val, multi_output=True)
+            # We will assume in the following that X_val and X are disjoint
+
+            if len(y_val.shape) == 1:
+                y_val = y_val[:, None]
+
+        # Check whether already fitted
+        if self.ctrs_ is None:
+            self.ctrs_ = np.zeros((0, X.shape[1]))
+            self.Cut_ = np.zeros((0, 0))
+            self.c = np.zeros((0, y.shape[1]))
+
+
+        # Check whether "new X" and previously chosen centers overlap
+        list_truly_new_X = []
+        if self.ctrs_.shape[0] > 0:
+            for idx_x in range(X.shape[0]):
+                if min(np.linalg.norm(self.ctrs_ - X[idx_x, :], axis=1)) < 1e-10:
+                    continue
+                else:
+                    list_truly_new_X.append(idx_x)
+        else:
+            list_truly_new_X = list(range(X.shape[0]))
+        X = X[list_truly_new_X, :]
+        y = y[list_truly_new_X, :]
+
+
+
+        # Initialize the residual and update the given y values by subtracting the current model
+        y = np.array(y)
+        if len(y.shape) == 1:
+            y = y[:, None]
+        y = y - self.predict(X)
+        if self.flag_val:
+            y_val = y_val - self.predict(X_val)
+
+
+        # Get the data dimension
+        N, q = y.shape
+        if self.flag_val:
+            N_val = y_val.shape[0]
+
+
+        # Set the maximal number of iterations: it cannot exceed the number of
+        # new points and defaults to 100
+        if maxIter is None:
+            self.maxIter = min(100, N)
+        else:
+            self.maxIter = min(maxIter, N)
+
+
+        # Check compatibility of restriction
+        if self.greedy_type == 'p_greedy':
+            self.restr_par = 0
+        if not self.reg_par == 0:
+            self.restr_par = 0
+
+
+        # Initialize list for the chosen and non-chosen indices
+        indI_ = []
+        notIndI = list(range(N))
+        c = np.zeros((self.maxIter, q))
+
+
+        # Compute the Newton basis values (related to the old centers) on the new X
+        Vx_new_X_old_ctrs = self.kernel.eval(X, self.ctrs_) @ self.Cut_.transpose()
+        if self.flag_val:
+            Vx_val_new_X_old_ctrs = self.kernel.eval(X_val, self.ctrs_) @ self.Cut_.transpose()
+
+
+        # Initialize arrays for the Newton basis values (related to the new centers) on the new X
+        Vx = np.zeros((N, self.maxIter))
+        if self.flag_val:
+            Vx_val = np.zeros((N_val, self.maxIter))
+
+
+        # Compute the powervals on X and X_val
+        p = self.kernel.diagonal(X) + self.reg_par
+        p = p - np.sum(Vx_new_X_old_ctrs ** 2, axis=1)
+        if self.flag_val:
+            p_val = self.kernel.diagonal(X_val) + self.reg_par
+            p_val = p_val - np.sum(Vx_val_new_X_old_ctrs ** 2, axis=1)
+
+
+        # Extend Cut_ matrix, i.e. continue to build on old self.Cut_ matrix
+        N_ctrs_so_far = self.Cut_.shape[0]
+        Cut_ = np.zeros((N_ctrs_so_far + self.maxIter, N_ctrs_so_far + self.maxIter))
+        Cut_[:N_ctrs_so_far, :N_ctrs_so_far] = self.Cut_
+
+
+        # Iterative selection of new points
+        self.print_message('begin')
+        for n in range(self.maxIter):
+            # prepare
+            self.train_hist['n'].append(self.ctrs_.shape[0] + n + 1)
+            self.train_hist['f'].append([])
+            self.train_hist['p'].append([])
+            self.train_hist['p selected'].append([])
+            if self.flag_val:
+                self.train_hist['p val'].append([])
+                self.train_hist['f val'].append([])
+
+            # select the current index
+            idx, self.train_hist['f'][-1], self.train_hist['p'][-1] = self.selection_rule(y[notIndI], p[notIndI])
+            self.train_hist['p selected'][-1] = p[notIndI[idx]]
+            if self.flag_val:
+                self.train_hist['p val'][-1] = np.max(p_val)
+                self.train_hist['f val'][-1] = np.max(np.sum(y_val ** 2, axis=1))
+
+            # add the current index
+            indI_.append(notIndI[idx])
+
+            # check if the tolerances are reached
+            if self.train_hist['f'][n] <= self.tol_f:
+                n = n - 1
+                self.print_message('end')
+                break
+            if self.train_hist['p'][n] <= self.tol_p:
+                n = n - 1
+                self.print_message('end')
+                break
+
+            # compute the nth basis (including normalization)
+            # ToDo: the old Vx also needs to be subtracted here!
+            Vx[notIndI, n] = self.kernel.eval(X[notIndI, :], X[indI_[n], :])[:, 0]\
+                 - Vx_new_X_old_ctrs[notIndI, :] @ Vx_new_X_old_ctrs[indI_[n], :].transpose()\
+                 - Vx[notIndI, :n+1] @ Vx[indI_[n], 0:n+1].transpose()
+            Vx[indI_[n], n] += self.reg_par
+            Vx[notIndI, n] = Vx[notIndI, n] / np.sqrt(p[indI_[n]])
+
+            if self.flag_val:
+                Vx_val[:, n] = self.kernel.eval(X_val, X[indI_[n], :])[:, 0]\
+                    - Vx_val_new_X_old_ctrs[:, :] @ Vx_new_X_old_ctrs[indI_[n], :].transpose()\
+                    - Vx_val[:, :n+1] @ Vx[indI_[n], 0:n+1].transpose()
+                Vx_val[:, n] = Vx_val[:, n] / np.sqrt(p[indI_[n]])
+
+
+            # update the change of basis
+            Cut_new_row = np.ones(N_ctrs_so_far + n + 1)
+            Cut_new_row[:N_ctrs_so_far + n] = \
+                -np.concatenate((Vx_new_X_old_ctrs[indI_[n], :], Vx[indI_[n], :n])) \
+                @ Cut_[:N_ctrs_so_far + n, :N_ctrs_so_far + n]
+            Cut_[N_ctrs_so_far + n, :N_ctrs_so_far + n + 1] = Cut_new_row / Vx[indI_[n], n]
+
+            # compute the nth coefficient
+            c[n] = y[indI_[n]] / np.sqrt(p[indI_[n]])
+
+            # update the power function
+            p[notIndI] = p[notIndI] - Vx[notIndI, n] ** 2
+            if self.flag_val:
+                p_val = p_val - Vx_val[:, n] ** 2
+
+            # update the residual
+            y[notIndI] = y[notIndI] - Vx[notIndI, n][:, None] * c[n]
+            if self.flag_val:
+                y_val = y_val - Vx_val[:, n][:, None] * c[n]
+
+            # remove the nth index from the dictionary
+            notIndI.pop(idx)
+
+            # Report some data every now and then
+            if n % self.n_report == 0:
+                self.print_message('track')
+
+        else:
+            self.print_message('end')
+
+        # Define coefficients and centers
+        self.c = np.concatenate((self.c, c[:n + 1]))
+        self.Cut_ = Cut_[:N_ctrs_so_far + n + 1, :N_ctrs_so_far + n + 1]
+        self.indI_ = indI_[:n + 1]     # Mind: These are only the indices of the latest points
+        self.coef_ = self.Cut_.transpose() @ self.c
+        self.ctrs_ = np.concatenate((self.ctrs_, X[self.indI_, :]), axis=0)
+
+
+        return self
+
+
+    def predict(self, X):
+        # Check if fit has been called
+        # check_is_fitted(self, 'coef_')     # ToDo: Remove this one!
+
+        # Validate the input
+        X = check_array(X)
+
+        # Compute the prediction (ToDo: replace with eval prod)
+        if self.ctrs_.shape[0] > 0:
+            prediction = self.kernel.eval(X, self.ctrs_) @ self.coef_
+        else:
+            prediction = np.zeros((X.shape[0], 1))
+
+        return prediction
+
+    def predict_P(self, X, n=None):
+        # Prediction of the power function, copied from pgreedy.py
+
+        # Try to do nothing
+        if self.ctrs_ is None or n == 0:
+            return self.kernel.diagonal(X)
+
+        # Otherwise check if everything is ok
+        # Check if fit has been called
+        check_is_fitted(self, 'ctrs_')
+        # Validate the input
+        X = check_array(X)
+
+        # Decide how many centers to use
+        if n is None:
+            n = np.atleast_2d(self.ctrs_).shape[0]
+
+        # Evaluate the power function on the input
+        p = self.kernel.diagonal(X) - np.sum(
+            (self.kernel.eval(X, np.atleast_2d(self.ctrs_)[:n]) @ self.Cut_[:n, :n].transpose()) ** 2, axis=1)
+
+        return p
+
+    def print_message(self, when):
+        if self.verbose and when == 'begin':
+            print('')
+            print('*' * 30 + ' [VKOGA] ' + '*' * 30)
+            print('Training model with')
+            print('       |_ kernel              : %s' % self.kernel)
+            print('       |_ regularization par. : %2.2e' % self.reg_par)
+            print('       |_ restriction par.    : %2.2e' % self.restr_par)
+            print('')
+            
+        if self.verbose and when == 'end':
+            print('Training completed with')
+            print('       |_ selected points     : %8d / %8d' % (self.train_hist['n'][-1], self.ctrs_.shape[0] + self.maxIter))
+            if self.flag_val:
+                print('       |_ train, val residual : %2.2e / %2.2e,    %2.2e' %
+                      (self.train_hist['f'][-1], self.tol_f, self.train_hist['f val'][-1]))
+                print('       |_ train, val power fun: %2.2e / %2.2e,    %2.2e' %
+                      (self.train_hist['p'][-1], self.tol_p, self.train_hist['p val'][-1]))
+            else:
+                print('       |_ train residual      : %2.2e / %2.2e' % (self.train_hist['f'][-1], self.tol_f))
+                print('       |_ train power fun     : %2.2e / %2.2e' % (self.train_hist['p'][-1], self.tol_p))
+                        
+        if self.verbose and when == 'track':
+            print('Training ongoing with')
+            print('       |_ selected points     : %8d / %8d' % (self.train_hist['n'][-1], self.ctrs_.shape[0] + self.maxIter))
+            if self.flag_val:
+                print('       |_ train, val residual : %2.2e / %2.2e,    %2.2e' %
+                      (self.train_hist['f'][-1], self.tol_f, self.train_hist['f val'][-1]))
+                print('       |_ train, val power fun: %2.2e / %2.2e,    %2.2e' %
+                      (self.train_hist['p'][-1], self.tol_p, self.train_hist['p val'][-1]))
+            else:
+                print('       |_ train residual      : %2.2e / %2.2e' % (self.train_hist['f'][-1], self.tol_f))
+                print('       |_ train power fun     : %2.2e / %2.2e' % (self.train_hist['p'][-1], self.tol_p))
+
+
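+# Minimal usage sketch (illustration only; the data here is synthetic):
+# model = VKOGA(kernel=Gaussian(ep=1.0), greedy_type='f_greedy', verbose=False)
+# X = np.random.rand(500, 2)
+# y = np.sin(4 * X[:, [0]]) + X[:, [1]] ** 2
+# model.fit(X, y, maxIter=50)
+# y_pred = model.predict(X)   # evaluates on the selected centers in model.ctrs_
+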
+
+#%% Utilities to save and load (fitted) models via pickle
+import pickle
+def save_object(obj, filename):
+    with open(filename, 'wb') as output:
+        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
+
+def load_object(filename):
+    with open(filename, 'rb') as infile:  # avoid shadowing the builtin 'input'
+        obj = pickle.load(infile)
+    return obj
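+
+# Example round trip (illustration only; the filename is arbitrary):
+# save_object(model, 'vkoga_model.pkl')
+# model_reloaded = load_object('vkoga_model.pkl')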
+
+
+