From 2bb57fcc7ec8e700c59fbc0880a2cc30d9a45253 Mon Sep 17 00:00:00 2001
From: Tizian Wenzel <wenzeltn@nbanm02.mathematik.uni-stuttgart.de>
Date: Thu, 8 Jun 2023 13:57:01 +0200
Subject: [PATCH] Rename utils to utils_code and add utils_data download/preprocessing utilities.

---
 download_data.py                            |   5 +
 section_4.2_compute.py                      |   4 +-
 setup.sh                                    |   7 +-
 {utils => utils_code}/cv_rippa_ext.py       |   0
 {utils => utils_code}/dataset_collection.py |   0
 {utils => utils_code}/hyperparameters.py    |   0
 {utils => utils_code}/kernels.py            |   0
 {utils => utils_code}/main_function.py      |  10 +-
 {utils => utils_code}/optimized_kernel.py   |   0
 {utils => utils_code}/tkernels.py           |   0
 {utils => utils_code}/vkoga.py              |   2 +-
 utils_data/README.md                        |   7 +
 utils_data/custom_paths.py                  |  26 ++
 utils_data/data.py                          |  35 ++
 utils_data/download_data.py                 | 446 ++++++++++++++++++++
 utils_data/utils.py                         | 303 +++++++++++++
 16 files changed, 831 insertions(+), 14 deletions(-)
 create mode 100644 download_data.py
 rename {utils => utils_code}/cv_rippa_ext.py (100%)
 rename {utils => utils_code}/dataset_collection.py (100%)
 rename {utils => utils_code}/hyperparameters.py (100%)
 rename {utils => utils_code}/kernels.py (100%)
 rename {utils => utils_code}/main_function.py (98%)
 rename {utils => utils_code}/optimized_kernel.py (100%)
 rename {utils => utils_code}/tkernels.py (100%)
 rename {utils => utils_code}/vkoga.py (99%)
 create mode 100644 utils_data/README.md
 create mode 100644 utils_data/custom_paths.py
 create mode 100644 utils_data/data.py
 create mode 100644 utils_data/download_data.py
 create mode 100644 utils_data/utils.py

diff --git a/download_data.py b/download_data.py
new file mode 100644
index 0000000..2ec0e48
--- /dev/null
+++ b/download_data.py
@@ -0,0 +1,5 @@
+from utils_data.download_data import import_all
+
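+# Running this script downloads all raw data sets (UCI, OpenML, sarcos) and writes the preprocessed
+# data to the data path configured in utils_data/custom_paths.py.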
+if __name__ == '__main__':
+    import_all()
+    pass
\ No newline at end of file
diff --git a/section_4.2_compute.py b/section_4.2_compute.py
index d40c028..a66082e 100644
--- a/section_4.2_compute.py
+++ b/section_4.2_compute.py
@@ -4,8 +4,8 @@
 # sets", especially to produce the results for the Figures 3-5.
 
 
-from utils.main_function import run_everything, run_cross_validation
-from utils.hyperparameters import dic_hyperparams
+from utils_code.main_function import run_everything, run_cross_validation
+from utils_code.hyperparameters import dic_hyperparams
 
 from scipy import io
 import os
diff --git a/setup.sh b/setup.sh
index ddd3cd6..3b14bc0 100755
--- a/setup.sh
+++ b/setup.sh
@@ -16,12 +16,7 @@ source venv/bin/activate
 pip install --upgrade pip
 pip3 install --ignore-installed torch==1.9.1+cpu torchvision==0.10.1+cpu torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
 
-pip3 install scipy==1.7.3
-pip3 install scikit-learn==1.0.2
-
-
-# Install further stuff
-pip install matplotlib
+pip3 install -r requirements.txt
 
 
 # pip freeze > requirements.txt	
diff --git a/utils/cv_rippa_ext.py b/utils_code/cv_rippa_ext.py
similarity index 100%
rename from utils/cv_rippa_ext.py
rename to utils_code/cv_rippa_ext.py
diff --git a/utils/dataset_collection.py b/utils_code/dataset_collection.py
similarity index 100%
rename from utils/dataset_collection.py
rename to utils_code/dataset_collection.py
diff --git a/utils/hyperparameters.py b/utils_code/hyperparameters.py
similarity index 100%
rename from utils/hyperparameters.py
rename to utils_code/hyperparameters.py
diff --git a/utils/kernels.py b/utils_code/kernels.py
similarity index 100%
rename from utils/kernels.py
rename to utils_code/kernels.py
diff --git a/utils/main_function.py b/utils_code/main_function.py
similarity index 98%
rename from utils/main_function.py
rename to utils_code/main_function.py
index c1379c9..2a2b318 100644
--- a/utils/main_function.py
+++ b/utils_code/main_function.py
@@ -1,16 +1,16 @@
 
 
-from utils.optimized_kernel import OptimizedKernel
-from utils.dataset_collection import Dataset
-from utils.hyperparameters import dic_hyperparams
+from utils_code.optimized_kernel import OptimizedKernel
+from utils_code.dataset_collection import Dataset
+from utils_code.hyperparameters import dic_hyperparams
 
-from utils import tkernels, kernels
+from utils_code import tkernels, kernels
 import torch
 from matplotlib import pyplot as plt
 import numpy as np
 import time
 
-from utils.vkoga import VKOGA
+from utils_code.vkoga import VKOGA
 import os
 
 from datetime import datetime
diff --git a/utils/optimized_kernel.py b/utils_code/optimized_kernel.py
similarity index 100%
rename from utils/optimized_kernel.py
rename to utils_code/optimized_kernel.py
diff --git a/utils/tkernels.py b/utils_code/tkernels.py
similarity index 100%
rename from utils/tkernels.py
rename to utils_code/tkernels.py
diff --git a/utils/vkoga.py b/utils_code/vkoga.py
similarity index 99%
rename from utils/vkoga.py
rename to utils_code/vkoga.py
index 5e7fd4b..28e54e5 100644
--- a/utils/vkoga.py
+++ b/utils_code/vkoga.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-from utils.kernels import Gaussian
+from utils_code.kernels import Gaussian
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
diff --git a/utils_data/README.md b/utils_data/README.md
new file mode 100644
index 0000000..1f934de
--- /dev/null
+++ b/utils_data/README.md
@@ -0,0 +1,7 @@
+Code taken from https://github.com/dholzmueller/bmdal_reg/tree/main
+Copyright by David Holzmüller
+
+
+
+
+
diff --git a/utils_data/custom_paths.py b/utils_data/custom_paths.py
new file mode 100644
index 0000000..587e94b
--- /dev/null
+++ b/utils_data/custom_paths.py
@@ -0,0 +1,26 @@
+# This file configures where data, results, plots, etc. are saved.
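+# The default values are relative paths, which are resolved against the current working directory.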
+class CustomPaths:
+    # path where downloaded data sets will be saved
+    data_path = 'data'
+    # path where benchmark results will be saved
+    results_path = 'results'
+    # path where plots and tables will be saved
+    plots_path = 'plots'
+    # path where benchmark results can be cached in a more efficient format such that they load faster
+    cache_path = 'cache'
+
+
+def get_data_path():
+    return CustomPaths.data_path
+
+
+def get_results_path():
+    return CustomPaths.results_path
+
+
+def get_plots_path():
+    return CustomPaths.plots_path
+
+
+def get_cache_path():
+    return CustomPaths.cache_path
diff --git a/utils_data/data.py b/utils_data/data.py
new file mode 100644
index 0000000..4858363
--- /dev/null
+++ b/utils_data/data.py
@@ -0,0 +1,35 @@
+from pathlib import Path
+from typing import *
+
+from utils_data import custom_paths
+from utils_data import utils
+
+
+class DataInfo:
+    """
+    Represents information about a data set.
+    """
+    def __init__(self, ds_name: str, n_tvp: int, n_test: int, n_features: int,
+                 train_test_split: Optional[int]):
+        """
+        :param ds_name: Name of the data set.
+        :param n_tvp: Number of train+val+pool samples.
+        :param n_test: Number of test samples.
+        :param n_features: Number of input features of the data set.
+        :param train_test_split: Set to None if the data set does not have a fixed (train+val+pool)-test split.
+        If this is an int, it is interpreted such that the train+val+pool set consists of the first
+        train_test_split samples and the test set of the remaining samples.
+        """
+        # tvp = train+val+pool
+        self.ds_name = ds_name
+        self.n_features = n_features
+        self.n_tvp = n_tvp
+        self.n_test = n_test
+        self.n_samples = n_tvp + n_test
+        self.train_test_split = train_test_split
+
+    def save(self):
+        """
+        Saves this object to the path of the corresponding data set.
+        """
+        utils.serialize(Path(custom_paths.get_data_path()) / 'data' / self.ds_name / 'data_info.pkl', self)
\ No newline at end of file
diff --git a/utils_data/download_data.py b/utils_data/download_data.py
new file mode 100644
index 0000000..79497a0
--- /dev/null
+++ b/utils_data/download_data.py
@@ -0,0 +1,446 @@
+import pandas as pd
+import numpy as np
+import requests
+import shutil
+from typing import *
+import openml
+import mat4py
+
+from utils_data import custom_paths
+from utils_data import utils
+from utils_data.data import DataInfo
+
+
+
+def download_if_not_exists(url: str, dest: str):
+    """
+    Simple function for downloading a file from a URL if no file exists at the destination path.
+    :param url: URL of the file to download.
+    :param dest: Path where to save the downloaded file.
+    """
+    # following https://dzone.com/articles/simple-examples-of-downloading-files-using-python
+    utils.ensureDir(dest)
+    if not utils.existsFile(dest):
+        print('Downloading ' + url, flush=True)
+        # file = requests.get(url)
+        # open(dest, 'wb').write(file.content)
+        r = requests.get(url, stream=True)
+        with open(dest, 'wb') as f:
+            print('Progress (dot = 1 MB): ', end='', flush=True)
+            for ch in r.iter_content(chunk_size=1024**2):
+                print('.', end='', flush=True)
+                f.write(ch)
+            print(flush=True)
+
+
+class PandasTask:
+    """
+    This class represents a task (a data set with a designated target variable) given by Pandas DataFrames.
+    Additionally, a dedicated train-test split can be specified,
+    and the name of the data set needs to be specified for saving.
+    The class provides a variety of methods for preprocessing the task in different ways.
+    """
+    def __init__(self, x_df: pd.DataFrame, y_df: pd.Series, ds_name: str, cat_indicator: Optional[List[bool]] = None,
+                 train_test_split: Optional[int] = None):
+        """
+        :param x_df: DataFrame containing the inputs (covariates).
+        :param y_df: pd.Series containing the targets.
+        :param ds_name: Name for saving the data set.
+        :param cat_indicator: Optional. A list of booleans indicating for each column of x_df whether it is
+        categorical (True) or not (False).
+        If omitted, the column dtypes of x_df are used to decide whether a column is categorical.
+        :param train_test_split: Optional. An integer can be specified as the index of the first test sample,
+        if the data set has a dedicated test set part at the end.
+        """
+        if cat_indicator is None:
+            cat_indicator = [not pd.api.types.is_numeric_dtype(x_df[x_df.columns[i]]) for i in range(len(x_df.columns))]
+        else:
+            # this is a fix coming from a different codebase
+            # because category_indicator[0] was False for the dataset MIP-2016-regression
+            # despite the column being categorical  (dtype=object)
+            cat_indicator = [v or not pd.api.types.is_numeric_dtype(x_df[x_df.columns[i]])
+                             for i, v in enumerate(cat_indicator)]
+        if len(x_df.columns) != len(cat_indicator):
+            raise ValueError('x.shape[1] != len(cat_indicator)')
+
+        self.x_df = x_df  # should be (sparse) pd.DataFrame
+        # should be (sparse) pd.Series  (i.e. a single column of a DataFrame)
+        self.y_df = y_df
+        self.ds_name = ds_name
+
+        self.cat_cols = [x_df.columns[i] for i in range(len(x_df.columns)) if cat_indicator[i]]
+        self.cont_cols = [x_df.columns[i] for i in range(len(x_df.columns)) if not cat_indicator[i]]
+        self.train_test_split = train_test_split
+
+    def get_n_samples(self):
+        """
+        :return: Returns the number of samples (number of rows in the DataFrame).
+        """
+        return len(self.x_df)
+
+    def remove_missing_cont(self):
+        """
+        Removes rows with missing values in continuous columns.
+        """
+        print('removing rows with missing continuous values')
+        if len(self.cont_cols) == 0:
+            return  # no continuous columns
+
+        # only check the continuous columns; NaN values in categorical columns are handled later
+        # by dummy_na=True in one_hot_encode()
+        not_nan_rows = self.x_df[self.cont_cols].notna().all(axis=1)
+        self.x_df = self.x_df.loc[not_nan_rows, :]
+        self.y_df = self.y_df.loc[not_nan_rows]
+
+    def normalize_regression_y(self):
+        """
+        Centers and standardizes the target variable.
+        """
+        print('normalizing regression y')
+        y_np = np.asarray(self.y_df)
+        self.y_df.loc[:] = (y_np - np.mean(y_np)) / (np.std(y_np) + 1e-30)
+
+    def subsample_dfs_(self, dfs: List[pd.DataFrame], max_n_samples: int) -> List[pd.DataFrame]:
+        """
+        Internal method for jointly subsampling multiple Pandas DataFrames with the same number of rows.
+        :param dfs: DataFrames to subsample jointly.
+        :param max_n_samples: Maximum number of remaining rows.
+        :return: Returns a List of potentially subsampled Pandas DataFrames.
+        """
+        if len(dfs[0]) <= max_n_samples:
+            return dfs
+        print(f'subsampling from {len(dfs[0])} samples to {max_n_samples}')
+        idxs = np.random.default_rng(12345).permutation(len(dfs[0]))[:max_n_samples]
+        return [df.iloc[idxs] for df in dfs]
+
+    def subsample(self, max_tvp_samples: int, max_test_samples: int):
+        """
+        Subsamples the data set if necessary so that it does not exceed the given maximum sizes.
+        :param max_tvp_samples: Maximum number of train+val+pool samples.
+        :param max_test_samples: Maximum number of test samples.
+        """
+        if self.train_test_split is not None:
+            dfs_train = self.subsample_dfs_([self.x_df.loc[:self.train_test_split],
+                                             self.y_df.loc[:self.train_test_split]], max_n_samples=max_tvp_samples)
+            dfs_test = self.subsample_dfs_([self.x_df.loc[self.train_test_split:],
+                                            self.y_df.loc[self.train_test_split:]], max_n_samples=max_test_samples)
+            self.train_test_split = len(dfs_train[0])
+            self.x_df = pd.concat([dfs_train[0], dfs_test[0]], axis=0)
+            self.y_df = pd.concat([dfs_train[1], dfs_test[1]], axis=0)
+        else:
+            dfs = self.subsample_dfs_([self.x_df, self.y_df], max_n_samples=max_tvp_samples + max_test_samples)
+            self.x_df, self.y_df = dfs[0], dfs[1]
+
+    def remove_constant_columns(self):
+        """
+        Removes columns with only a single value (this could happen after removing NaN values).
+        """
+        # taken from https://stackoverflow.com/questions/20209600/pandas-dataframe-remove-constant-column
+        non_constant_columns = (self.x_df != self.x_df.iloc[0]).any()
+        print(f'removing constant columns')
+        self.x_df = self.x_df.loc[:, non_constant_columns]
+        self.cat_cols = [key for key in self.cat_cols if key in self.x_df.columns]
+        self.cont_cols = [key for key in self.cont_cols if key in self.x_df.columns]
+
+    def one_hot_encode(self, max_one_hot_columns: int):
+        """
+        Applies one-hot encoding to categorical columns.
+        :param max_one_hot_columns: Maximal number of allowed one-hot encoded columns.
+        If more one-hot encoded columns would be generated,
+        the categorical columns with the largest number of categories are not one-hot encoded.
+        """
+        cat_cols_with_size = [(col_name, self.x_df.loc[:, col_name].nunique())
+                              for i, col_name in enumerate(self.cat_cols)]
+        if len(cat_cols_with_size) == 0:
+            return  # nothing to encode
+        print('one-hot encoding columns')
+        cat_cols_with_size.sort(key=lambda t: t[1])  # sort by size of category
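+        # determine the largest category size that still fits the column budget: accumulate the sorted
+        # category sizes and stop once max_one_hot_columns would be exceeded; larger categorical
+        # columns are then left un-encoded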
+        max_cat_size = cat_cols_with_size[-1][1]
+        new_col_sum = 0
+        for key, sz in cat_cols_with_size:
+            new_col_sum += sz
+            if new_col_sum > max_one_hot_columns:
+                max_cat_size = sz-1
+                break
+
+        new_cat_cols = []
+
+        for key, sz in cat_cols_with_size:
+            if sz <= max_cat_size:
+                print(f'one-hot encoding column {key} with {sz} unique elements')
+                # following https://stackoverflow.com/questions/37292872/how-can-i-one-hot-encode-in-python
+                # https://queirozf.com/entries/one-hot-encoding-a-feature-on-a-pandas-dataframe-an-example
+                col = self.x_df[key].astype('category')
+                dummies = pd.get_dummies(col, prefix=f'{key}_onehot_', drop_first=(sz == 2), dummy_na=True,
+                                         dtype=np.int32)
+                self.cont_cols.extend(list(dummies.columns))
+                self.x_df.drop([key], axis=1, inplace=True)
+                self.x_df = pd.concat([self.x_df, dummies], axis=1)
+            else:
+                new_cat_cols.append(key)
+                print(f'categorical column {key} with {sz} unique values is not one-hot encoded due to size constraints')
+
+        self.cat_cols = new_cat_cols
+
+    def save(self, n_test: int):
+        """
+        Saves the data set in the folder f'{custom_paths.get_data_path()}/data/{self.ds_name}'.
+        :param n_test: Desired number of test samples.
+        """
+        folder = f'{custom_paths.get_data_path()}/data/{self.ds_name}'
+        x_cont = np.array(self.x_df.reindex(columns=self.cont_cols), dtype=np.float32)
+        y = np.array(self.y_df, dtype=np.float32)
+        n_tvp = self.get_n_samples() - n_test
+        data_info = DataInfo(ds_name=self.ds_name, n_tvp=n_tvp, n_test=n_test,
+                             n_features=x_cont.shape[1],
+                             train_test_split=self.train_test_split)
+
+        utils.serialize(f'{folder}/data_info.pkl', data_info)
+        np.save(f'{folder}/X.npy', x_cont)
+        np.save(f'{folder}/y.npy', y[:, None])
+
+        # ds = DictDataset({'x_cont': torch.as_tensor(x_cont), 'x_cat': torch.as_tensor(x_cat),
+        #                   'y': torch.as_tensor(y[:, None])},
+        #                  {'x_cont': TensorInfo(feat_shape=[x_cont.shape[-1]]),
+        #                   'x_cat': TensorInfo(cat_sizes=cat_sizes),
+        #                   'y': TensorInfo(cat_sizes=[self.get_n_classes()])})
+        # task_info = TaskInfo.from_ds(task_desc, ds)
+        # return Task(task_info, ds)
+
+    @staticmethod
+    def from_openml_task_id(task_id: int, ds_name: str):
+        """
+        Creates a PandasTask from an OpenML task.
+        If more options are needed, use PandasTask.from_openml_dataset_id() instead.
+        :param task_id: OpenML task id.
+        :param ds_name: Short name of the data set for saving.
+        :return: Returns a PandasTask representing the OpenML task.
+        """
+        task = openml.tasks.get_task(task_id, download_data=False)
+        return PandasTask.from_openml_dataset_id(task.dataset_id, ds_name, task.target_name)
+
+    @staticmethod
+    def from_openml_dataset_id(dataset_id: int, ds_name: str, target: str,
+                               ignore_columns: Optional[List[str]] = None,
+                               use_log_target: bool = False):
+        """
+        Creates a PandasTask from an OpenML data set.
+        :param dataset_id: OpenML data set id.
+        :param ds_name: Short name of the data set for saving.
+        :param target: Name of the target variable.
+        :param ignore_columns: Optional. List of columns that should be removed.
+        :param use_log_target: Whether the logarithm should be applied to the target column.
+        :return: Returns a PandasTask representing the corresponding OpenML data set.
+        """
+        print(f'Importing dataset {ds_name}')
+        openml.config.set_cache_directory('./openml_cache')
+        dataset = openml.datasets.get_dataset(dataset_id, download_data=False)
+        print(f'dataset name: {dataset.name}')
+        # print(dataset.get_data(dataset_format='dataframe'))
+        x_df, y_df, cat_indicator, names = dataset.get_data(target=target,
+                                                            dataset_format='dataframe')
+
+        if ignore_columns is not None:
+            cat_indicator = [value for col_name, value in zip(x_df.columns, cat_indicator)
+                                  if col_name not in ignore_columns]
+
+        for key in ignore_columns or []:
+            x_df.drop([key], axis=1, inplace=True)
+
+        if use_log_target:
+            y_df = np.log(y_df)
+
+        print('Imported x_df:\n', x_df)
+        print('Imported y_df:\n', y_df)
+
+        return PandasTask(x_df, y_df, ds_name, cat_indicator)
+
+    @staticmethod
+    def from_uci(url: str, ds_name: str, zip_name: str, csv_name: str, target_col_idxs: List[int],
+                 ignore_col_idxs: Optional[List[int]] = None, use_log_target: bool = False,
+                 train_test_boundary: Optional[int] = None, has_header: bool = True,
+                 continuous_nan_columns: Optional[List[int]] = None,
+                 convert_to_cat_columns: Optional[List[int]] = None,
+                 ignore_other_csv_files: bool = False, separator: Optional[str] = None):
+        """
+        Create a PandasTask object from a data set on the UCI repository.
+        :param url: URL of the data set file.
+        :param ds_name: Short name of the data set used for saving the data set.
+        :param zip_name: Target name of the downloaded file.
+        If the downloaded file is not zip/compressed, i.e., no unzipping is needed,
+        zip_name should be the same as csv_name.
+        Otherwise, the archive named zip_name is unpacked, which must yield the file named csv_name.
+        :param csv_name: Target name of the uncompressed file, see zip_name.
+        :param target_col_idxs: List of indexes of target columns. Mostly, this will only have one element.
+        :param ignore_col_idxs: List of indexes of columns to be removed.
+        :param use_log_target: Whether the logarithm should be applied to the target value.
+        :param train_test_boundary: Index of the first test sample.
+        If (as in most cases) there is no dedicated test set, None should be specified.
+        :param has_header: Whether the downloaded (unzipped) csv file has a header row with column names (which is then not treated as data).
+        :param continuous_nan_columns: Optional. List of indexes of numeric columns that can contain NaN values.
+        This triggers a corresponding Pandas column conversion.
+        :param convert_to_cat_columns: Optional.
+        List of indexes of columns that should be converted to categorical type.
+        :param ignore_other_csv_files: If set to True,
+        unzipped csv files with file names other than csv_name will be ignored.
+        If set to False, unzipped csv files with file names other than csv_name
+        will be interpreted as a dedicated test set,
+        and the file with name csv_name will be interpreted as the train+val+pool set.
+        :param separator: Separator in the csv file. Default is ','.
+        If the file with name csv_name is a tsv file, separator='\t' should be specified.
+        :return: Returns a PandasTask object.
+        """
+        print(f'Importing dataset {ds_name}')
+        base_path = custom_paths.get_data_path()
+        raw_data_folder = f'{base_path}/raw_data/{ds_name}'
+        zip_file = f'{raw_data_folder}/{zip_name}'
+        csv_file = f'{raw_data_folder}/{csv_name}'
+        download_if_not_exists(url, zip_file)
+        if not utils.existsFile(csv_file):
+            print('Unpacking zip file...')
+            shutil.unpack_archive(zip_file, raw_data_folder)
+
+        if separator is None:
+            separator = ','
+
+        print('Processing csv data...')
+        if ignore_other_csv_files:
+            non_train_files = []
+        else:
+            non_train_files = [file for file in utils.matchFiles(f'{raw_data_folder}/*.csv') if file != csv_file]
+        df = pd.read_csv(csv_file, header='infer' if has_header else None, sep=separator)
+        if len(non_train_files) > 0:
+            train_test_boundary = len(df)
+            df = pd.concat([df] + [pd.read_csv(file, header='infer' if has_header else None, sep=separator)
+                                     for file in non_train_files])
+
+        if continuous_nan_columns is not None:
+            for col_idx in continuous_nan_columns:
+                df.iloc[:, col_idx] = pd.to_numeric(df.iloc[:, col_idx], errors='coerce')
+
+        if convert_to_cat_columns is not None:
+            for col_idx in convert_to_cat_columns:
+                df.iloc[:, col_idx] = df.iloc[:, col_idx].astype('category')
+
+        input_column_names = []
+        target_columns = []
+        for i in range(len(df.columns)):
+            if i in target_col_idxs:
+                target_columns.append(df.iloc[:, i].to_numpy().astype(np.float32))
+            elif ignore_col_idxs is None or i not in ignore_col_idxs:
+                input_column_names.append(df.columns[i])
+
+        y = np.median(np.stack(target_columns, axis=1), axis=1)
+        if use_log_target:
+            y = np.log(y)
+
+        # https://stackoverflow.com/questions/20490274/how-to-reset-index-in-a-pandas-dataframe
+        x_df = df.reindex(columns=input_column_names).reset_index(drop=True)
+        y_df = pd.DataFrame({'y': y})['y']
+
+        return PandasTask(x_df, y_df, ds_name,
+                          train_test_split=train_test_boundary)
+
+
+class PandasTaskPreprocessor:
+    """
+    This class preprocesses data sets given by PandasTask objects. Various options can be configured.
+    """
+    def __init__(self, min_n_samples: int, max_tvp_samples: int, max_test_samples: int, max_one_hot_columns: int):
+        """
+        :param min_n_samples: Minimum number of samples that a task must have
+        after removing missing values in continuous columns.
+        :param max_tvp_samples: Maximum number of samples for the train+val+pool sets.
+        :param max_test_samples: Maximum number of samples for the test set.
+        :param max_one_hot_columns: Maximum number of one-hot encoded columns that are allowed.
+        If more would be generated, the categorical variables with the largest category sizes are removed.
+        """
+        self.min_n_samples = min_n_samples
+        self.max_tvp_samples = max_tvp_samples
+        self.max_test_samples = max_test_samples
+        self.max_one_hot_columns = max_one_hot_columns
+
+    def apply(self, pd_task: PandasTask):
+        """
+        Apply preprocessing to a PandasTask with the options given in the constructor
+        and save the preprocessed data set under the name specified in the PandasTask.
+        :param pd_task: PandasTask object holding information about the unprocessed data set.
+        """
+        pd_task.remove_missing_cont()
+        if pd_task.get_n_samples() < self.min_n_samples:
+            print(f'Task {pd_task.ds_name} has only {pd_task.get_n_samples()} samples after removing missing values, '
+                  f'hence we discard it')
+            return
+        pd_task.subsample(max_tvp_samples=self.max_tvp_samples, max_test_samples=self.max_test_samples)
+        pd_task.remove_constant_columns()
+        pd_task.one_hot_encode(self.max_one_hot_columns)
+        pd_task.normalize_regression_y()
+        n_samples = pd_task.get_n_samples()
+        if pd_task.train_test_split is not None:
+            n_test = n_samples - pd_task.train_test_split
+        else:
+            n_test = max(int(0.2 * n_samples), n_samples - self.max_tvp_samples)
+        pd_task.save(n_test)
+
+
+def get_sarcos_pandas_task() -> PandasTask:
+    """
+    This is a separate function for downloading the sarcos data set, since it is not in the UCI / OpenML repositories.
+    """
+    print(f'Importing dataset sarcos')
+    base_path = custom_paths.get_data_path()
+    raw_data_folder = f'{base_path}/raw_data/sarcos'
+    file_path = f'{raw_data_folder}/sarcos_inv.mat'
+    download_if_not_exists('http://www.gaussianprocess.org/gpml/data/sarcos_inv.mat', file_path)
+    # don't download test data since test_data = train_data[::10]
+    # download_if_not_exists('http://www.gaussianprocess.org/gpml/data/sarcos_inv_test.mat',
+    #                        f'{raw_data_folder}/sarcos_inv_test.mat')
+    data = np.array(mat4py.loadmat(file_path)['sarcos_inv'])
+    return PandasTask(x_df=pd.DataFrame(data[:, :-7]), y_df=pd.Series(data[:, -7]), ds_name='sarcos')
+
+
+def import_all():
+    proc = PandasTaskPreprocessor(min_n_samples=30000, max_tvp_samples=200000, max_test_samples=300000,
+                                  max_one_hot_columns=300)
+
+    proc.apply(PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/00440/sgemm_product_dataset.zip',
+                   ds_name='sgemm', zip_name='sgemm_product_dataset.zip', csv_name='sgemm_product.csv',
+                   target_col_idxs=[14, 15, 16, 17], use_log_target=True))
+    proc.apply(PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/00206/slice_localization_data.zip',
+                   ds_name='ct', zip_name='slice_localization_data.zip', csv_name='slice_localization_data.csv',
+                   target_col_idxs=[385], ignore_col_idxs=[0]))
+    proc.apply(PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/00221/Reaction%20Network%20(Undirected).data',
+                   ds_name='kegg_undir_uci', zip_name='kegg_undir_uci.csv', csv_name='kegg_undir_uci.csv',
+                   target_col_idxs=[26], ignore_col_idxs=[0], has_header=False, continuous_nan_columns=[4]))
+    # only use the Sydney part of the data set (could as well have used Adelaide, Perth or Tasmania)
+    proc.apply(PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/00494/WECs_DataSet.zip',
+                   ds_name='wecs', zip_name='WECs_DataSet.zip', csv_name='WECs_DataSet/Sydney_Data.csv',
+                   target_col_idxs=[48], has_header=False, ignore_other_csv_files=True))
+    proc.apply(PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/00335/online_video_dataset.zip',
+                                   ds_name='online_video', zip_name='online_video_dataset.zip',
+                                   csv_name='transcoding_mesurment.tsv', separator='\t', ignore_col_idxs=[0, 20],
+                                   target_col_idxs=[21], has_header=True, ignore_other_csv_files=True))
+    proc.apply(
+        PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/00493/datasets.zip',
+                            ds_name='query_agg_count', zip_name='datasets.zip',
+                            csv_name='Datasets/Range-Queries-Aggregates.csv', ignore_col_idxs=[0, 6, 7],
+                            target_col_idxs=[5], has_header=True, ignore_other_csv_files=True))
+    proc.apply(
+        PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/00246/3D_spatial_network.txt',
+                            ds_name='road_network', zip_name='3D_spatial_network.txt',
+                            csv_name='3D_spatial_network.txt', ignore_col_idxs=[0],
+                            target_col_idxs=[3], has_header=False, ignore_other_csv_files=True))
+    # note: we use only the testing data here with random splits since the training data is so small
+    # and the testing data so large
+    proc.apply(
+        PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand-testing.data',
+                            ds_name='poker', zip_name='data.csv', csv_name='data.csv',
+                            convert_to_cat_columns=list(range(10)),
+                            target_col_idxs=[10], has_header=False, ignore_other_csv_files=True))
+    proc.apply(get_sarcos_pandas_task())
+    proc.apply(PandasTask.from_openml_dataset_id(dataset_id=1200, ds_name='stock', target='company10'))
+    proc.apply(PandasTask.from_openml_dataset_id(dataset_id=42454, ds_name='mlr_knn_rng', target='perf.logloss',
+                                                 ignore_columns=['perf.mmce']))
+    proc.apply(PandasTask.from_openml_dataset_id(dataset_id=42701, ds_name='methane', target='MM264',
+                                                 ignore_columns=['MM263', 'MM256']))
+    proc.apply(PandasTask.from_openml_dataset_id(dataset_id=42225, ds_name='diamonds', target='price'))
+    proc.apply(PandasTask.from_openml_dataset_id(dataset_id=564, ds_name='fried', target='Y'))
+    proc.apply(PandasTask.from_openml_dataset_id(dataset_id=42903, ds_name='protein', target='RMSD'))
\ No newline at end of file
diff --git a/utils_data/utils.py b/utils_data/utils.py
new file mode 100644
index 0000000..28c1b5a
--- /dev/null
+++ b/utils_data/utils.py
@@ -0,0 +1,303 @@
+import os
+import os.path
+import heapq
+import glob
+import gzip
+import shutil
+import copy
+import timeit
+import time
+import numpy as np
+from typing import *
+import dill
+import json
+import itertools
+
+
+def select_from_config(config, keys):
+    selected = {}
+    for key in keys:
+        if key in config:
+            selected[key] = config[key]
+    return selected
+
+
+def adapt_config(config, **kwargs):
+    new_config = copy.deepcopy(config)
+    for key, value in kwargs.items():
+        new_config[key] = value
+    return new_config
+
+
+def existsDir(directory):
+    if directory != '':
+        if not os.path.exists(directory):
+            return False
+    return True
+
+
+def existsFile(file_path):
+    return os.path.isfile(file_path)
+
+
+def ensureDir(file_path):
+    directory = os.path.dirname(file_path)
+    if directory != '':
+        if not os.path.exists(directory):
+            os.makedirs(directory)
+
+
+def matchFiles(file_matcher):
+    return glob.glob(file_matcher)
+
+
+def newDirname(prefix):
+    i = 0
+    name = prefix
+    if existsDir(prefix):
+        while existsDir(prefix + "_" + str(i)):
+            i += 1
+        name = prefix + "_" + str(i)
+    os.makedirs(name)
+    return name
+
+
+def getSubfolderNames(folder):
+    return [os.path.basename(name)
+            for name in os.listdir(folder)
+            if os.path.isdir(os.path.join(folder, name))]
+
+
+def getSubfolders(folder):
+    return [os.path.join(folder, name)
+            for name in os.listdir(folder)
+            if os.path.isdir(os.path.join(folder, name))]
+
+
+def writeToFile(filename, content):
+    ensureDir(filename)
+    # opening in 'w' mode already truncates the file, so no explicit truncate() is needed
+    with open(filename, 'w') as file:
+        file.write(content)
+
+
+def readFromFile(filename):
+    if not os.path.isfile(filename):
+        return ''
+
+    file = open(filename, 'r')
+    result = file.read()
+    file.close()
+    return result
+
+
+def serialize(filename, obj, compressed=False, use_json=False):
+    # json only works for nested dicts
+    ensureDir(filename)
+    if compressed:
+        file = gzip.open(filename, 'w' if use_json else 'wb')
+    else:
+        file = open(filename, 'w' if use_json else 'wb')
+    # dill can dump lambdas, and dill also dumps the class and not only the contents
+    if use_json:
+        json.dump(obj, file)
+    else:
+        dill.dump(obj, file)
+    file.close()
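+
+# Illustrative usage (hypothetical path): serialize('cache/obj.pkl', {'a': 1}) writes a dill pickle and
+# deserialize('cache/obj.pkl') loads it back; pass use_json=True to both to use JSON instead of dill.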
+
+
+class CustomUnpickler(dill.Unpickler):
+    """
+    Search for classes also in the bmdal_reg module
+    in case the objects have been pickled before moving the code to a dedicated bmdal_reg module.
+    See https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory
+    """
+    def find_class(self, module, name):
+        try:
+            return super().find_class(module, name)
+        except Exception:
+            pass
+
+        return super().find_class('bmdal_reg.' + module, name)
+
+
+def deserialize(filename, compressed=False, use_json=False):
+    # json only works for nested dicts
+    if compressed:
+        file = gzip.open(filename, 'r' if use_json else 'rb')
+    else:
+        file = open(filename, 'r' if use_json else 'rb')
+    if use_json:
+        result = json.load(file)
+    else:
+        # result = dill.load(file)
+        result = CustomUnpickler(file).load()
+    file.close()
+    return result
+
+
+def copyFile(src, dst):
+    ensureDir(dst)
+    shutil.copyfile(src, dst)
+
+
+def nsmallest(n, inputList):
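+    # returns the n-th smallest element, e.g. nsmallest(2, [5, 1, 3]) == 3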
+    return heapq.nsmallest(n, inputList)[-1]
+
+
+def identity(x):
+    return x
+
+
+def set_none_except(lst, idxs):
+    for i in range(len(lst)):
+        if i not in idxs:
+            lst[i] = None
+
+
+def argsort(lst, key: Optional[Callable] = None):
+    # from https://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python
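+    # e.g. argsort([3, 1, 2]) == [1, 2, 0]; with a key, argsort(['aaa', 'a'], key=len) == [1, 0]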
+    if key is None:
+        range_key = lst.__getitem__
+    else:
+        range_key = lambda i, f=key: f(lst[i])
+    return sorted(range(len(lst)), key=range_key)
+
+
+def dict_argsort(dict_to_sort: dict, key: Optional[Callable] = None):
+    keys = list(dict_to_sort.keys())
+    values = list(dict_to_sort.values())
+    perm = argsort(values, key=key)
+    return [keys[i] for i in perm]
+
+
+def join_dicts(*dicts):
+    # Attention: arguments do not commute since later dicts can override entries from earlier dicts!
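+    # e.g. join_dicts({'a': 1, 'b': 2}, {'b': 3}) == {'a': 1, 'b': 3}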
+    result = copy.copy(dicts[0])
+    for d in dicts[1:]:
+        result.update(d)
+    return result
+
+
+def update_dict(d: dict, update: Optional[dict] = None, remove_keys: Optional[Union[object, List[object]]] = None):
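+    # e.g. update_dict({'a': 1, 'b': 2}, update={'c': 3}, remove_keys='b') == {'a': 1, 'c': 3}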
+    d = copy.copy(d)
+    if update is not None:
+        d.update(update)
+    if remove_keys is not None:
+        if isinstance(remove_keys, List):
+            for key in remove_keys:
+                d.pop(key)
+        else:
+            d.pop(remove_keys)
+    return d
+
+
+def pretty_table_str(str_table):
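+    # pads each entry with spaces to the maximum width of its column and joins the rows with newlines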
+    max_lens = [np.max([len(row[i]) for row in str_table]) for i in range(len(str_table[0]))]
+    whole_str = ''
+    for row in str_table:
+        for i, entry in enumerate(row):
+            whole_str += entry + (' ' * (max_lens[i] - len(entry)))
+        whole_str += '\n'
+    return whole_str[:-1]  # remove last newline
+
+
+def prod(it: Iterable, id=None):
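+    # product of all elements, e.g. prod([2, 3, 4]) == 24; for an empty iterable an identity element
+    # must be given, e.g. prod([], id=1) == 1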
+    result = None
+    for value in it:
+        if result is None:
+            result = value
+        else:
+            result = result * value
+    if result is None:
+        if id is None:
+            raise ValueError(f'Cannot compute empty product without identity element')
+        else:
+            return id
+    return result
+
+
+def all_equal(it: Iterable):
+    # see https://stackoverflow.com/questions/3844801/check-if-all-elements-in-a-list-are-identical
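+    # e.g. all_equal([1, 1, 1]) is True, all_equal([1, 2]) is False, and all_equal([]) is True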
+    g = itertools.groupby(it)  # groups runs of consecutive equal elements
+    try:
+        next(g)
+        next(g)
+    except StopIteration:
+        return True
+    return False
+
+
+class Timer:
+    def __init__(self):
+        self.start_time_total = None
+        self.start_time_process = None
+        self.acc_time_total = 0.0
+        self.acc_time_process = 0.0
+
+    def start(self):
+        if self.start_time_total is None or self.start_time_process is None:
+            self.start_time_total = timeit.default_timer()
+            self.start_time_process = time.process_time()
+
+    def pause(self):
+        if self.start_time_total is None or self.start_time_process is None:
+            return  # has already been paused or not been started
+        self.acc_time_total += timeit.default_timer() - self.start_time_total
+        self.acc_time_process += time.process_time() - self.start_time_process
+        self.start_time_total = None
+        self.start_time_process = None
+
+    def get_result_dict(self):
+        return {'total': self.acc_time_total, 'process': self.acc_time_process}
+
+
+class TimePrinter:
+    def __init__(self, desc: str):
+        self.desc = desc
+        self.timer = Timer()
+
+    def __enter__(self):
+        self.timer.start()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.timer.pause()
+        print(f'Time for {self.desc}: {self.timer.get_result_dict()["total"]:g}s')
+
+
+def format_length_s(duration: float) -> str:
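+    # e.g. format_length_s(3661.5) == '1h1m1s' and format_length_s(45.0) == '45s'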
+    seconds = int(duration)
+    minutes = seconds // 60
+    seconds -= minutes * 60
+    hours = minutes // 60
+    minutes -= hours * 60
+    days = hours // 24
+    hours -= days * 24
+
+    result = f'{seconds}s'
+    if minutes > 0:
+        result = f'{minutes}m' + result
+    if hours > 0:
+        result = f'{hours}h' + result
+    if days > 0:
+        result = f'{days}d' + result
+
+    return result
+
+
+def format_date_s(time_s: float) -> str:
+    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time_s))
+
+
+def get_batch_intervals(n_total: int, batch_size: int) -> List[Tuple[int, int]]:
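+    # e.g. get_batch_intervals(10, 4) == [(0, 4), (4, 8), (8, 10)]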
+    boundaries = [i * batch_size for i in range(1 + n_total // batch_size)]
+    if boundaries[-1] != n_total:
+        boundaries.append(n_total)
+    return [(start, stop) for start, stop in zip(boundaries[:-1], boundaries[1:])]
+
+
+def last_mod_time_recursive(path: str):
+    # see https://stackoverflow.com/questions/29685069/get-the-last-modified-date-of-a-directory-including-subdirectories-using-pytho
+    return max(os.path.getmtime(root) for root, _, _ in os.walk(path))
-- 
GitLab