diff --git a/download_data.py b/download_data.py new file mode 100644 index 0000000000000000000000000000000000000000..2ec0e48d15cc0a9c16f6e0d9f11bd9f895b1206d --- /dev/null +++ b/download_data.py @@ -0,0 +1,5 @@ +from utils_data.download_data import import_all + +if __name__ == '__main__': + import_all() + pass \ No newline at end of file diff --git a/section_4.2_compute.py b/section_4.2_compute.py index d40c028cfdfbae9fc92f59b90ebb7879a6de6c3e..a66082e1649000c024ab671c5f4d54842b85921e 100644 --- a/section_4.2_compute.py +++ b/section_4.2_compute.py @@ -4,8 +4,8 @@ # sets", especially to produce the results for the Figures 3-5. -from utils.main_function import run_everything, run_cross_validation -from utils.hyperparameters import dic_hyperparams +from utils_code.main_function import run_everything, run_cross_validation +from utils_code.hyperparameters import dic_hyperparams from scipy import io import os diff --git a/setup.sh b/setup.sh index ddd3cd6cfbae20d02328d5c8268b4f1cdf7de2b9..3b14bc0ec6e66321a66d60a748b1e6a47807677b 100755 --- a/setup.sh +++ b/setup.sh @@ -16,12 +16,7 @@ source venv/bin/activate pip install --upgrade pip pip3 install --ignore-installed torch==1.9.1+cpu torchvision==0.10.1+cpu torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html -pip3 install scipy==1.7.3 -pip3 install scikit-learn==1.0.2 - - -# Install further stuff -pip install matplotlib +pip3 install -r requirements.txt # pip freeze > requirements.txt diff --git a/utils/cv_rippa_ext.py b/utils_code/cv_rippa_ext.py similarity index 100% rename from utils/cv_rippa_ext.py rename to utils_code/cv_rippa_ext.py diff --git a/utils/dataset_collection.py b/utils_code/dataset_collection.py similarity index 100% rename from utils/dataset_collection.py rename to utils_code/dataset_collection.py diff --git a/utils/hyperparameters.py b/utils_code/hyperparameters.py similarity index 100% rename from utils/hyperparameters.py rename to utils_code/hyperparameters.py diff --git a/utils/kernels.py b/utils_code/kernels.py similarity index 100% rename from utils/kernels.py rename to utils_code/kernels.py diff --git a/utils/main_function.py b/utils_code/main_function.py similarity index 98% rename from utils/main_function.py rename to utils_code/main_function.py index c1379c9fb5e520844eccb1c583717d249fe0603a..2a2b31824bc20e1c1f0aa283c25acb7c4f453407 100644 --- a/utils/main_function.py +++ b/utils_code/main_function.py @@ -1,16 +1,16 @@ -from utils.optimized_kernel import OptimizedKernel -from utils.dataset_collection import Dataset -from utils.hyperparameters import dic_hyperparams +from utils_code.optimized_kernel import OptimizedKernel +from utils_code.dataset_collection import Dataset +from utils_code.hyperparameters import dic_hyperparams -from utils import tkernels, kernels +from utils_code import tkernels, kernels import torch from matplotlib import pyplot as plt import numpy as np import time -from utils.vkoga import VKOGA +from utils_code.vkoga import VKOGA import os from datetime import datetime diff --git a/utils/optimized_kernel.py b/utils_code/optimized_kernel.py similarity index 100% rename from utils/optimized_kernel.py rename to utils_code/optimized_kernel.py diff --git a/utils/tkernels.py b/utils_code/tkernels.py similarity index 100% rename from utils/tkernels.py rename to utils_code/tkernels.py diff --git a/utils/vkoga.py b/utils_code/vkoga.py similarity index 99% rename from utils/vkoga.py rename to utils_code/vkoga.py index 
5e7fd4bd6c5b6b3a59f24c5d723f56c2fd87dfaa..28e54e519c22b7f16249c9d2032afc9561c3c49f 100644 --- a/utils/vkoga.py +++ b/utils_code/vkoga.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -from utils.kernels import Gaussian +from utils_code.kernels import Gaussian import numpy as np from sklearn.base import BaseEstimator from sklearn.utils.validation import check_X_y, check_array, check_is_fitted diff --git a/utils_data/README.md b/utils_data/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1f934debb46185d0dd06f337e5240795fa07b593 --- /dev/null +++ b/utils_data/README.md @@ -0,0 +1,7 @@ +Code taken from https://github.com/dholzmueller/bmdal_reg/tree/main +Copyright by David Holzmüller + + + + + diff --git a/utils_data/custom_paths.py b/utils_data/custom_paths.py new file mode 100644 index 0000000000000000000000000000000000000000..587e94b1a53569b3c2b742f5118e7522fea1618c --- /dev/null +++ b/utils_data/custom_paths.py @@ -0,0 +1,26 @@ +# This file allows to configure where to save data, results, plots etc. +class CustomPaths: + # path where downloaded data sets will be saved + data_path = 'data' + # path where benchmark results will be saved + results_path = 'results' + # path where plots and tables will be saved + plots_path = 'plots' + # path where benchmark results can be cached in a more efficient format such that they load faster + cache_path = 'cache' + + +def get_data_path(): + return CustomPaths.data_path + + +def get_results_path(): + return CustomPaths.results_path + + +def get_plots_path(): + return CustomPaths.plots_path + + +def get_cache_path(): + return CustomPaths.cache_path diff --git a/utils_data/data.py b/utils_data/data.py new file mode 100644 index 0000000000000000000000000000000000000000..485836317b1c04f933243c1b94330b55b08db5d1 --- /dev/null +++ b/utils_data/data.py @@ -0,0 +1,35 @@ +from pathlib import Path +from typing import * + +from utils_data import custom_paths +from utils_data import utils + + +class DataInfo: + """ + Represents information about a data set. + """ + def __init__(self, ds_name: str, n_tvp: int, n_test: int, n_features: int, + train_test_split: Optional[int]): + """ + :param ds_name: Name of the data set. + :param n_tvp: Number of train+val+pool samples. + :param n_test: Number of test samples. + :param n_features: Number of input features of the data set. + :param train_test_split: Set to None if the data set does not have a fixed (train+val+pool)-test split. + If this is an int, it is interpreted such that the train+val+pool set are the first train_test_split samples + and the test set are the remaining samples. + """ + # tvp = train+val+pool + self.ds_name = ds_name + self.n_features = n_features + self.n_tvp = n_tvp + self.n_test = n_test + self.n_samples = n_tvp + n_test + self.train_test_split = train_test_split + + def save(self): + """ + Saves this object to the path of the corresponding data set. 
+ """ + utils.serialize(Path(custom_paths.get_data_path()) / 'data' / self.ds_name / 'data_info.pkl', self) \ No newline at end of file diff --git a/utils_data/download_data.py b/utils_data/download_data.py new file mode 100644 index 0000000000000000000000000000000000000000..79497a00d09094d073650a7ef35272380dbbfa4e --- /dev/null +++ b/utils_data/download_data.py @@ -0,0 +1,446 @@ +import pandas as pd +import numpy as np +import requests +import shutil +from typing import * +import openml +import mat4py + +from utils_data import custom_paths +from utils_data import utils +from utils_data.data import DataInfo + + + +def download_if_not_exists(url: str, dest: str): + """ + Simple function for downloading a file from an url if no file at the destination path exists. + :param url: URL of the file to download. + :param dest: Path where to save the downloaded file. + """ + # following https://dzone.com/articles/simple-examples-of-downloading-files-using-python + utils.ensureDir(dest) + if not utils.existsFile(dest): + print('Downloading ' + url, flush=True) + # file = requests.get(url) + # open(dest, 'wb').write(file.content) + r = requests.get(url, stream=True) + with open(dest, 'wb') as f: + print('Progress (dot = 1 MB): ', end='', flush=True) + for ch in r.iter_content(chunk_size=1024**2): + print('.', end='', flush=True) + f.write(ch) + print(flush=True) + + +class PandasTask: + """ + This class represents a task (data set with indicated target variable) given by Pandas DataFrames. + Additionally, a dedicated train-test split can be specified + and the name of the data set needs to be specified for saving. + This class provides a variety of methods for altering the task by different preprocessing methods. + """ + def __init__(self, x_df: pd.DataFrame, y_df: pd.Series, ds_name: str, cat_indicator: Optional[List[bool]] = None, + train_test_split: Optional[int] = None): + """ + :param x_df: DataFrame containing the inputs (covariates). + :param y_df: pd.Series containing the targets. + :param ds_name: Name for saving the data set. + :param cat_indicator: Optional. + One may specify a list of booleans which indicate whether each column of x is a category (True) or not (False). + Otherwise, the column types in x_df will be used to decide whether a column is categorical or not. + :param train_test_split: Optional. An integer can be specified as the index of the first test sample, + if the data set has a dedicated test set part at the end. + """ + if cat_indicator is None: + cat_indicator = [not pd.api.types.is_numeric_dtype(x_df[x_df.columns[i]]) for i in range(len(x_df.columns))] + else: + # this is a fix coming from a different codebase + # because category_indicator[0] was False for the dataset MIP-2016-regression + # despite the column being categorical (dtype=object) + cat_indicator = [v or not pd.api.types.is_numeric_dtype(x_df[x_df.columns[i]]) + for i, v in enumerate(cat_indicator)] + if len(x_df.columns) != len(cat_indicator): + raise ValueError('x.shape[1] != len(cat_indicator)') + + self.x_df = x_df # should be (sparse) pd.DataFrame + # should be (sparse) pd.Series (i.e. a single column of a DataFrame) + self.y_df = y_df + self.ds_name = ds_name + + self.cat_cols = [x_df.columns[i] for i in range(len(x_df.columns)) if cat_indicator[i]] + self.cont_cols = [x_df.columns[i] for i in range(len(x_df.columns)) if not cat_indicator[i]] + self.train_test_split = train_test_split + + def get_n_samples(self): + """ + :return: Returns the number of samples (number of rows in the DataFrame). 
+ """ + return len(self.x_df) + + def remove_missing_cont(self): + """ + Removes rows with missing values in continuous columns. + """ + print('removing columns with missing continuous values') + if len(self.cont_cols) == 0: + return # no continuous columns + + not_nan_rows = self.x_df.notna().all(axis=1) + self.x_df = self.x_df.loc[not_nan_rows, :] + self.y_df = self.y_df.loc[not_nan_rows] + + def normalize_regression_y(self): + """ + Centers and standardizes the target variable. + """ + print('normalizing regression y') + y_np = np.asarray(self.y_df) + self.y_df.loc[:] = (y_np - np.mean(y_np)) / (np.std(y_np) + 1e-30) + + def subsample_dfs_(self, dfs: List[pd.DataFrame], max_n_samples: int) -> List[pd.DataFrame]: + """ + Internal method for jointly subsampling multiple Pandas DataFrames of the same number of rows. + :param dfs: Data Frames. + :param max_n_samples: Maximum number of remaining rows. + :return: Returns a List of potentially subsampled Pandas DataFrames. + """ + if len(dfs[0]) <= max_n_samples: + return dfs + print(f'subsampling from {len(dfs[0])} samples to {max_n_samples}') + idxs = np.random.default_rng(12345).permutation(len(dfs[0]))[:max_n_samples] + return [df.iloc[idxs] for df in dfs] + + def subsample(self, max_tvp_samples: int, max_test_samples: int): + """ + Subsamples the data set if necessary to not exceed a given maximum size. + :param max_tvp_samples: Maximum number of train+val+pool samples. + :param max_test_samples: Maximum number of test samples. + """ + if self.train_test_split is not None: + dfs_train = self.subsample_dfs_([self.x_df.loc[:self.train_test_split], + self.y_df.loc[:self.train_test_split]], max_n_samples=max_tvp_samples) + dfs_test = self.subsample_dfs_([self.x_df.loc[self.train_test_split:], + self.y_df.loc[self.train_test_split:]], max_n_samples=max_test_samples) + self.train_test_split = len(dfs_train[0]) + self.x_df = pd.concat([dfs_train[0], dfs_test[0]], axis=0) + self.y_df = pd.concat([dfs_train[1], dfs_test[1]], axis=0) + else: + dfs = self.subsample_dfs_([self.x_df, self.y_df], max_n_samples=max_tvp_samples + max_test_samples) + self.x_df, self.y_df = dfs[0], dfs[1] + + def remove_constant_columns(self): + """ + Removes columns with only a single value (this could happen after removing NaN values). + """ + # taken from https://stackoverflow.com/questions/20209600/pandas-dataframe-remove-constant-column + non_constant_columns = (self.x_df != self.x_df.iloc[0]).any() + print(f'removing constant columns') + self.x_df = self.x_df.loc[:, non_constant_columns] + self.cat_cols = [key for key in self.cat_cols if key in self.x_df.columns] + self.cont_cols = [key for key in self.cont_cols if key in self.x_df.columns] + + def one_hot_encode(self, max_one_hot_columns: int): + """ + Applies one-hot encoding to categorical columns. + :param max_one_hot_columns: Maximal number of allowed one-hot encoded columns. + If more one-hot encoded columns would be generated, + the categorical columns with the largest number of categories are not one-hot encoded. 
+ """ + cat_cols_with_size = [(col_name, self.x_df.loc[:, col_name].nunique()) + for i, col_name in enumerate(self.cat_cols)] + if len(cat_cols_with_size) == 0: + return # nothing to encode + print('one-hot encoding columns') + cat_cols_with_size.sort(key=lambda t: t[1]) # sort by size of category + max_cat_size = cat_cols_with_size[-1][1] + new_col_sum = 0 + for key, sz in cat_cols_with_size: + new_col_sum += sz + if new_col_sum > max_one_hot_columns: + max_cat_size = sz-1 + break + + new_cat_cols = [] + + for key, sz in cat_cols_with_size: + if sz <= max_cat_size: + print(f'one-hot encoding column {key} with {sz} unique elements') + # following https://stackoverflow.com/questions/37292872/how-can-i-one-hot-encode-in-python + # https://queirozf.com/entries/one-hot-encoding-a-feature-on-a-pandas-dataframe-an-example + col = self.x_df[key].astype('category') + dummies = pd.get_dummies(col, prefix=f'{key}_onehot_', drop_first=(sz == 2), dummy_na=True, + dtype=np.int32) + self.cont_cols.extend(list(dummies.columns)) + self.x_df.drop([key], axis=1, inplace=True) + self.x_df = pd.concat([self.x_df, dummies], axis=1) + else: + new_cat_cols.append(key) + print(f'categorical column {key} with {sz} unique values is not one-hot encoded due to size constraints') + + self.cat_cols = new_cat_cols + + def save(self, n_test: int): + """ + Saves the data set in the folder f'{custom_paths.get_data_path()}/data/{self.ds_name}'. + :param n_test: Desired number of test samples. + """ + folder = f'{custom_paths.get_data_path()}/data/{self.ds_name}' + x_cont = np.array(self.x_df.reindex(columns=self.cont_cols), dtype=np.float32) + y = np.array(self.y_df, dtype=np.float32) + n_tvp = self.get_n_samples() - n_test + data_info = DataInfo(ds_name=self.ds_name, n_tvp=n_tvp, n_test=n_test, + n_features=x_cont.shape[1], + train_test_split=self.train_test_split) + + utils.serialize(f'{folder}/data_info.pkl', data_info) + np.save(f'{folder}/X.npy', x_cont) + np.save(f'{folder}/y.npy', y[:, None]) + + # ds = DictDataset({'x_cont': torch.as_tensor(x_cont), 'x_cat': torch.as_tensor(x_cat), + # 'y': torch.as_tensor(y[:, None])}, + # {'x_cont': TensorInfo(feat_shape=[x_cont.shape[-1]]), + # 'x_cat': TensorInfo(cat_sizes=cat_sizes), + # 'y': TensorInfo(cat_sizes=[self.get_n_classes()])}) + # task_info = TaskInfo.from_ds(task_desc, ds) + # return Task(task_info, ds) + + @staticmethod + def from_openml_task_id(task_id: int, ds_name: str): + """ + Creates a PandasTask from an OpenML task. + If more options are desired, we refer to PandasTask.from_openml_dataset_id() instead. + :param task_id: OpenML task id. + :param ds_name: Short name of the data set for saving. + :return: Returns a PandasTask representing the OpenML task. + """ + task = openml.tasks.get_task(task_id, download_data=False) + return PandasTask.from_openml_dataset_id(task.dataset_id, ds_name, task.target_name) + + @staticmethod + def from_openml_dataset_id(dataset_id: int, ds_name: str, target: str, + ignore_columns: Optional[List[str]] = None, + use_log_target: bool = False): + """ + Creates a PandasTask from an OpenML data set. + :param dataset_id: OpenML data set id. + :param ds_name: Short name of the data set for saving. + :param target: Name of the target variable. + :param ignore_columns: Optional. List of columns that should be removed. + :param use_log_target: Whether the logarithm should be applied to the target column. + :return: Returns a PandasTask representing the corresponding OpenML data set. 
+ """ + print(f'Importing dataset {ds_name}') + openml.config.set_cache_directory('./openml_cache') + dataset = openml.datasets.get_dataset(dataset_id, download_data=False) + print(f'dataset name: {dataset.name}') + # print(dataset.get_data(dataset_format='dataframe')) + x_df, y_df, cat_indicator, names = dataset.get_data(target=target, + dataset_format='dataframe') + + if ignore_columns is not None: + cat_indicator = [value for col_name, value in zip(x_df.columns, cat_indicator) + if col_name not in ignore_columns] + + for key in ignore_columns or []: + x_df.drop([key], axis=1, inplace=True) + + if use_log_target: + y_df = np.log(y_df) + + print('Imported x_df:\n', x_df) + print('Imported y_df:\n', y_df) + + return PandasTask(x_df, y_df, ds_name, cat_indicator) + + @staticmethod + def from_uci(url: str, ds_name: str, zip_name: str, csv_name: str, target_col_idxs: List[int], + ignore_col_idxs: Optional[List[int]] = None, use_log_target: bool = False, + train_test_boundary: Optional[int] = None, has_header: bool = True, + continuous_nan_columns: Optional[List[int]] = None, + convert_to_cat_columns: Optional[List[int]] = None, + ignore_other_csv_files: bool = False, separator: Optional[str] = None): + """ + Create a PandasTask object from a data set on the UCI repository. + :param url: URL of the data set file. + :param ds_name: Short name of the data set used for saving the data set. + :param zip_name: Target name of the downloaded file. + If the downloaded file is not zip/compressed, i.e., no unzipping is needed, + zip_name should be the same as csv_name. + Otherwise the file with name zip_name will be unzipped to the file with name csv_name. + :param csv_name: Target name of the uncompressed file, see zip_name. + :param target_col_idxs: List of indexes of target columns. Mostly, this will only have one element. + :param ignore_col_idxs: List of indexes of columns to be removed. + :param use_log_target: Whether the logarithm should be applied to the target value. + :param train_test_boundary: Index of the first test sample. + If (as in most cases) there is no dedicated test set, None should be specified. + :param has_header: Whether the downloaded (unzipped) csv file has a row with column names that should be removed. + :param continuous_nan_columns: Optional. List of indexes of numeric columns that can contain NaN values. + This triggers a corresponding Pandas column conversion. + :param convert_to_cat_columns: Optional. + List of indexes of columns that should be converted to categorical type. + :param ignore_other_csv_files: If set to True, + unzipped csv files with file names other than csv_name will be ignored. + If set to False, unzipped csv files with file names other than csv_name + will be interpreted as a dedicated test set, + and the file with name csv_name will be interpreted as the train+val+pool set. + :param separator: Separator in the csv file. Default is ','. + If the file with name csv_name is a tsv file, separator='\t' should be specified. + :return: Returns a PandasTask object. 
+ """ + print(f'Importing dataset {ds_name}') + base_path = custom_paths.get_data_path() + raw_data_folder = f'{base_path}/raw_data/{ds_name}' + zip_file = f'{raw_data_folder}/{zip_name}' + csv_file = f'{raw_data_folder}/{csv_name}' + download_if_not_exists(url, zip_file) + if not utils.existsFile(csv_file): + print('Unpacking zip file...') + shutil.unpack_archive(zip_file, raw_data_folder) + + if separator is None: + separator = ',' + + print('Processing csv data...') + if ignore_other_csv_files: + non_train_files = [] + else: + non_train_files = [file for file in utils.matchFiles(f'{raw_data_folder}/*.csv') if file != csv_file] + df = pd.read_csv(csv_file, header='infer' if has_header else None, sep=separator) + if len(non_train_files) > 0: + train_test_boundary = len(df) + df = pd.concat([df] + [pd.read_csv(file, header='infer' if has_header else None, sep=separator) + for file in non_train_files]) + + if continuous_nan_columns is not None: + for col_idx in continuous_nan_columns: + df.iloc[:, col_idx] = pd.to_numeric(df.iloc[:, col_idx], errors='coerce') + + if convert_to_cat_columns is not None: + for col_idx in convert_to_cat_columns: + df.iloc[:, col_idx] = df.iloc[:, col_idx].astype('category') + + input_column_names = [] + target_columns = [] + for i in range(len(df.columns)): + if i in target_col_idxs: + target_columns.append(df.iloc[:, i].to_numpy().astype(np.float32)) + elif ignore_col_idxs is None or i not in ignore_col_idxs: + input_column_names.append(df.columns[i]) + + y = np.median(np.stack(target_columns, axis=1), axis=1) + if use_log_target: + y = np.log(y) + + # https://stackoverflow.com/questions/20490274/how-to-reset-index-in-a-pandas-dataframe + x_df = df.reindex(columns=input_column_names).reset_index(drop=True) + y_df = pd.DataFrame({'y': y})['y'] + + return PandasTask(x_df, y_df, ds_name, + train_test_split=train_test_boundary) + + +class PandasTaskPreprocessor: + """ + This class allows to preprocess data sets given by PandasTask objects. Various options can be configured. + """ + def __init__(self, min_n_samples: int, max_tvp_samples: int, max_test_samples: int, max_one_hot_columns: int): + """ + :param min_n_samples: Minimum number of samples that a task must have + after removing missing values in continuous columns. + :param max_tvp_samples: Maximum number of samples for the train+val+pool sets. + :param max_test_samples: Maximum number of samples for the test set. + :param max_one_hot_columns: Maximum number of one-hot encoded columns that are allowed. + If more would be generated, the categorical variables with the largest category sizes are removed. + """ + self.min_n_samples = min_n_samples + self.max_tvp_samples = max_tvp_samples + self.max_test_samples = max_test_samples + self.max_one_hot_columns = max_one_hot_columns + + def apply(self, pd_task: PandasTask): + """ + Apply preprocessing to a PandasTask with the options given in the constructor + and save the preprocessed data set under the name specified in the PandasTask. + :param pd_task: PandasTask object holding infomation about the unprocessed data set. 
+ """ + pd_task.remove_missing_cont() + if pd_task.get_n_samples() < self.min_n_samples: + print(f'Task {pd_task.ds_name} has only {pd_task.get_n_samples()} samples after removing missing values, ' + f'hence we discard it') + return + pd_task.subsample(max_tvp_samples=self.max_tvp_samples, max_test_samples=self.max_test_samples) + pd_task.remove_constant_columns() + pd_task.one_hot_encode(self.max_one_hot_columns) + pd_task.normalize_regression_y() + n_samples = pd_task.get_n_samples() + if pd_task.train_test_split is not None: + n_test = n_samples - pd_task.train_test_split + else: + n_test = max(int(0.2 * n_samples), n_samples - self.max_tvp_samples) + pd_task.save(n_test) + + +def get_sarcos_pandas_task() -> PandasTask: + """ + This is a separate function for downloading the sarcos data set, since it is not in the UCI / OpenML repositories. + """ + print(f'Importing dataset sarcos') + base_path = custom_paths.get_data_path() + raw_data_folder = f'{base_path}/raw_data/sarcos' + file_path = f'{raw_data_folder}/sarcos_inv.mat' + download_if_not_exists('http://www.gaussianprocess.org/gpml/data/sarcos_inv.mat', file_path) + # don't download test data since test_data = train_data[::10] + # download_if_not_exists('http://www.gaussianprocess.org/gpml/data/sarcos_inv_test.mat', + # f'{raw_data_folder}/sarcos_inv_test.mat') + data = np.array(mat4py.loadmat(file_path)['sarcos_inv']) + return PandasTask(x_df=pd.DataFrame(data[:, :-7]), y_df=pd.Series(data[:, -7]), ds_name='sarcos') + + +def import_all(): + proc = PandasTaskPreprocessor(min_n_samples=30000, max_tvp_samples=200000, max_test_samples=300000, + max_one_hot_columns=300) + + proc.apply(PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/00440/sgemm_product_dataset.zip', + ds_name='sgemm', zip_name='sgemm_product_dataset.zip', csv_name='sgemm_product.csv', + target_col_idxs=[14, 15, 16, 17], use_log_target=True)) + proc.apply(PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/00206/slice_localization_data.zip', + ds_name='ct', zip_name='slice_localization_data.zip', csv_name='slice_localization_data.csv', + target_col_idxs=[385], ignore_col_idxs=[0])) + proc.apply(PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/00221/Reaction%20Network%20(Undirected).data', + ds_name='kegg_undir_uci', zip_name='kegg_undir_uci.csv', csv_name='kegg_undir_uci.csv', + target_col_idxs=[26], ignore_col_idxs=[0], has_header=False, continuous_nan_columns=[4])) + # only use the Sydney part of the data set (could as well have used Adelaide, Perth or Tasmania) + proc.apply(PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/00494/WECs_DataSet.zip', + ds_name='wecs', zip_name='WECs_DataSet.zip', csv_name='WECs_DataSet/Sydney_Data.csv', + target_col_idxs=[48], has_header=False, ignore_other_csv_files=True)) + proc.apply(PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/00335/online_video_dataset.zip', + ds_name='online_video', zip_name='online_video_dataset.zip', + csv_name='transcoding_mesurment.tsv', separator='\t', ignore_col_idxs=[0, 20], + target_col_idxs=[21], has_header=True, ignore_other_csv_files=True)) + proc.apply( + PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/00493/datasets.zip', + ds_name='query_agg_count', zip_name='datasets.zip', + csv_name='Datasets/Range-Queries-Aggregates.csv', ignore_col_idxs=[0, 6, 7], + target_col_idxs=[5], has_header=True, ignore_other_csv_files=True)) + 
proc.apply( + PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/00246/3D_spatial_network.txt', + ds_name='road_network', zip_name='3D_spatial_network.txt', + csv_name='3D_spatial_network.txt', ignore_col_idxs=[0], + target_col_idxs=[3], has_header=False, ignore_other_csv_files=True)) + # note: we use only the testing data here with random splits since the training data is so small + # and the testing data so large + proc.apply( + PandasTask.from_uci('https://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand-testing.data', + ds_name='poker', zip_name='data.csv', csv_name='data.csv', + convert_to_cat_columns=list(range(10)), + target_col_idxs=[10], has_header=False, ignore_other_csv_files=True)) + proc.apply(get_sarcos_pandas_task()) + proc.apply(PandasTask.from_openml_dataset_id(dataset_id=1200, ds_name='stock', target='company10')) + proc.apply(PandasTask.from_openml_dataset_id(dataset_id=42454, ds_name='mlr_knn_rng', target='perf.logloss', + ignore_columns=['perf.mmce'])) + proc.apply(PandasTask.from_openml_dataset_id(dataset_id=42701, ds_name='methane', target='MM264', + ignore_columns=['MM263', 'MM256'])) + proc.apply(PandasTask.from_openml_dataset_id(dataset_id=42225, ds_name='diamonds', target='price')) + proc.apply(PandasTask.from_openml_dataset_id(dataset_id=564, ds_name='fried', target='Y')) + proc.apply(PandasTask.from_openml_dataset_id(dataset_id=42903, ds_name='protein', target='RMSD')) \ No newline at end of file diff --git a/utils_data/utils.py b/utils_data/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..28c1b5a7969300554f5a29adcaeabdc48126c484 --- /dev/null +++ b/utils_data/utils.py @@ -0,0 +1,303 @@ +import os +import os.path +import heapq +import glob +import gzip +import shutil +import copy +import timeit +import time +import numpy as np +from typing import * +import dill +import json +import itertools + + +def select_from_config(config, keys): + selected = {} + for key in keys: + if key in config: + selected[key] = config[key] + return selected + + +def adapt_config(config, **kwargs): + new_config = copy.deepcopy(config) + for key, value in kwargs.items(): + new_config[key] = value + return new_config + + +def existsDir(directory): + if directory != '': + if not os.path.exists(directory): + return False + return True + + +def existsFile(file_path): + return os.path.isfile(file_path) + + +def ensureDir(file_path): + directory = os.path.dirname(file_path) + if directory != '': + if not os.path.exists(directory): + os.makedirs(directory) + + +def matchFiles(file_matcher): + return glob.glob(file_matcher) + + +def newDirname(prefix): + i = 0 + name = prefix + if existsDir(prefix): + while existsDir(prefix + "_" + str(i)): + i += 1 + name = prefix + "_" + str(i) + os.makedirs(name) + return name + + +def getSubfolderNames(folder): + return [os.path.basename(name) + for name in os.listdir(folder) + if os.path.isdir(os.path.join(folder, name))] + + +def getSubfolders(folder): + return [os.path.join(folder, name) + for name in os.listdir(folder) + if os.path.isdir(os.path.join(folder, name))] + + +def writeToFile(filename, content): + ensureDir(filename) + file = open(filename, 'w') + file.truncate() + file.write(content) + file.close() + + +def readFromFile(filename): + if not os.path.isfile(filename): + return '' + + file = open(filename, 'r') + result = file.read() + file.close() + return result + + +def serialize(filename, obj, compressed=False, use_json=False): + # json only works for nested dicts + 
ensureDir(filename) + if compressed: + file = gzip.open(filename, 'w' if use_json else 'wb') + else: + file = open(filename, 'w' if use_json else 'wb') + # dill can dump lambdas, and dill also dumps the class and not only the contents + if use_json: + json.dump(obj, file) + else: + dill.dump(obj, file) + file.close() + + +class CustomUnpickler(dill.Unpickler): + """ + Search for classes also in the bmdal_reg module + in case the objects have been pickled before moving the code to a dedicated bmdal_reg module. + See https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory + """ + def find_class(self, module, name): + try: + return super().find_class(module, name) + except: + pass + + return super().find_class('bmdal_reg.' + module, name) + + +def deserialize(filename, compressed=False, use_json=False): + # json only works for nested dicts + if compressed: + file = gzip.open(filename, 'r' if use_json else 'rb') + else: + file = open(filename, 'r' if use_json else 'rb') + if use_json: + result = json.load(file) + else: + # result = dill.load(file) + result = CustomUnpickler(file).load() + file.close() + return result + + +def copyFile(src, dst): + ensureDir(dst) + shutil.copyfile(src, dst) + + +def nsmallest(n, inputList): + return heapq.nsmallest(n, inputList)[-1] + + +def identity(x): + return x + + +def set_none_except(lst, idxs): + for i in range(len(lst)): + if i not in idxs: + lst[i] = None + + +def argsort(lst, key: Optional[Callable] = None): + # from https://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python + if key is None: + range_key = lst.__getitem__ + else: + range_key = lambda i, f=key: f(lst[i]) + return sorted(range(len(lst)), key=range_key) + + +def dict_argsort(dict_to_sort: dict, key: Optional[Callable] = None): + keys = list(dict_to_sort.keys()) + values = list(dict_to_sort.values()) + perm = argsort(values, key=key) + return [keys[i] for i in perm] + + +def join_dicts(*dicts): + # Attention: arguments do not commute since later dicts can override entries from earlier dicts! 
+ result = copy.copy(dicts[0]) + for d in dicts[1:]: + result.update(d) + return result + + +def update_dict(d: dict, update: Optional[dict] = None, remove_keys: Optional[Union[object, List[object]]] = None): + d = copy.copy(d) + if update is not None: + d.update(update) + if remove_keys is not None: + if isinstance(remove_keys, List): + for key in remove_keys: + d.pop(key) + else: + d.pop(remove_keys) + return d + + +def pretty_table_str(str_table): + max_lens = [np.max([len(row[i]) for row in str_table])for i in range(len(str_table[0]))] + whole_str = '' + for row in str_table: + for i, entry in enumerate(row): + whole_str += entry + (' ' * (max_lens[i] - len(entry))) + whole_str += '\n' + return whole_str[:-1] # remove last newline + + +def prod(it: Iterable, id=None): + result = None + for value in it: + if result is None: + result = value + else: + result = result * value + if result is None: + if id is None: + raise ValueError(f'Cannot compute empty product without identity element') + else: + return id + return result + + +def all_equal(it: Iterable): + # see https://stackoverflow.com/questions/3844801/check-if-all-elements-in-a-list-are-identical + g = itertools.groupby(it) # iterates over unique elements + try: + next(g) + next(g) + except StopIteration: + return True + return False + + +class Timer: + def __init__(self): + self.start_time_total = None + self.start_time_process = None + self.acc_time_total = 0.0 + self.acc_time_process = 0.0 + + def start(self): + if self.start_time_total is None or self.start_time_process is None: + self.start_time_total = timeit.default_timer() + self.start_time_process = time.process_time() + + def pause(self): + if self.start_time_total is None or self.start_time_process is None: + return # has already been paused or not been started + self.acc_time_total += timeit.default_timer() - self.start_time_total + self.acc_time_process += time.process_time() - self.start_time_process + self.start_time_total = None + self.start_time_process = None + + def get_result_dict(self): + return {'total': self.acc_time_total, 'process': self.acc_time_process} + + +class TimePrinter: + def __init__(self, desc: str): + self.desc = desc + self.timer = Timer() + + def __enter__(self): + self.timer.start() + + def __exit__(self, exc_type, exc_val, exc_tb): + self.timer.pause() + print(f'Time for {self.desc}: {self.timer.get_result_dict()["total"]:g}s') + + +def format_length_s(duration: float) -> str: + seconds = int(duration) + minutes = seconds // 60 + seconds -= minutes * 60 + hours = minutes // 60 + minutes -= hours * 60 + days = hours // 24 + hours -= days * 24 + + result = f'{seconds}s' + if minutes > 0: + result = f'{minutes}m' + result + if hours > 0: + result = f'{hours}h' + result + if days > 0: + result = f'{days}d' + result + + return result + + +def format_date_s(time_s: float) -> str: + return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time_s)) + + +def get_batch_intervals(n_total: int, batch_size: int) -> List[Tuple[int, int]]: + boundaries = [i * batch_size for i in range(1 + n_total // batch_size)] + if boundaries[-1] != n_total: + boundaries.append(n_total) + return [(start, stop) for start, stop in zip(boundaries[:-1], boundaries[1:])] + + +def last_mod_time_recursive(path: str): + # see https://stackoverflow.com/questions/29685069/get-the-last-modified-date-of-a-directory-including-subdirectories-using-pytho + import os + return max(os.path.getmtime(root) for root, _, _ in os.walk(path))
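
Usage note (not part of the patch itself): after running `python download_data.py`, each imported data set ends up under `{CustomPaths.data_path}/data/{ds_name}/` as `X.npy`, `y.npy` and a pickled `DataInfo`, as written by `PandasTask.save()`. The following is a minimal loading sketch under those assumptions; the data set name `'ct'` is just one example from `import_all()`.

```python
# Minimal sketch: load one prepared data set, assuming `python download_data.py`
# has been run and the default paths from utils_data/custom_paths.py are used.
import numpy as np

from utils_data import custom_paths, utils

ds_name = 'ct'  # example; any ds_name registered in import_all() works
folder = f'{custom_paths.get_data_path()}/data/{ds_name}'

X = np.load(f'{folder}/X.npy')  # float32, shape (n_samples, n_features)
y = np.load(f'{folder}/y.npy')  # float32, shape (n_samples, 1), standardized targets
data_info = utils.deserialize(f'{folder}/data_info.pkl')  # DataInfo written by PandasTask.save()

print(data_info.ds_name, data_info.n_samples, data_info.n_features,
      data_info.n_tvp, data_info.n_test, data_info.train_test_split)
```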