diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7810fec01921a62629f81df6e079a2c201d230de
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+openml==0.13.1
+scipy==1.7.3
+scikit-learn==1.0.2
+requests==2.31.0
+pandas==1.3.5
+mat4py==0.5.0
+dill==0.3.6
+matplotlib==3.5.3
diff --git a/section_4.1_compute_visualize.py b/section_4.1_compute_visualize.py
index 56acdfedbdb03452078ca70458cc746380248a06..6bcee1e53809dddb8d69a9e26053e11f81975925 100644
--- a/section_4.1_compute_visualize.py
+++ b/section_4.1_compute_visualize.py
@@ -4,9 +4,8 @@
 # on the unit cube" to produce the plots within Figure 2.
-import torch
-from utils.main_function import run_everything, run_cross_validation
-from utils.hyperparameters import dic_hyperparams
+from utils_code.main_function import run_everything, run_cross_validation
+from utils_code.hyperparameters import dic_hyperparams
 import numpy as np
 from matplotlib import pyplot as plt
@@ -50,7 +49,7 @@ array_eps, array_cv_f, array_cv_f_val, _, list_timings_1L = run_cross_validation
 ## Store in matlab for beautiful tikzfigure plots
-path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/'))
+path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/')) + '/'
 os.makedirs(path_for_results, exist_ok=True)
 io.savemat(path_for_results + name_dataset + '.mat',
diff --git a/section_4.2_compute.py b/section_4.2_compute.py
index a66082e1649000c024ab671c5f4d54842b85921e..a2054d6192b77c597bc2b220339da1ee9b41bbe9 100644
--- a/section_4.2_compute.py
+++ b/section_4.2_compute.py
@@ -15,10 +15,8 @@ import numpy as np
 np.random.seed(1)
-list_datasets = ['fried', 'sarcos', 'protein', 'ct', 'diamonds',
-                 'stock', 'kegg_undir_uci', 'online_video',
-                 'wecs', 'mlr_knn_rng', 'query_agg_count',
-                 'sgemm', 'road_network', 'methane', 'poker'] #, 'susy', 'higgs']
+list_datasets = ['fried', 'sarcos', 'ct', 'diamonds', 'stock', 'kegg_undir_uci', 'online_video',
+                 'wecs', 'mlr_knn_rng', 'query_agg_count', 'sgemm', 'road_network']
 ## Loop over reruns and datasets
@@ -55,7 +53,7 @@ for idx_indices in [0, 1, 2, 3, 4]:
     ## Store in matlab for beautiful tikzfigure plots
-    path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/'))
+    path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/')) + '/'
     os.makedirs(path_for_results, exist_ok=True)
     io.savemat(path_for_results + name_dataset + '_{}'.format(idx_indices) + '.mat',
diff --git a/section_4.2_visualize.py b/section_4.2_visualize.py
index d78652cdbafd25fe868aa516437c8af25b4e14de..457a66f1fa5bb5dd5ff3e74848328c1cd75364d5 100644
--- a/section_4.2_visualize.py
+++ b/section_4.2_visualize.py
@@ -9,19 +9,17 @@ from matplotlib import pyplot as plt
 from scipy import io
 import os
 import scipy
-import numpy as np
+import matplotlib
 np.random.seed(1)
 ## Some settings
-list_datasets = ['fried', 'sarcos', 'protein', 'ct', 'diamonds',
-                 'stock', 'kegg_undir_uci', 'online_video',
-                 'wecs', 'mlr_knn_rng', 'query_agg_count',
-                 'sgemm', 'road_network', 'methane', 'poker'] #, 'susy', 'higgs']
+list_datasets = ['fried', 'sarcos', 'ct', 'diamonds', 'stock', 'kegg_undir_uci', 'online_video',
+                 'wecs', 'mlr_knn_rng', 'query_agg_count', 'sgemm', 'road_network']
-path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'results_5reruns/'))
+path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/')) + '/'
 ## Initialize dictionaries to store several quantities
@@ -82,11 +80,12 @@ indices_sorted = np.argsort(array_ratio)
 ## Print the calculated ratio: This shows, when 2L is superior
 for idx_sorted in indices_sorted:
     print('{:20}'.format(list_datasets[idx_sorted]), np.round(array_ratio[idx_sorted], 5))
-indices_sorted = indices_sorted[:-3] # cut off last three datasets
 ## Visualization of the ratio of singular values compared to sum of all singular values
-R = np.linspace(0, 1, int(1.0*len(list_datasets)))
+matplotlib.use('TKAgg')
+
+R = np.linspace(0, 1, int(1.5*len(list_datasets)))
 array_color=plt.cm.hsv(R)
 dic_to_store = {}
@@ -106,7 +105,7 @@ for idx_sorted in indices_sorted[::-1]:
     dic_to_store[name_dataset] = array
 plt.legend(list_legend)
-
+plt.draw()
 ## Store in matlab for beautiful tikzfigure plots
 # io.savemat('dic_singular_value_ratio.mat', dic_to_store)
diff --git a/section_4.3_compute.py b/section_4.3_compute.py
index 06180919ebf0777e4f6d785efba86a5810ee46ee..3f5b78e350f0a779e742de2b16322d4107cea20f 100644
--- a/section_4.3_compute.py
+++ b/section_4.3_compute.py
@@ -9,9 +9,9 @@ import numpy as np
 from scipy.stats import ortho_group # Requires version 0.18 of scipy
 import pickle
-from utils.dataset_collection import Dataset
-from utils.hyperparameters import dic_hyperparams
-from utils.main_function import run_everything
+from utils_code.dataset_collection import Dataset
+from utils_code.hyperparameters import dic_hyperparams
+from utils_code.main_function import run_everything
 np.random.seed(1)
@@ -20,16 +20,14 @@ np.random.seed(1)
 ## Some settings
 list_nctrs = [int(np.round(nr)) for nr in np.logspace(np.log(10) / np.log(10), np.log(1000) / np.log(10), 10)]
-path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/'))
+path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/')) + '/'
 flag_gaussian = False
 n_reruns = 5
-list_datasets = ['fried', 'sarcos', 'protein', 'ct', 'diamonds',
-                 'stock', 'kegg_undir_uci', 'online_video',
-                 'wecs', 'mlr_knn_rng', 'query_agg_count',
-                 'sgemm', 'road_network', 'methane', 'poker'] #, 'susy', 'higgs']
+list_datasets = ['fried', 'sarcos', 'ct', 'diamonds', 'stock', 'kegg_undir_uci', 'online_video',
+                 'wecs', 'mlr_knn_rng', 'query_agg_count', 'sgemm', 'road_network']
 ## Loop over all datasets
@@ -44,7 +42,7 @@ for idx_dataset, name_dataset in enumerate(list_datasets):
     list_A_start = [np.eye(X.shape[1]) for _ in range(n_reruns)]
     # Go only for a fixed set of indices - do not aim at error bars plot!
-    list_idx = [1, 2, 3, 4]
+    list_idx = [0, 1, 2, 3, 4]
     ## Loop to compute and store all initial and final matrices for the different nfolds optimizations
     dic_results = {}
@@ -60,7 +58,7 @@ for idx_dataset, name_dataset in enumerate(list_datasets):
         ## Run everything for 2L
         for nfold in list_nfolds:
-            A_start, A_optimized, model, model_vkoga1, model_vkoga2, data, _, array_test_rmse_deep = run_everything(
+            A_start, A_optimized, model, model_vkoga1, model_vkoga2, data, _, array_test_rmse_deep, _ = run_everything(
                 name_dataset, hyperparameter.maxIter_vkoga,
                 hyperparameter.N_points, hyperparameter.noise_level, hyperparameter.reg_para_optim, hyperparameter.reg_para_vkoga,
diff --git a/section_4.3_visualize.py b/section_4.3_visualize.py
index 3743e4505f7aff6187a6a4452f4dc48afc1da0fa..4aa9ac22dfa3ba5bdaf23c66e181f65fc4a8d033 100644
--- a/section_4.3_visualize.py
+++ b/section_4.3_visualize.py
@@ -19,12 +19,15 @@ np.random.seed(1)
 ## Some settings
 list_nctrs = [int(np.round(nr)) for nr in np.logspace(np.log(10) / np.log(10), np.log(1000) / np.log(10), 10)]
-path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/'))
+path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/')) + '/'
 ## First, collect results from all files in a common dictionary
 dic_results = {}    # dic_results --> name_dataset --> idx_index --> nfold --> [A_start, A_optimized, model_vkoga1.train_hist['f'], array_test_rmse_deep]
 for idx_file, file in enumerate(os.listdir(path_for_results)):
+    if 'README' in file:
+        continue
+
     idx_index = int(file.split('_')[2])
     name_dataset = ('_'.join(file.split('_')[3:])).split('.')[:-1][0]
     print(name_dataset)
@@ -157,9 +160,9 @@ for idx_dataset, name_dataset in enumerate(list(dic_results.keys())):
     plt.yscale('log')
-    ## Store in matlab for beautiful tikzfigure plots
-    io.savemat('dic_stabilityalignment_' + name_dataset + '_nfolds.mat', dic_alignment)
-    io.savemat('dic_singvals_' + name_dataset + '_nfolds.mat', dic_singvalsarray)
+    # ## Store in matlab for beautiful tikzfigure plots
+    # io.savemat('dic_stabilityalignment_' + name_dataset + '_nfolds.mat', dic_alignment)
+    # io.savemat('dic_singvals_' + name_dataset + '_nfolds.mat', dic_singvalsarray)
     if idx_dataset > 20:
         break
diff --git a/utils_code/dataset_collection.py b/utils_code/dataset_collection.py
index 057e898a9dfe362e282ad7773efe017468109e06..99fd48a2a3d1f33d148ddd07cbe0191f81f94bf7 100644
--- a/utils_code/dataset_collection.py
+++ b/utils_code/dataset_collection.py
@@ -46,7 +46,7 @@ class Dataset():
     def example_holzmuller(self, ds_name):
         """Data set from David Holzmüller paper of batch active learning."""
-        path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/data/'))
+        path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/data/')) + '/'
         X = np.load(path + ds_name + '/X.npy')
         y = np.load(path + ds_name + '/y.npy')
diff --git a/utils_code/main_function.py b/utils_code/main_function.py
index 2a2b31824bc20e1c1f0aa283c25acb7c4f453407..dbc379f4d5a798780b8b45fc69ea16f65c5c30f5 100644
--- a/utils_code/main_function.py
+++ b/utils_code/main_function.py
@@ -19,7 +19,7 @@ from datetime import datetime
 ## Some settings
 list_nctrs = [int(np.round(nr)) for nr in np.logspace(np.log(10) / np.log(10), np.log(1000) / np.log(10), 10)]
-path_for_indices = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'data/'))
+path_for_indices = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'data/')) + '/'
 # Main functiono to run kernel optimization with subsequent vkoga
diff --git a/utils_code/optimized_kernel.py b/utils_code/optimized_kernel.py
index fb3aae3ba8d7d965c9912dcded4149313f82a317..1d9bfa4d9a7517a91f3150ebdbbe877cd10ef7c0 100644
--- a/utils_code/optimized_kernel.py
+++ b/utils_code/optimized_kernel.py
@@ -1,6 +1,6 @@
 from torch import nn
 import torch
-from utils.cv_rippa_ext import compute_cv_loss_via_rippa_ext_2
+from utils_code.cv_rippa_ext import compute_cv_loss_via_rippa_ext_2
 import numpy as np
diff --git a/utils_data/custom_paths.py b/utils_data/custom_paths.py
index 295b548e4ea8b836db3e7f59b1f2926a0c432342..6c228b5393735a47054f15cbed86642a126e0aa5 100644
--- a/utils_data/custom_paths.py
+++ b/utils_data/custom_paths.py
@@ -3,7 +3,7 @@ import os
 # This file allows to configure where to save data, results, plots etc.
 class CustomPaths:
     # path where downloaded data sets will be saved
-    data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/'))
+    data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/')) + '/'
    # path where benchmark results will be saved
    results_path = 'results'
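
Note on the recurring "+ '/'" changes above: os.path.abspath() normalizes its argument and strips any trailing separator, so the scripts' later string concatenations such as path_for_results + name_dataset + '.mat' would otherwise fuse the file name onto the directory name. The sketch below only illustrates that behaviour (the file name is made up for the example); os.path.join avoids the manual separator altogether.

    import os

    base = os.path.abspath('results_5reruns/')   # abspath drops the trailing '/'
    print(base + 'fried.mat')                    # .../results_5rerunsfried.mat  -- glued together
    print(base + '/' + 'fried.mat')              # .../results_5reruns/fried.mat -- what the patch produces
    print(os.path.join(base, 'fried.mat'))       # library inserts the separator itself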
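
The new "if 'README' in file: continue" guard in section_4.3_visualize.py skips a README that sits next to the result files when the script iterates os.listdir(path_for_results). If other stray files could land in that directory, filtering on the file suffix would be a more defensive variant; this is only a sketch under the assumption that the result files end in '.pkl' (the actual suffix is not visible in this diff).

    import os

    path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/')) + '/'

    for file in os.listdir(path_for_results):
        if not file.endswith('.pkl'):        # assumed suffix; adjust to the files actually written
            continue
        idx_index = int(file.split('_')[2])  # same filename convention as the existing loop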