From d66afd0cc7ecbb66d0c4376ebf3014a6e995b819 Mon Sep 17 00:00:00 2001 From: Tizian Wenzel <wenzeltn@nbanm02.mathematik.uni-stuttgart.de> Date: Thu, 8 Jun 2023 16:19:35 +0200 Subject: [PATCH] Final updates. --- requirements.txt | 8 ++++++++ section_4.1_compute_visualize.py | 7 +++---- section_4.2_compute.py | 8 +++----- section_4.2_visualize.py | 17 ++++++++--------- section_4.3_compute.py | 18 ++++++++---------- section_4.3_visualize.py | 11 +++++++---- utils_code/dataset_collection.py | 2 +- utils_code/main_function.py | 2 +- utils_code/optimized_kernel.py | 2 +- utils_data/custom_paths.py | 2 +- 10 files changed, 41 insertions(+), 36 deletions(-) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..7810fec --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +openml==0.13.1 +scipy==1.7.3 +scikit-learn==1.0.2 +requests==2.31.0 +pandas==1.3.5 +mat4py==0.5.0 +dill==0.3.6 +matplotlib==3.5.3 diff --git a/section_4.1_compute_visualize.py b/section_4.1_compute_visualize.py index 56acdfe..6bcee1e 100644 --- a/section_4.1_compute_visualize.py +++ b/section_4.1_compute_visualize.py @@ -4,9 +4,8 @@ # on the unit cube" to produce the plots within Figure 2. -import torch -from utils.main_function import run_everything, run_cross_validation -from utils.hyperparameters import dic_hyperparams +from utils_code.main_function import run_everything, run_cross_validation +from utils_code.hyperparameters import dic_hyperparams import numpy as np from matplotlib import pyplot as plt @@ -50,7 +49,7 @@ array_eps, array_cv_f, array_cv_f_val, _, list_timings_1L = run_cross_validation ## Store in matlab for beautiful tikzfigure plots -path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/')) +path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/')) + '/' os.makedirs(path_for_results, exist_ok=True) io.savemat(path_for_results + name_dataset + '.mat', diff --git a/section_4.2_compute.py b/section_4.2_compute.py index a66082e..a2054d6 100644 --- a/section_4.2_compute.py +++ b/section_4.2_compute.py @@ -15,10 +15,8 @@ import numpy as np np.random.seed(1) -list_datasets = ['fried', 'sarcos', 'protein', 'ct', 'diamonds', - 'stock', 'kegg_undir_uci', 'online_video', - 'wecs', 'mlr_knn_rng', 'query_agg_count', - 'sgemm', 'road_network', 'methane', 'poker'] #, 'susy', 'higgs'] +list_datasets = ['fried', 'sarcos', 'ct', 'diamonds', 'stock', 'kegg_undir_uci', 'online_video', + 'wecs', 'mlr_knn_rng', 'query_agg_count', 'sgemm', 'road_network'] ## Loop over reruns and datasets @@ -55,7 +53,7 @@ for idx_indices in [0, 1, 2, 3, 4]: ## Store in matlab for beautiful tikzfigure plots - path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/')) + path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/')) + '/' os.makedirs(path_for_results, exist_ok=True) io.savemat(path_for_results + name_dataset + '_{}'.format(idx_indices) + '.mat', diff --git a/section_4.2_visualize.py b/section_4.2_visualize.py index d78652c..457a66f 100644 --- a/section_4.2_visualize.py +++ b/section_4.2_visualize.py @@ -9,19 +9,17 @@ from matplotlib import pyplot as plt from scipy import io import os import scipy -import numpy as np +import matplotlib np.random.seed(1) ## Some settings -list_datasets = ['fried', 'sarcos', 'protein', 'ct', 'diamonds', - 'stock', 'kegg_undir_uci', 'online_video', - 'wecs', 'mlr_knn_rng', 'query_agg_count', - 'sgemm', 'road_network', 'methane', 'poker'] #, 'susy', 'higgs'] +list_datasets = ['fried', 'sarcos', 'ct', 'diamonds', 'stock', 'kegg_undir_uci', 'online_video', + 'wecs', 'mlr_knn_rng', 'query_agg_count', 'sgemm', 'road_network'] -path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'results_5reruns/')) +path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/')) + '/' ## Initialize dictionaries to store several quantities @@ -82,11 +80,12 @@ indices_sorted = np.argsort(array_ratio) ## Print the calculated ratio: This shows, when 2L is superior for idx_sorted in indices_sorted: print('{:20}'.format(list_datasets[idx_sorted]), np.round(array_ratio[idx_sorted], 5)) -indices_sorted = indices_sorted[:-3] # cut off last three datasets ## Visualization of the ratio of singular values compared to sum of all singular values -R = np.linspace(0, 1, int(1.0*len(list_datasets))) +matplotlib.use('TKAgg') + +R = np.linspace(0, 1, int(1.5*len(list_datasets))) array_color=plt.cm.hsv(R) dic_to_store = {} @@ -106,7 +105,7 @@ for idx_sorted in indices_sorted[::-1]: dic_to_store[name_dataset] = array plt.legend(list_legend) - +plt.draw() ## Store in matlab for beautiful tikzfigure plots # io.savemat('dic_singular_value_ratio.mat', dic_to_store) diff --git a/section_4.3_compute.py b/section_4.3_compute.py index 0618091..3f5b78e 100644 --- a/section_4.3_compute.py +++ b/section_4.3_compute.py @@ -9,9 +9,9 @@ import numpy as np from scipy.stats import ortho_group # Requires version 0.18 of scipy import pickle -from utils.dataset_collection import Dataset -from utils.hyperparameters import dic_hyperparams -from utils.main_function import run_everything +from utils_code.dataset_collection import Dataset +from utils_code.hyperparameters import dic_hyperparams +from utils_code.main_function import run_everything np.random.seed(1) @@ -20,16 +20,14 @@ np.random.seed(1) ## Some settings list_nctrs = [int(np.round(nr)) for nr in np.logspace(np.log(10) / np.log(10), np.log(1000) / np.log(10), 10)] -path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/')) +path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/')) + '/' flag_gaussian = False n_reruns = 5 -list_datasets = ['fried', 'sarcos', 'protein', 'ct', 'diamonds', - 'stock', 'kegg_undir_uci', 'online_video', - 'wecs', 'mlr_knn_rng', 'query_agg_count', - 'sgemm', 'road_network', 'methane', 'poker'] #, 'susy', 'higgs'] +list_datasets = ['fried', 'sarcos', 'ct', 'diamonds', 'stock', 'kegg_undir_uci', 'online_video', + 'wecs', 'mlr_knn_rng', 'query_agg_count', 'sgemm', 'road_network'] ## Loop over all datasets @@ -44,7 +42,7 @@ for idx_dataset, name_dataset in enumerate(list_datasets): list_A_start = [np.eye(X.shape[1]) for _ in range(n_reruns)] # Go only for a fixed set of indices - do not aim at error bars plot! - list_idx = [1, 2, 3, 4] + list_idx = [0, 1, 2, 3, 4] ## Loop to compute and store all initial and final matrices for the different nfolds optimizations dic_results = {} @@ -60,7 +58,7 @@ for idx_dataset, name_dataset in enumerate(list_datasets): ## Run everything for 2L for nfold in list_nfolds: - A_start, A_optimized, model, model_vkoga1, model_vkoga2, data, _, array_test_rmse_deep = run_everything( + A_start, A_optimized, model, model_vkoga1, model_vkoga2, data, _, array_test_rmse_deep, _ = run_everything( name_dataset, hyperparameter.maxIter_vkoga, hyperparameter.N_points, hyperparameter.noise_level, hyperparameter.reg_para_optim, hyperparameter.reg_para_vkoga, diff --git a/section_4.3_visualize.py b/section_4.3_visualize.py index 3743e45..4aa9ac2 100644 --- a/section_4.3_visualize.py +++ b/section_4.3_visualize.py @@ -19,12 +19,15 @@ np.random.seed(1) ## Some settings list_nctrs = [int(np.round(nr)) for nr in np.logspace(np.log(10) / np.log(10), np.log(1000) / np.log(10), 10)] -path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/')) +path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/')) + '/' ## First, collect results from all files in a common dictionary dic_results = {} # dic_results --> name_dataset --> idx_index --> nfold --> [A_start, A_optimized, model_vkoga1.train_hist['f'], array_test_rmse_deep] for idx_file, file in enumerate(os.listdir(path_for_results)): + if 'README' in file: + continue + idx_index = int(file.split('_')[2]) name_dataset = ('_'.join(file.split('_')[3:])).split('.')[:-1][0] print(name_dataset) @@ -157,9 +160,9 @@ for idx_dataset, name_dataset in enumerate(list(dic_results.keys())): plt.yscale('log') - ## Store in matlab for beautiful tikzfigure plots - io.savemat('dic_stabilityalignment_' + name_dataset + '_nfolds.mat', dic_alignment) - io.savemat('dic_singvals_' + name_dataset + '_nfolds.mat', dic_singvalsarray) + # ## Store in matlab for beautiful tikzfigure plots + # io.savemat('dic_stabilityalignment_' + name_dataset + '_nfolds.mat', dic_alignment) + # io.savemat('dic_singvals_' + name_dataset + '_nfolds.mat', dic_singvalsarray) if idx_dataset > 20: break diff --git a/utils_code/dataset_collection.py b/utils_code/dataset_collection.py index 057e898..99fd48a 100644 --- a/utils_code/dataset_collection.py +++ b/utils_code/dataset_collection.py @@ -46,7 +46,7 @@ class Dataset(): def example_holzmuller(self, ds_name): """Data set from David Holzmüller paper of batch active learning.""" - path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/data/')) + path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/data/')) + '/' X = np.load(path + ds_name + '/X.npy') y = np.load(path + ds_name + '/y.npy') diff --git a/utils_code/main_function.py b/utils_code/main_function.py index 2a2b318..dbc379f 100644 --- a/utils_code/main_function.py +++ b/utils_code/main_function.py @@ -19,7 +19,7 @@ from datetime import datetime ## Some settings list_nctrs = [int(np.round(nr)) for nr in np.logspace(np.log(10) / np.log(10), np.log(1000) / np.log(10), 10)] -path_for_indices = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'data/')) +path_for_indices = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'data/')) + '/' # Main functiono to run kernel optimization with subsequent vkoga diff --git a/utils_code/optimized_kernel.py b/utils_code/optimized_kernel.py index fb3aae3..1d9bfa4 100644 --- a/utils_code/optimized_kernel.py +++ b/utils_code/optimized_kernel.py @@ -1,6 +1,6 @@ from torch import nn import torch -from utils.cv_rippa_ext import compute_cv_loss_via_rippa_ext_2 +from utils_code.cv_rippa_ext import compute_cv_loss_via_rippa_ext_2 import numpy as np diff --git a/utils_data/custom_paths.py b/utils_data/custom_paths.py index 295b548..6c228b5 100644 --- a/utils_data/custom_paths.py +++ b/utils_data/custom_paths.py @@ -3,7 +3,7 @@ import os # This file allows to configure where to save data, results, plots etc. class CustomPaths: # path where downloaded data sets will be saved - data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/')) + data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/')) + '/' # path where benchmark results will be saved results_path = 'results' -- GitLab