From d66afd0cc7ecbb66d0c4376ebf3014a6e995b819 Mon Sep 17 00:00:00 2001
From: Tizian Wenzel <wenzeltn@nbanm02.mathematik.uni-stuttgart.de>
Date: Thu, 8 Jun 2023 16:19:35 +0200
Subject: [PATCH] Fix utils_code imports and result paths, pin requirements, trim dataset lists

---
 requirements.txt                 |  8 ++++++++
 section_4.1_compute_visualize.py |  7 +++----
 section_4.2_compute.py           |  8 +++-----
 section_4.2_visualize.py         | 17 ++++++++---------
 section_4.3_compute.py           | 18 ++++++++----------
 section_4.3_visualize.py         | 11 +++++++----
 utils_code/dataset_collection.py |  2 +-
 utils_code/main_function.py      |  2 +-
 utils_code/optimized_kernel.py   |  2 +-
 utils_data/custom_paths.py       |  2 +-
 10 files changed, 41 insertions(+), 36 deletions(-)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..7810fec
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+openml==0.13.1
+scipy==1.7.3
+scikit-learn==1.0.2
+requests==2.31.0
+pandas==1.3.5
+mat4py==0.5.0
+dill==0.3.6
+matplotlib==3.5.3
diff --git a/section_4.1_compute_visualize.py b/section_4.1_compute_visualize.py
index 56acdfe..6bcee1e 100644
--- a/section_4.1_compute_visualize.py
+++ b/section_4.1_compute_visualize.py
@@ -4,9 +4,8 @@
 # on the unit cube" to produce the plots within Figure 2.
 
 
-import torch
-from utils.main_function import run_everything, run_cross_validation
-from utils.hyperparameters import dic_hyperparams
+from utils_code.main_function import run_everything, run_cross_validation
+from utils_code.hyperparameters import dic_hyperparams
 
 import numpy as np
 from matplotlib import pyplot as plt
@@ -50,7 +49,7 @@ array_eps, array_cv_f, array_cv_f_val, _, list_timings_1L = run_cross_validation
 
 
 ## Store in matlab for beautiful tikzfigure plots
-path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/'))
+path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/')) + '/'
 
 os.makedirs(path_for_results, exist_ok=True)
 io.savemat(path_for_results + name_dataset + '.mat',
diff --git a/section_4.2_compute.py b/section_4.2_compute.py
index a66082e..a2054d6 100644
--- a/section_4.2_compute.py
+++ b/section_4.2_compute.py
@@ -15,10 +15,8 @@ import numpy as np
 np.random.seed(1)
 
 
-list_datasets = ['fried', 'sarcos', 'protein', 'ct', 'diamonds',
-                 'stock', 'kegg_undir_uci', 'online_video',
-                 'wecs', 'mlr_knn_rng', 'query_agg_count',
-                 'sgemm', 'road_network', 'methane', 'poker'] #, 'susy', 'higgs']
+list_datasets = ['fried', 'sarcos', 'ct', 'diamonds', 'stock', 'kegg_undir_uci', 'online_video',
+                 'wecs', 'mlr_knn_rng', 'query_agg_count', 'sgemm', 'road_network']
 
 
 ## Loop over reruns and datasets
@@ -55,7 +53,7 @@ for idx_indices in [0, 1, 2, 3, 4]:
 
 
         ## Store in matlab for beautiful tikzfigure plots
-        path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/'))
+        path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/')) + '/'
 
         os.makedirs(path_for_results, exist_ok=True)
         io.savemat(path_for_results + name_dataset + '_{}'.format(idx_indices) + '.mat',
diff --git a/section_4.2_visualize.py b/section_4.2_visualize.py
index d78652c..457a66f 100644
--- a/section_4.2_visualize.py
+++ b/section_4.2_visualize.py
@@ -9,19 +9,17 @@ from matplotlib import pyplot as plt
 from scipy import io
 import os
 import scipy
-import numpy as np
+import matplotlib
 
 
 np.random.seed(1)
 
 
 ## Some settings
-list_datasets = ['fried', 'sarcos', 'protein', 'ct', 'diamonds',
-                 'stock', 'kegg_undir_uci', 'online_video',
-                 'wecs', 'mlr_knn_rng', 'query_agg_count',
-                 'sgemm', 'road_network', 'methane', 'poker'] #, 'susy', 'higgs']
+list_datasets = ['fried', 'sarcos', 'ct', 'diamonds', 'stock', 'kegg_undir_uci', 'online_video',
+                 'wecs', 'mlr_knn_rng', 'query_agg_count', 'sgemm', 'road_network']
 
-path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'results_5reruns/'))
+path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/')) + '/'
 
 
 ## Initialize dictionaries to store several quantities
@@ -82,11 +80,12 @@ indices_sorted = np.argsort(array_ratio)
 ## Print the calculated ratio: This shows, when 2L is superior
 for idx_sorted in indices_sorted:
     print('{:20}'.format(list_datasets[idx_sorted]), np.round(array_ratio[idx_sorted], 5))
-indices_sorted = indices_sorted[:-3]     # cut off last three datasets
 
 
 ## Visualization of the ratio of singular values compared to sum of all singular values
-R = np.linspace(0, 1, int(1.0*len(list_datasets)))
+matplotlib.use('TKAgg')
+
+R = np.linspace(0, 1, int(1.5*len(list_datasets)))
 array_color=plt.cm.hsv(R)
 
 dic_to_store = {}
@@ -106,7 +105,7 @@ for idx_sorted in indices_sorted[::-1]:
 
     dic_to_store[name_dataset] = array
 plt.legend(list_legend)
-
+plt.draw()
 
 ## Store in matlab for beautiful tikzfigure plots
 # io.savemat('dic_singular_value_ratio.mat', dic_to_store)
diff --git a/section_4.3_compute.py b/section_4.3_compute.py
index 0618091..3f5b78e 100644
--- a/section_4.3_compute.py
+++ b/section_4.3_compute.py
@@ -9,9 +9,9 @@ import numpy as np
 from scipy.stats import ortho_group  # Requires version 0.18 of scipy
 import pickle
 
-from utils.dataset_collection import Dataset
-from utils.hyperparameters import dic_hyperparams
-from utils.main_function import run_everything
+from utils_code.dataset_collection import Dataset
+from utils_code.hyperparameters import dic_hyperparams
+from utils_code.main_function import run_everything
 
 
 np.random.seed(1)
@@ -20,16 +20,14 @@ np.random.seed(1)
 ## Some settings
 list_nctrs = [int(np.round(nr)) for nr in np.logspace(np.log(10) / np.log(10), np.log(1000) / np.log(10), 10)]
 
-path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/'))
+path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/')) + '/'
 
 
 flag_gaussian = False
 n_reruns = 5
 
-list_datasets = ['fried', 'sarcos', 'protein', 'ct', 'diamonds',
-                 'stock', 'kegg_undir_uci', 'online_video',
-                 'wecs', 'mlr_knn_rng', 'query_agg_count',
-                 'sgemm', 'road_network', 'methane', 'poker'] #, 'susy', 'higgs']
+list_datasets = ['fried', 'sarcos', 'ct', 'diamonds', 'stock', 'kegg_undir_uci', 'online_video',
+                 'wecs', 'mlr_knn_rng', 'query_agg_count', 'sgemm', 'road_network']
 
 
 ## Loop over all datasets
@@ -44,7 +42,7 @@ for idx_dataset, name_dataset in enumerate(list_datasets):
     list_A_start = [np.eye(X.shape[1]) for _ in range(n_reruns)]
 
     # Go only for a fixed set of indices - do not aim at error bars plot!
-    list_idx = [1, 2, 3, 4]
+    list_idx = [0, 1, 2, 3, 4]
 
     ## Loop to compute and store all initial and final matrices for the different nfolds optimizations
     dic_results = {}
@@ -60,7 +58,7 @@ for idx_dataset, name_dataset in enumerate(list_datasets):
         ## Run everything for 2L
         for nfold in list_nfolds:
 
-            A_start, A_optimized, model, model_vkoga1, model_vkoga2, data, _, array_test_rmse_deep = run_everything(
+            A_start, A_optimized, model, model_vkoga1, model_vkoga2, data, _, array_test_rmse_deep, _ = run_everything(
                 name_dataset,
                 hyperparameter.maxIter_vkoga, hyperparameter.N_points,
                 hyperparameter.noise_level, hyperparameter.reg_para_optim, hyperparameter.reg_para_vkoga,
diff --git a/section_4.3_visualize.py b/section_4.3_visualize.py
index 3743e45..4aa9ac2 100644
--- a/section_4.3_visualize.py
+++ b/section_4.3_visualize.py
@@ -19,12 +19,15 @@ np.random.seed(1)
 ## Some settings
 list_nctrs = [int(np.round(nr)) for nr in np.logspace(np.log(10) / np.log(10), np.log(1000) / np.log(10), 10)]
 
-path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/'))
+path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/')) + '/'
 
 
 ## First, collect results from all files in a common dictionary
 dic_results = {}        # dic_results --> name_dataset --> idx_index --> nfold --> [A_start, A_optimized, model_vkoga1.train_hist['f'], array_test_rmse_deep]
 for idx_file, file in enumerate(os.listdir(path_for_results)):
+    if 'README' in file:
+        continue
+
     idx_index = int(file.split('_')[2])
     name_dataset = ('_'.join(file.split('_')[3:])).split('.')[:-1][0]
     print(name_dataset)
@@ -157,9 +160,9 @@ for idx_dataset, name_dataset in enumerate(list(dic_results.keys())):
     plt.yscale('log')
 
 
-    ## Store in matlab for beautiful tikzfigure plots
-    io.savemat('dic_stabilityalignment_' + name_dataset + '_nfolds.mat', dic_alignment)
-    io.savemat('dic_singvals_' + name_dataset + '_nfolds.mat', dic_singvalsarray)
+    # ## Store in matlab for beautiful tikzfigure plots
+    # io.savemat('dic_stabilityalignment_' + name_dataset + '_nfolds.mat', dic_alignment)
+    # io.savemat('dic_singvals_' + name_dataset + '_nfolds.mat', dic_singvalsarray)
 
     if idx_dataset > 20:
         break
diff --git a/utils_code/dataset_collection.py b/utils_code/dataset_collection.py
index 057e898..99fd48a 100644
--- a/utils_code/dataset_collection.py
+++ b/utils_code/dataset_collection.py
@@ -46,7 +46,7 @@ class Dataset():
     def example_holzmuller(self, ds_name):
         """Data set from David Holzmüller paper of batch active learning."""
 
-        path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/data/'))
+        path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/data/')) + '/'
 
         X = np.load(path + ds_name + '/X.npy')
         y = np.load(path + ds_name + '/y.npy')
diff --git a/utils_code/main_function.py b/utils_code/main_function.py
index 2a2b318..dbc379f 100644
--- a/utils_code/main_function.py
+++ b/utils_code/main_function.py
@@ -19,7 +19,7 @@ from datetime import datetime
 ## Some settings
 list_nctrs = [int(np.round(nr)) for nr in np.logspace(np.log(10) / np.log(10), np.log(1000) / np.log(10), 10)]
 
-path_for_indices = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'data/'))
+path_for_indices = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'data/')) + '/'
 
 
 # Main functiono to run kernel optimization with subsequent vkoga
diff --git a/utils_code/optimized_kernel.py b/utils_code/optimized_kernel.py
index fb3aae3..1d9bfa4 100644
--- a/utils_code/optimized_kernel.py
+++ b/utils_code/optimized_kernel.py
@@ -1,6 +1,6 @@
 from torch import nn
 import torch
-from utils.cv_rippa_ext import compute_cv_loss_via_rippa_ext_2
+from utils_code.cv_rippa_ext import compute_cv_loss_via_rippa_ext_2
 import numpy as np
 
 
diff --git a/utils_data/custom_paths.py b/utils_data/custom_paths.py
index 295b548..6c228b5 100644
--- a/utils_data/custom_paths.py
+++ b/utils_data/custom_paths.py
@@ -3,7 +3,7 @@ import os
 # This file allows to configure where to save data, results, plots etc.
 class CustomPaths:
     # path where downloaded data sets will be saved
-    data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/'))
+    data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/')) + '/'
 
     # path where benchmark results will be saved
     results_path = 'results'
-- 
GitLab