Commit d66afd0c authored by Tizian Wenzel

Final updates.

parent 990c1c06
openml==0.13.1
scipy==1.7.3
scikit-learn==1.0.2
requests==2.31.0
pandas==1.3.5
mat4py==0.5.0
dill==0.3.6
matplotlib==3.5.3
@@ -4,9 +4,8 @@
 # on the unit cube" to produce the plots within Figure 2.
 import torch
-from utils.main_function import run_everything, run_cross_validation
-from utils.hyperparameters import dic_hyperparams
+from utils_code.main_function import run_everything, run_cross_validation
+from utils_code.hyperparameters import dic_hyperparams
 import numpy as np
 from matplotlib import pyplot as plt
@@ -50,7 +49,7 @@ array_eps, array_cv_f, array_cv_f_val, _, list_timings_1L = run_cross_validation
 ## Store in matlab for beautiful tikzfigure plots
-path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/'))
+path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/')) + '/'
 os.makedirs(path_for_results, exist_ok=True)
 io.savemat(path_for_results + name_dataset + '.mat',
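Note on the recurring `+ '/'` change: downstream paths are built by plain string concatenation (`path_for_results + name_dataset + '.mat'`), so without the trailing separator the file name would be glued directly onto the directory name. A minimal sketch of the same construction done separator-free with `os.path.join` (variable names mirror the diff; the surrounding script context is assumed):

import os

# Stand-ins for the script's variables (assumed, for illustration only).
base_dir = os.path.dirname(os.path.abspath(__file__))
path_for_results = os.path.join(base_dir, 'results_5reruns')
name_dataset = 'fried'

# os.path.join inserts the separator itself, so no trailing '/' bookkeeping:
target_file = os.path.join(path_for_results, name_dataset + '.mat')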
@@ -15,10 +15,8 @@ import numpy as np
 np.random.seed(1)
-list_datasets = ['fried', 'sarcos', 'protein', 'ct', 'diamonds',
-                 'stock', 'kegg_undir_uci', 'online_video',
-                 'wecs', 'mlr_knn_rng', 'query_agg_count',
-                 'sgemm', 'road_network', 'methane', 'poker'] #, 'susy', 'higgs']
+list_datasets = ['fried', 'sarcos', 'ct', 'diamonds', 'stock', 'kegg_undir_uci', 'online_video',
+                 'wecs', 'mlr_knn_rng', 'query_agg_count', 'sgemm', 'road_network']
 ## Loop over reruns and datasets
@@ -55,7 +53,7 @@ for idx_indices in [0, 1, 2, 3, 4]:
 ## Store in matlab for beautiful tikzfigure plots
-path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/'))
+path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/')) + '/'
 os.makedirs(path_for_results, exist_ok=True)
 io.savemat(path_for_results + name_dataset + '_{}'.format(idx_indices) + '.mat',
@@ -9,19 +9,17 @@ from matplotlib import pyplot as plt
 from scipy import io
 import os
 import scipy
 import numpy as np
 import matplotlib
 np.random.seed(1)
 ## Some settings
-list_datasets = ['fried', 'sarcos', 'protein', 'ct', 'diamonds',
-                 'stock', 'kegg_undir_uci', 'online_video',
-                 'wecs', 'mlr_knn_rng', 'query_agg_count',
-                 'sgemm', 'road_network', 'methane', 'poker'] #, 'susy', 'higgs']
+list_datasets = ['fried', 'sarcos', 'ct', 'diamonds', 'stock', 'kegg_undir_uci', 'online_video',
+                 'wecs', 'mlr_knn_rng', 'query_agg_count', 'sgemm', 'road_network']
-path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'results_5reruns/'))
+path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_5reruns/')) + '/'
 ## Initialize dictionaries to store several quantities
@@ -82,11 +80,12 @@ indices_sorted = np.argsort(array_ratio)
 ## Print the calculated ratio: this shows when 2L is superior
 for idx_sorted in indices_sorted:
     print('{:20}'.format(list_datasets[idx_sorted]), np.round(array_ratio[idx_sorted], 5))
 indices_sorted = indices_sorted[:-3]  # cut off last three datasets
 ## Visualization of the ratio of singular values compared to the sum of all singular values
-R = np.linspace(0, 1, int(1.0*len(list_datasets)))
+matplotlib.use('TKAgg')
+R = np.linspace(0, 1, int(1.5*len(list_datasets)))
 array_color = plt.cm.hsv(R)
 dic_to_store = {}
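A plausible reading of the new 1.5 factor (an inference, not stated in the commit): `plt.cm.hsv` is a cyclic colormap, so colors sampled at 0 and 1 are both red; sampling a grid 1.5 times as long and using only the first `len(list_datasets)` entries keeps the first and last dataset visually distinct. A standalone sketch:

import numpy as np
from matplotlib import pyplot as plt

n_datasets = 12
# hsv wraps around: hsv(0.0) and hsv(1.0) are (nearly) the same red.
R = np.linspace(0, 1, int(1.5 * n_datasets))
array_color = plt.cm.hsv(R)       # shape (18, 4): one RGBA row per sample
print(array_color[:n_datasets])   # only the first 12 colors would get used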
@@ -106,7 +105,7 @@ for idx_sorted in indices_sorted[::-1]:
     dic_to_store[name_dataset] = array
 plt.legend(list_legend)
 plt.draw()
 ## Store in matlab for beautiful tikzfigure plots
 # io.savemat('dic_singular_value_ratio.mat', dic_to_store)
@@ -9,9 +9,9 @@ import numpy as np
 from scipy.stats import ortho_group  # Requires version 0.18 of scipy
 import pickle
-from utils.dataset_collection import Dataset
-from utils.hyperparameters import dic_hyperparams
-from utils.main_function import run_everything
+from utils_code.dataset_collection import Dataset
+from utils_code.hyperparameters import dic_hyperparams
+from utils_code.main_function import run_everything
 np.random.seed(1)
@@ -20,16 +20,14 @@ np.random.seed(1)
 ## Some settings
 list_nctrs = [int(np.round(nr)) for nr in np.logspace(np.log(10) / np.log(10), np.log(1000) / np.log(10), 10)]
-path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/'))
+path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/')) + '/'
 flag_gaussian = False
 n_reruns = 5
-list_datasets = ['fried', 'sarcos', 'protein', 'ct', 'diamonds',
-                 'stock', 'kegg_undir_uci', 'online_video',
-                 'wecs', 'mlr_knn_rng', 'query_agg_count',
-                 'sgemm', 'road_network', 'methane', 'poker'] #, 'susy', 'higgs']
+list_datasets = ['fried', 'sarcos', 'ct', 'diamonds', 'stock', 'kegg_undir_uci', 'online_video',
+                 'wecs', 'mlr_knn_rng', 'query_agg_count', 'sgemm', 'road_network']
 ## Loop over all datasets
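An aside on the unchanged `list_nctrs` line: `np.log(x) / np.log(10)` is just the base-10 logarithm, so the expression reduces to `np.logspace(1, 3, 10)`, i.e. ten center counts spaced log-uniformly between 10 and 1000. A standalone check:

import numpy as np

# np.log(10)/np.log(10) == 1 and np.log(1000)/np.log(10) == 3,
# so this matches the expression used in the scripts.
list_nctrs = [int(np.round(nr)) for nr in np.logspace(1, 3, 10)]
print(list_nctrs)  # [10, 17, 28, 46, 77, 129, 215, 359, 599, 1000]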
@@ -44,7 +42,7 @@ for idx_dataset, name_dataset in enumerate(list_datasets):
     list_A_start = [np.eye(X.shape[1]) for _ in range(n_reruns)]
     # Go only for a fixed set of indices - do not aim at error bars plot!
-    list_idx = [1, 2, 3, 4]
+    list_idx = [0, 1, 2, 3, 4]
     ## Loop to compute and store all initial and final matrices for the different nfolds optimizations
     dic_results = {}
@@ -60,7 +58,7 @@ for idx_dataset, name_dataset in enumerate(list_datasets):
     ## Run everything for 2L
     for nfold in list_nfolds:
-        A_start, A_optimized, model, model_vkoga1, model_vkoga2, data, _, array_test_rmse_deep = run_everything(
+        A_start, A_optimized, model, model_vkoga1, model_vkoga2, data, _, array_test_rmse_deep, _ = run_everything(
             name_dataset,
             hyperparameter.maxIter_vkoga, hyperparameter.N_points,
             hyperparameter.noise_level, hyperparameter.reg_para_optim, hyperparameter.reg_para_vkoga,
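The only change in this hunk is the unpacking: `run_everything` now returns one additional value, discarded here with a trailing `_`. A minimal stand-in showing the pattern (the stub below is illustrative only; the real return values are those named in the diff):

def run_everything_stub():
    # Stand-in that returns nine values, like the updated call expects.
    return tuple(range(9))

# The trailing '_' swallows the newly added ninth return value.
A_start, A_optimized, model, vkoga1, vkoga2, data, _, rmse_deep, _ = run_everything_stub()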
@@ -19,12 +19,15 @@ np.random.seed(1)
 ## Some settings
 list_nctrs = [int(np.round(nr)) for nr in np.logspace(np.log(10) / np.log(10), np.log(1000) / np.log(10), 10)]
-path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/'))
+path_for_results = os.path.abspath(os.path.join(os.path.dirname(__file__), 'results_stability/')) + '/'
 ## First, collect results from all files in a common dictionary
 dic_results = {}  # dic_results --> name_dataset --> idx_index --> nfold --> [A_start, A_optimized, model_vkoga1.train_hist['f'], array_test_rmse_deep]
 for idx_file, file in enumerate(os.listdir(path_for_results)):
+    if 'README' in file:
+        continue
     idx_index = int(file.split('_')[2])
     name_dataset = ('_'.join(file.split('_')[3:])).split('.')[:-1][0]
     print(name_dataset)
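For readability, here is how that parsing behaves on a hypothetical result file name (the `dic_results_` prefix and `.pkl` suffix are assumptions; only the "two tokens, then rerun index, then dataset name" shape matters):

file = 'dic_results_0_kegg_undir_uci.pkl'  # hypothetical file name

idx_index = int(file.split('_')[2])        # tokens ['dic', 'results', '0', ...] -> 0
name_dataset = ('_'.join(file.split('_')[3:])).split('.')[:-1][0]
print(idx_index, name_dataset)             # 0 kegg_undir_uci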
@@ -157,9 +160,9 @@ for idx_dataset, name_dataset in enumerate(list(dic_results.keys())):
     plt.yscale('log')
-    ## Store in matlab for beautiful tikzfigure plots
-    io.savemat('dic_stabilityalignment_' + name_dataset + '_nfolds.mat', dic_alignment)
-    io.savemat('dic_singvals_' + name_dataset + '_nfolds.mat', dic_singvalsarray)
+    # ## Store in matlab for beautiful tikzfigure plots
+    # io.savemat('dic_stabilityalignment_' + name_dataset + '_nfolds.mat', dic_alignment)
+    # io.savemat('dic_singvals_' + name_dataset + '_nfolds.mat', dic_singvalsarray)
     if idx_dataset > 20:
         break
@@ -46,7 +46,7 @@ class Dataset():
     def example_holzmuller(self, ds_name):
         """Data set from David Holzmüller's paper on batch active learning."""
-        path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/data/'))
+        path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/data/')) + '/'
         X = np.load(path + ds_name + '/X.npy')
         y = np.load(path + ds_name + '/y.npy')
@@ -19,7 +19,7 @@ from datetime import datetime
 ## Some settings
 list_nctrs = [int(np.round(nr)) for nr in np.logspace(np.log(10) / np.log(10), np.log(1000) / np.log(10), 10)]
-path_for_indices = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/'))
+path_for_indices = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/')) + '/'
 # Main function to run kernel optimization with subsequent vkoga
 from torch import nn
 import torch
-from utils.cv_rippa_ext import compute_cv_loss_via_rippa_ext_2
+from utils_code.cv_rippa_ext import compute_cv_loss_via_rippa_ext_2
 import numpy as np
@@ -3,7 +3,7 @@ import os
 # This file allows configuring where to save data, results, plots, etc.
 class CustomPaths:
     # path where downloaded data sets will be saved
-    data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/'))
+    data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data/')) + '/'
     # path where benchmark results will be saved
     results_path = 'results'