From 39490a59cdb0ab6dd827079397fba3abcb89297f Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Wed, 31 Jul 2024 14:46:23 +0200 Subject: [PATCH 1/3] Fully support missing sparse data Khiops now supports it, following PR https://github.com/KhiopsML/khiops/pull/241 which fixes issue https://github.com/KhiopsML/khiops/issues/235. --- khiops/sklearn/tables.py | 43 +++++++++++++++---------------------- tests/test_dataset_class.py | 16 ++++++++------ 2 files changed, 26 insertions(+), 33 deletions(-) diff --git a/khiops/sklearn/tables.py b/khiops/sklearn/tables.py index 331d1a02..b4709671 100644 --- a/khiops/sklearn/tables.py +++ b/khiops/sklearn/tables.py @@ -1441,32 +1441,23 @@ def _write_sparse_block(self, row_index, stream, target=None): assert target in self.target_column, "'target' must be in the target column" stream.write(f"{target}\t") row = self.matrix.getrow(row_index) - # Empty row in the sparse matrix: use the first variable as missing data - # TODO: remove this part once Khiops bug - # https://github.com/KhiopsML/khiops/issues/235 is solved - if row.size == 0: - for variable_index in self.column_ids: - stream.write(f"{variable_index + 1}: ") - break - # Non-empty row in the sparse matrix: get non-missing data - else: - # Variable indices are not always sorted in `row.indices` - # Khiops needs variable indices to be sorted - sorted_indices = np.sort(row.nonzero()[1], axis=-1, kind="mergesort") - - # Flatten row for Python < 3.9 scipy.sparse.lil_matrix whose API - # is not homogeneous with other sparse matrices: it stores - # opaque Python lists as elements - # Thus: - # - if isinstance(self.matrix, sp.lil_matrix) and Python 3.8, then - # row.data is np.array([list([...])]) - # - else, row.data is np.array([...]) - # TODO: remove this flattening once Python 3.8 support is dropped - sorted_data = np.fromiter(self._flatten(row.data), row.data.dtype)[ - sorted_indices.argsort() - ] - for variable_index, variable_value in zip(sorted_indices, sorted_data): - stream.write(f"{variable_index + 1}:{variable_value} ") + # Variable indices are not always sorted in `row.indices` + # Khiops needs variable indices to be sorted + sorted_indices = np.sort(row.nonzero()[1], axis=-1, kind="mergesort") + + # Flatten row for Python < 3.9 scipy.sparse.lil_matrix whose API + # is not homogeneous with other sparse matrices: it stores + # opaque Python lists as elements + # Thus: + # - if isinstance(self.matrix, sp.lil_matrix) and Python 3.8, then + # row.data is np.array([list([...])]) + # - else, row.data is np.array([...]) + # TODO: remove this flattening once Python 3.8 support is dropped + sorted_data = np.fromiter(self._flatten(row.data), row.data.dtype)[ + sorted_indices.argsort() + ] + for variable_index, variable_value in zip(sorted_indices, sorted_data): + stream.write(f"{variable_index + 1}:{variable_value} ") stream.write("\n") def create_table_file_for_khiops(self, output_dir, sort=True): diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py index 9fff03cb..3a0d417f 100644 --- a/tests/test_dataset_class.py +++ b/tests/test_dataset_class.py @@ -565,13 +565,15 @@ def _load_khiops_sparse_file(self, stream): target, features = line.split(b"\t") feature_row = np.zeros(100) for feature in features.strip().split(b" "): - feature_index, feature_value = feature.split(b":") - try: - feature_value = float(feature_value) - # missing value, whence empty string - except ValueError: - feature_value = 0.0 - feature_row[int(feature_index) - 1] = feature_value + indexed_feature = feature.split(b":") + + # Skip missing feature + if len(indexed_feature) < 2: + continue + + # Set feature value in row at the specified index + feature_index, feature_value = indexed_feature + feature_row[int(feature_index) - 1] = float(feature_value) feature_matrix.append(feature_row) target_vector.append(float(target)) target_array = np.array(target_vector) From 7074dd57b16ae0bd635b8abbad9eb96c907239c2 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Tue, 30 Jul 2024 16:08:37 +0200 Subject: [PATCH 2/3] Add '/bin' before '/usr/bin' in PATH in the CI Thus, mpiexec can be /bin/mpiexec through symlinks related_to #209 --- .github/workflows/pip.yml | 7 +++++-- .github/workflows/unit-tests.yml | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pip.yml b/.github/workflows/pip.yml index 093b4c06..e1d17107 100644 --- a/.github/workflows/pip.yml +++ b/.github/workflows/pip.yml @@ -93,8 +93,11 @@ jobs: kh-samples sklearn -i khiops_classifier -e # Test that the line containing "MPI command" also contains - # "mpiexec", which means that `mpiexec` has been found - kh-status | grep "MPI command" | grep -wq mpiexec + # an executable name under a /bin directory + # Note: this executable name can be different, depending on the MPI + # backend and OS; for instance, "orterun" for OpenMPI on Ubuntu Linux, but + # "mpiexec" for OpenMPI on Rocky Linux + kh-status | grep "MPI command" | grep -Ewq "(/.+?)/bin/.+" release: if: github.ref_type == 'tag' needs: [build, test] diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index a4341b25..cc9a2382 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -99,6 +99,9 @@ jobs: OMPI_MCA_rmaps_base_oversubscribe: true PRTE_MCA_rmaps_default_mapping_policy: :oversubscribe run: | + # Make sure '/bin' is before '/usr/bin' in PATH + PATH=$(echo "/bin:"$PATH | sed 's#:/bin##') + # This is needed so that the Git tag is parsed and the khiops-python # version is retrieved git config --global --add safe.directory $(realpath .) @@ -177,6 +180,9 @@ jobs: # Force > 2 CPU cores to launch mpiexec KHIOPS_PROC_NUMBER: 4 run: |- + # Make sure '/bin' is before '/usr/bin' in PATH + PATH=$(echo "/bin:"$PATH | sed 's#:/bin##') + # Make sure MPI support is not loaded through env modules # Note: As Docker container's shell is non-interactive, environment # modules are currently not initializing the shell anyway From 33586e007fa375bfae74fdddd6e13403378771df Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Tue, 30 Jul 2024 16:20:27 +0200 Subject: [PATCH 3/3] Compute real path to mpiexec Otherwise, in some situations, we get to /bin/mpiexec (which is a symlink to the real path) which makes OpenMPI fail; see: https://github.com/open-mpi/ompi/issues/5613 closes #209 --- khiops/core/internals/runner.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py index d8681a00..28cb30e7 100644 --- a/khiops/core/internals/runner.py +++ b/khiops/core/internals/runner.py @@ -1146,8 +1146,12 @@ def _initialize_mpi_command_args(self): installation_method = _infer_khiops_installation_method() # In Conda-based, but non-Conda environment, specify mpiexec path if installation_method == "conda-based": - mpiexec_path = os.environ.get("KHIOPS_MPIEXEC_PATH") or os.path.join( - _infer_env_bin_dir_for_conda_based_installations(), "mpiexec" + # Python `os.path.realpath` resolves symlinks recursively, like GNU + # `readlink -f`; Python `os.readlink` does not + mpiexec_path = os.environ.get("KHIOPS_MPIEXEC_PATH") or os.path.realpath( + os.path.join( + _infer_env_bin_dir_for_conda_based_installations(), "mpiexec" + ) ) if platform.system() == "Windows" and not os.path.splitext(mpiexec_path): mpiexec_path += ".exe" @@ -1165,8 +1169,11 @@ def _initialize_mpi_command_args(self): ) # In Conda or local installations, expect mpiexec in the PATH else: - mpiexec_path = os.environ.get("KHIOPS_MPIEXEC_PATH") or shutil.which( - "mpiexec" + link_to_mpiexec = shutil.which("mpiexec") + mpiexec_path = ( + os.environ.get("KHIOPS_MPIEXEC_PATH") + or link_to_mpiexec + and os.path.realpath(link_to_mpiexec) ) # If mpiexec is not in the path, and the installation method is local, # then try to load MPI environment module so that mpiexec is in the path