diff --git a/.github/workflows/pip.yml b/.github/workflows/pip.yml index 093b4c06..e1d17107 100644 --- a/.github/workflows/pip.yml +++ b/.github/workflows/pip.yml @@ -93,8 +93,11 @@ jobs: kh-samples sklearn -i khiops_classifier -e # Test that the line containing "MPI command" also contains - # "mpiexec", which means that `mpiexec` has been found - kh-status | grep "MPI command" | grep -wq mpiexec + # an executable name under a /bin directory + # Note: this executable name can be different, depending on the MPI + # backend and OS; for instance, "orterun" for OpenMPI on Ubuntu Linux, but + # "mpiexec" for OpenMPI on Rocky Linux + kh-status | grep "MPI command" | grep -Ewq "(/.+?)/bin/.+" release: if: github.ref_type == 'tag' needs: [build, test] diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index a4341b25..cc9a2382 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -99,6 +99,9 @@ jobs: OMPI_MCA_rmaps_base_oversubscribe: true PRTE_MCA_rmaps_default_mapping_policy: :oversubscribe run: | + # Make sure '/bin' is before '/usr/bin' in PATH + PATH=$(echo "/bin:"$PATH | sed 's#:/bin##') + # This is needed so that the Git tag is parsed and the khiops-python # version is retrieved git config --global --add safe.directory $(realpath .) @@ -177,6 +180,9 @@ jobs: # Force > 2 CPU cores to launch mpiexec KHIOPS_PROC_NUMBER: 4 run: |- + # Make sure '/bin' is before '/usr/bin' in PATH + PATH=$(echo "/bin:"$PATH | sed 's#:/bin##') + # Make sure MPI support is not loaded through env modules # Note: As Docker container's shell is non-interactive, environment # modules are currently not initializing the shell anyway diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py index d8681a00..28cb30e7 100644 --- a/khiops/core/internals/runner.py +++ b/khiops/core/internals/runner.py @@ -1146,8 +1146,12 @@ def _initialize_mpi_command_args(self): installation_method = _infer_khiops_installation_method() # In Conda-based, but non-Conda environment, specify mpiexec path if installation_method == "conda-based": - mpiexec_path = os.environ.get("KHIOPS_MPIEXEC_PATH") or os.path.join( - _infer_env_bin_dir_for_conda_based_installations(), "mpiexec" + # Python `os.path.realpath` resolves symlinks recursively, like GNU + # `readlink -f`; Python `os.readlink` does not + mpiexec_path = os.environ.get("KHIOPS_MPIEXEC_PATH") or os.path.realpath( + os.path.join( + _infer_env_bin_dir_for_conda_based_installations(), "mpiexec" + ) ) if platform.system() == "Windows" and not os.path.splitext(mpiexec_path): mpiexec_path += ".exe" @@ -1165,8 +1169,11 @@ def _initialize_mpi_command_args(self): ) # In Conda or local installations, expect mpiexec in the PATH else: - mpiexec_path = os.environ.get("KHIOPS_MPIEXEC_PATH") or shutil.which( - "mpiexec" + link_to_mpiexec = shutil.which("mpiexec") + mpiexec_path = ( + os.environ.get("KHIOPS_MPIEXEC_PATH") + or link_to_mpiexec + and os.path.realpath(link_to_mpiexec) ) # If mpiexec is not in the path, and the installation method is local, # then try to load MPI environment module so that mpiexec is in the path diff --git a/khiops/sklearn/tables.py b/khiops/sklearn/tables.py index 331d1a02..b4709671 100644 --- a/khiops/sklearn/tables.py +++ b/khiops/sklearn/tables.py @@ -1441,32 +1441,23 @@ def _write_sparse_block(self, row_index, stream, target=None): assert target in self.target_column, "'target' must be in the target column" stream.write(f"{target}\t") row = self.matrix.getrow(row_index) - # Empty row in the sparse matrix: use the first variable as missing data - # TODO: remove this part once Khiops bug - # https://github.com/KhiopsML/khiops/issues/235 is solved - if row.size == 0: - for variable_index in self.column_ids: - stream.write(f"{variable_index + 1}: ") - break - # Non-empty row in the sparse matrix: get non-missing data - else: - # Variable indices are not always sorted in `row.indices` - # Khiops needs variable indices to be sorted - sorted_indices = np.sort(row.nonzero()[1], axis=-1, kind="mergesort") - - # Flatten row for Python < 3.9 scipy.sparse.lil_matrix whose API - # is not homogeneous with other sparse matrices: it stores - # opaque Python lists as elements - # Thus: - # - if isinstance(self.matrix, sp.lil_matrix) and Python 3.8, then - # row.data is np.array([list([...])]) - # - else, row.data is np.array([...]) - # TODO: remove this flattening once Python 3.8 support is dropped - sorted_data = np.fromiter(self._flatten(row.data), row.data.dtype)[ - sorted_indices.argsort() - ] - for variable_index, variable_value in zip(sorted_indices, sorted_data): - stream.write(f"{variable_index + 1}:{variable_value} ") + # Variable indices are not always sorted in `row.indices` + # Khiops needs variable indices to be sorted + sorted_indices = np.sort(row.nonzero()[1], axis=-1, kind="mergesort") + + # Flatten row for Python < 3.9 scipy.sparse.lil_matrix whose API + # is not homogeneous with other sparse matrices: it stores + # opaque Python lists as elements + # Thus: + # - if isinstance(self.matrix, sp.lil_matrix) and Python 3.8, then + # row.data is np.array([list([...])]) + # - else, row.data is np.array([...]) + # TODO: remove this flattening once Python 3.8 support is dropped + sorted_data = np.fromiter(self._flatten(row.data), row.data.dtype)[ + sorted_indices.argsort() + ] + for variable_index, variable_value in zip(sorted_indices, sorted_data): + stream.write(f"{variable_index + 1}:{variable_value} ") stream.write("\n") def create_table_file_for_khiops(self, output_dir, sort=True): diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py index 9fff03cb..3a0d417f 100644 --- a/tests/test_dataset_class.py +++ b/tests/test_dataset_class.py @@ -565,13 +565,15 @@ def _load_khiops_sparse_file(self, stream): target, features = line.split(b"\t") feature_row = np.zeros(100) for feature in features.strip().split(b" "): - feature_index, feature_value = feature.split(b":") - try: - feature_value = float(feature_value) - # missing value, whence empty string - except ValueError: - feature_value = 0.0 - feature_row[int(feature_index) - 1] = feature_value + indexed_feature = feature.split(b":") + + # Skip missing feature + if len(indexed_feature) < 2: + continue + + # Set feature value in row at the specified index + feature_index, feature_value = indexed_feature + feature_row[int(feature_index) - 1] = float(feature_value) feature_matrix.append(feature_row) target_vector.append(float(target)) target_array = np.array(target_vector)