Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions .github/workflows/pip.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,11 @@ jobs:
kh-samples sklearn -i khiops_classifier -e

# Test that the line containing "MPI command" also contains
# "mpiexec", which means that `mpiexec` has been found
kh-status | grep "MPI command" | grep -wq mpiexec
# an executable name under a /bin directory
# Note: this executable name can be different, depending on the MPI
# backend and OS; for instance, "orterun" for OpenMPI on Ubuntu Linux, but
# "mpiexec" for OpenMPI on Rocky Linux
kh-status | grep "MPI command" | grep -Ewq "(/.+?)/bin/.+"
Comment on lines +96 to +100
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This check is weaker than whaty is stated.

Maybe we should check grep -Ewq "orterun|mpirun|mpiexec" ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, I am not sure: what if the executable is called something else (e.g. mpiexec.hydra for MPICH, whence we would also add this, or rather mpiexec\.?.? etc)?

It soon becomes brittle IMHO. In the current state, we just check that the MPI command points to something that is inside a bin directory. The tests that mpiexec is found are elsewhere (in the unit-tests workflow).

So, I would keep it this way; the comments contain "for instance" and this is key IMHO.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. I think though the first part should be (.+), and not (.+?)

Copy link
Collaborator Author

@popescu-v popescu-v Jul 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to accommodate the case when the path starts with /bin (this is not excluded in principle I guess).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok

release:
if: github.ref_type == 'tag'
needs: [build, test]
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ jobs:
OMPI_MCA_rmaps_base_oversubscribe: true
PRTE_MCA_rmaps_default_mapping_policy: :oversubscribe
run: |
# Make sure '/bin' is before '/usr/bin' in PATH
PATH=$(echo "/bin:"$PATH | sed 's#:/bin##')

# This is needed so that the Git tag is parsed and the khiops-python
# version is retrieved
git config --global --add safe.directory $(realpath .)
Expand Down Expand Up @@ -177,6 +180,9 @@ jobs:
# Force > 2 CPU cores to launch mpiexec
KHIOPS_PROC_NUMBER: 4
run: |-
# Make sure '/bin' is before '/usr/bin' in PATH
PATH=$(echo "/bin:"$PATH | sed 's#:/bin##')

# Make sure MPI support is not loaded through env modules
# Note: As Docker container's shell is non-interactive, environment
# modules are currently not initializing the shell anyway
Expand Down
15 changes: 11 additions & 4 deletions khiops/core/internals/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1146,8 +1146,12 @@ def _initialize_mpi_command_args(self):
installation_method = _infer_khiops_installation_method()
# In Conda-based, but non-Conda environment, specify mpiexec path
if installation_method == "conda-based":
mpiexec_path = os.environ.get("KHIOPS_MPIEXEC_PATH") or os.path.join(
_infer_env_bin_dir_for_conda_based_installations(), "mpiexec"
# Python `os.path.realpath` resolves symlinks recursively, like GNU
# `readlink -f`; Python `os.readlink` does not
mpiexec_path = os.environ.get("KHIOPS_MPIEXEC_PATH") or os.path.realpath(
os.path.join(
_infer_env_bin_dir_for_conda_based_installations(), "mpiexec"
)
)
if platform.system() == "Windows" and not os.path.splitext(mpiexec_path):
mpiexec_path += ".exe"
Expand All @@ -1165,8 +1169,11 @@ def _initialize_mpi_command_args(self):
)
# In Conda or local installations, expect mpiexec in the PATH
else:
mpiexec_path = os.environ.get("KHIOPS_MPIEXEC_PATH") or shutil.which(
"mpiexec"
link_to_mpiexec = shutil.which("mpiexec")
mpiexec_path = (
os.environ.get("KHIOPS_MPIEXEC_PATH")
or link_to_mpiexec
and os.path.realpath(link_to_mpiexec)
)
# If mpiexec is not in the path, and the installation method is local,
# then try to load MPI environment module so that mpiexec is in the path
Expand Down
43 changes: 17 additions & 26 deletions khiops/sklearn/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1441,32 +1441,23 @@ def _write_sparse_block(self, row_index, stream, target=None):
assert target in self.target_column, "'target' must be in the target column"
stream.write(f"{target}\t")
row = self.matrix.getrow(row_index)
# Empty row in the sparse matrix: use the first variable as missing data
# TODO: remove this part once Khiops bug
# https://github.com/KhiopsML/khiops/issues/235 is solved
if row.size == 0:
for variable_index in self.column_ids:
stream.write(f"{variable_index + 1}: ")
break
# Non-empty row in the sparse matrix: get non-missing data
else:
# Variable indices are not always sorted in `row.indices`
# Khiops needs variable indices to be sorted
sorted_indices = np.sort(row.nonzero()[1], axis=-1, kind="mergesort")

# Flatten row for Python < 3.9 scipy.sparse.lil_matrix whose API
# is not homogeneous with other sparse matrices: it stores
# opaque Python lists as elements
# Thus:
# - if isinstance(self.matrix, sp.lil_matrix) and Python 3.8, then
# row.data is np.array([list([...])])
# - else, row.data is np.array([...])
# TODO: remove this flattening once Python 3.8 support is dropped
sorted_data = np.fromiter(self._flatten(row.data), row.data.dtype)[
sorted_indices.argsort()
]
for variable_index, variable_value in zip(sorted_indices, sorted_data):
stream.write(f"{variable_index + 1}:{variable_value} ")
# Variable indices are not always sorted in `row.indices`
# Khiops needs variable indices to be sorted
sorted_indices = np.sort(row.nonzero()[1], axis=-1, kind="mergesort")

# Flatten row for Python < 3.9 scipy.sparse.lil_matrix whose API
# is not homogeneous with other sparse matrices: it stores
# opaque Python lists as elements
# Thus:
# - if isinstance(self.matrix, sp.lil_matrix) and Python 3.8, then
# row.data is np.array([list([...])])
# - else, row.data is np.array([...])
# TODO: remove this flattening once Python 3.8 support is dropped
sorted_data = np.fromiter(self._flatten(row.data), row.data.dtype)[
sorted_indices.argsort()
]
for variable_index, variable_value in zip(sorted_indices, sorted_data):
stream.write(f"{variable_index + 1}:{variable_value} ")
stream.write("\n")

def create_table_file_for_khiops(self, output_dir, sort=True):
Expand Down
16 changes: 9 additions & 7 deletions tests/test_dataset_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,13 +565,15 @@ def _load_khiops_sparse_file(self, stream):
target, features = line.split(b"\t")
feature_row = np.zeros(100)
for feature in features.strip().split(b" "):
feature_index, feature_value = feature.split(b":")
try:
feature_value = float(feature_value)
# missing value, whence empty string
except ValueError:
feature_value = 0.0
feature_row[int(feature_index) - 1] = feature_value
indexed_feature = feature.split(b":")

# Skip missing feature
if len(indexed_feature) < 2:
continue

# Set feature value in row at the specified index
feature_index, feature_value = indexed_feature
feature_row[int(feature_index) - 1] = float(feature_value)
feature_matrix.append(feature_row)
target_vector.append(float(target))
target_array = np.array(target_vector)
Expand Down