Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions config/opal_config_asm.m4
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ dnl
dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
dnl University Research and Technology
dnl Corporation. All rights reserved.
dnl Copyright (c) 2004-2018 The University of Tennessee and The University
dnl Copyright (c) 2004-2020 The University of Tennessee and The University
dnl of Tennessee Research Foundation. All rights
dnl reserved.
dnl Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
Expand Down Expand Up @@ -1245,7 +1245,7 @@ AC_MSG_ERROR([Can not continue.])

# Check for RDTSCP support
result=0
AS_IF([test "$opal_cv_asm_arch" = "OPAL_X86_64" || test "$opal_cv_asm_arch" = "OPAL_IA32"],
AS_IF([test "$opal_cv_asm_arch" = "X86_64" || test "$opal_cv_asm_arch" = "IA32"],
[AC_MSG_CHECKING([for RDTSCP assembly support])
AC_LANG_PUSH([C])
AC_TRY_RUN([[
Expand Down
101 changes: 101 additions & 0 deletions ompi/mca/op/avx/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#
# Copyright (c) 2019-2020 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2020 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# This component provide support for the Advanced Vector Extensions (AVX)
# available in recent versions of x86 processors.
#
# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent
# for more details on how to make Open MPI components.

# First, list all .h and .c sources. It is necessary to list all .h
# files so that they will be picked up in the distribution tarball.

sources = op_avx_component.c op_avx.h
sources_extended = op_avx_functions.c

# Open MPI components can be compiled two ways:
#
# 1. As a standalone dynamic shared object (DSO), sometimes called a
# dynamically loadable library (DLL).
#
# 2. As a static library that is slurped up into the upper-level
# libmpi library (regardless of whether libmpi is a static or dynamic
# library). This is called a "Libtool convenience library".
#
# The component needs to create an output library in this top-level
# component directory, and named either mca_<type>_<name>.la (for DSO
# builds) or libmca_<type>_<name>.la (for static builds). The OMPI
# build system will have set the
# MCA_BUILD_ompi_<framework>_<component>_DSO AM_CONDITIONAL to indicate
# which way this component should be built.

# We need to support all processors from early AVX to full AVX512 support, based on
# a decision made at runtime. So, we generate all combinations of capabilities, and
# we will select the most suitable (based on the processor flags) during the
# component initialization.
specialized_op_libs =
if MCA_BUILD_ompi_op_has_avx_support
specialized_op_libs += liblocal_ops_avx.la
liblocal_ops_avx_la_SOURCES = $(sources_extended)
liblocal_ops_avx_la_CFLAGS = @MCA_BUILD_OP_AVX_FLAGS@
liblocal_ops_avx_la_CPPFLAGS = -DGENERATE_AVX_CODE
if MCA_BUILD_ompi_op_has_sse3_support
liblocal_ops_avx_la_CPPFLAGS += -DGENERATE_SSE3_CODE
endif
if MCA_BUILD_ompi_op_has_sse41_support
liblocal_ops_avx_la_CPPFLAGS += -DGENERATE_SSE41_CODE
endif
endif
if MCA_BUILD_ompi_op_has_avx2_support
specialized_op_libs += liblocal_ops_avx2.la
liblocal_ops_avx2_la_SOURCES = $(sources_extended)
liblocal_ops_avx2_la_CFLAGS = @MCA_BUILD_OP_AVX2_FLAGS@
liblocal_ops_avx2_la_CPPFLAGS = -DGENERATE_SSE3_CODE -DGENERATE_SSE41_CODE -DGENERATE_AVX_CODE -DGENERATE_AVX2_CODE
endif
if MCA_BUILD_ompi_op_has_avx512_support
specialized_op_libs += liblocal_ops_avx512.la
liblocal_ops_avx512_la_SOURCES = $(sources_extended)
liblocal_ops_avx512_la_CFLAGS = @MCA_BUILD_OP_AVX512_FLAGS@
liblocal_ops_avx512_la_CPPFLAGS = -DGENERATE_SSE3_CODE -DGENERATE_SSE41_CODE -DGENERATE_AVX_CODE -DGENERATE_AVX2_CODE -DGENERATE_AVX512_CODE
endif

component_noinst = $(specialized_op_libs)
if MCA_BUILD_ompi_op_avx_DSO
component_install = mca_op_avx.la
else
component_install =
component_noinst += libmca_op_avx.la
endif

# Specific information for DSO builds.
#
# The DSO should install itself in $(ompilibdir) (by default,
# $prefix/lib/openmpi).

mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_op_avx_la_SOURCES = $(sources)
mca_op_avx_la_LIBADD = $(specialized_op_libs)
mca_op_avx_la_LDFLAGS = -module -avoid-version


# Specific information for static builds.
#
# Note that we *must* "noinst"; the upper-layer Makefile.am's will
# slurp in the resulting .la library into libmpi.

noinst_LTLIBRARIES = $(component_noinst)
libmca_op_avx_la_SOURCES = $(sources)
libmca_op_avx_la_LIBADD = $(specialized_op_libs)
libmca_op_avx_la_LDFLAGS = -module -avoid-version

265 changes: 265 additions & 0 deletions ompi/mca/op/avx/configure.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
# -*- shell-script -*-
#
# Copyright (c) 2019-2020 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2020 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# MCA_ompi_op_avx_CONFIG([action-if-can-compile],
# [action-if-cant-compile])
# ------------------------------------------------
# We can always build, unless we were explicitly disabled.
AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
AC_CONFIG_FILES([ompi/mca/op/avx/Makefile])

MCA_BUILD_OP_AVX_FLAGS=""
MCA_BUILD_OP_AVX2_FLAGS=""
MCA_BUILD_OP_AVX512_FLAGS=""
op_sse3_support=0
op_sse41_support=0
op_avx_support=0
op_avx2_support=0
op_avx512_support=0
OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save])

AS_IF([test "$opal_cv_asm_arch" = "X86_64"],
[AC_LANG_PUSH([C])

#
# Check for AVX512 support
#
AC_MSG_CHECKING([for AVX512 support (no additional flags)])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m512 vA, vB;
_mm512_add_ps(vA, vB)
]])],
[op_avx512_support=1
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])

AS_IF([test $op_avx512_support -eq 0],
[AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)])
op_avx_cflags_save="$CFLAGS"
CFLAGS="$CFLAGS -march=skylake-avx512"
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m512 vA, vB;
_mm512_add_ps(vA, vB)
]])],
[op_avx512_support=1
MCA_BUILD_OP_AVX512_FLAGS="-march=skylake-avx512"
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
CFLAGS="$op_avx_cflags_save"
])
#
# Some combination of gcc and older as would not correctly build the code generated by
# _mm256_loadu_si256. Screen them out.
#
AS_IF([test $op_avx512_support -eq 1],
[AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled])
op_avx_cflags_save="$CFLAGS"
CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
int A[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
__m512i vA = _mm512_loadu_si512((__m512i*)&(A[1]))
]])],
[AC_MSG_RESULT([yes])],
[op_avx512_support=0
MCA_BUILD_OP_AVX512_FLAGS=""
AC_MSG_RESULT([no])])
CFLAGS="$op_avx_cflags_save"
])
#
# Check support for AVX2
#
AC_MSG_CHECKING([for AVX2 support (no additional flags)])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m256 vA, vB;
_mm256_add_ps(vA, vB)
]])],
[op_avx2_support=1
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
AS_IF([test $op_avx2_support -eq 0],
[AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
op_avx_cflags_save="$CFLAGS"
CFLAGS="$CFLAGS -mavx2"
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m256 vA, vB;
_mm256_add_ps(vA, vB)
]])],
[op_avx2_support=1
MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
CFLAGS="$op_avx_cflags_save"
])
#
# Some combination of gcc and older as would not correctly build the code generated by
# _mm256_loadu_si256. Screen them out.
#
AS_IF([test $op_avx2_support -eq 1],
[AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled])
op_avx_cflags_save="$CFLAGS"
CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS"
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
int A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
__m256i vA = _mm256_loadu_si256((__m256i*)&A)
]])],
[AC_MSG_RESULT([yes])],
[op_avx2_support=0
MCA_BUILD_OP_AVX2_FLAGS=""
AC_MSG_RESULT([no])])
CFLAGS="$op_avx_cflags_save"
])
#
# What about early AVX support. The rest of the logic is slightly different as
# we need to include some of the SSE4.1 and SSE3 instructions. So, we first check
# if we can compile AVX code without a flag, then we validate that we have support
# for the SSE4.1 and SSE3 instructions we need. If not, we check for the usage of
# the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3
# instructions.
#
AC_MSG_CHECKING([for AVX support (no additional flags)])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m128 vA, vB;
_mm_add_ps(vA, vB)
]])],
[op_avx_support=1
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
#
# Check for SSE4.1 support
#
AS_IF([test $op_avx_support -eq 1],
[AC_MSG_CHECKING([for SSE4.1 support])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m128i vA, vB;
(void)_mm_max_epi8(vA, vB)
]])],
[op_sse41_support=1
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
])
#
# Check for SSE3 support
#
AS_IF([test $op_avx_support -eq 1],
[AC_MSG_CHECKING([for SSE3 support])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
int A[4] = {0, 1, 2, 3};
__m128i vA = _mm_lddqu_si128((__m128i*)&A)
]])],
[op_sse3_support=1
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
])
# Second pass, do we need to add the AVX flag ?
AS_IF([test $op_avx_support -eq 0 || test $op_sse41_support -eq 0 || test $op_sse3_support -eq 0],
[AC_MSG_CHECKING([for AVX support (with -mavx)])
op_avx_cflags_save="$CFLAGS"
CFLAGS="$CFLAGS -mavx"
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m128 vA, vB;
_mm_add_ps(vA, vB)
]])],
[op_avx_support=1
MCA_BUILD_OP_AVX_FLAGS="-mavx"
op_sse41_support=0
op_sse3_support=0
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])

AS_IF([test $op_sse41_support -eq 0],
[AC_MSG_CHECKING([for SSE4.1 support])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
__m128i vA, vB;
(void)_mm_max_epi8(vA, vB)
]])],
[op_sse41_support=1
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
])
AS_IF([test $op_sse3_support -eq 0],
[AC_MSG_CHECKING([for SSE3 support])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
[[
int A[4] = {0, 1, 2, 3};
__m128i vA = _mm_lddqu_si128((__m128i*)&A)
]])],
[op_sse3_support=1
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
])
CFLAGS="$op_avx_cflags_save"
])

AC_LANG_POP([C])
])
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX512],
[$op_avx512_support],
[AVX512 supported in the current build])
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX2],
[$op_avx2_support],
[AVX2 supported in the current build])
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX],
[$op_avx_support],
[AVX supported in the current build])
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_SSE41],
[$op_sse41_support],
[SSE4.1 supported in the current build])
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_SSE3],
[$op_sse3_support],
[SSE3 supported in the current build])
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_avx512_support],
[test "$op_avx512_support" == "1"])
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_avx2_support],
[test "$op_avx2_support" == "1"])
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_avx_support],
[test "$op_avx_support" == "1"])
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_sse41_support],
[test "$op_sse41_support" == "1"])
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_sse3_support],
[test "$op_sse3_support" == "1"])
AC_SUBST(MCA_BUILD_OP_AVX512_FLAGS)
AC_SUBST(MCA_BUILD_OP_AVX2_FLAGS)
AC_SUBST(MCA_BUILD_OP_AVX_FLAGS)

OPAL_VAR_SCOPE_POP
# Enable this component iff we have at least the most basic form of support
# for vectorial ISA
AS_IF([test $op_avx_support -eq 1 || test $op_avx2_support -eq 1 || test $op_avx512_support -eq 1],
[$1],
[$2])

])dnl
Loading