-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[Flang] Add -ffast-real-mod and direct code for MOD on REAL types #160660
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-clang-driver @llvm/pr-subscribers-flang-fir-hlfir Author: Michael Klemm (mjklemm) ChangesThis patch adds direct code-gen support for a faster MOD intrinsic for REAL types. Flang has maintained and keeps maintaining a high-precision implementation of the MOD intrinsic as part of the Fortran runtime. With the -ffast-real-mod flag, users can opt to avoid calling into the Fortran runtime, but instead trigger code-gen that produces faster code by avoiding the runtime call, at the expense of potentially risking bit cancelation by having the compiler use the MOD formula a specified by ISO Fortran. Full diff: https://github.com/llvm/llvm-project/pull/160660.diff 7 Files Affected:
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index a7c514e809aa9..4dc4acd5603cb 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2750,6 +2750,7 @@ def fno_unsafe_math_optimizations : Flag<["-"], "fno-unsafe-math-optimizations">
Group<f_Group>;
def fassociative_math : Flag<["-"], "fassociative-math">, Visibility<[ClangOption, FlangOption]>, Group<f_Group>;
def fno_associative_math : Flag<["-"], "fno-associative-math">, Visibility<[ClangOption, FlangOption]>, Group<f_Group>;
+def ffast_real_mod : Flag<["-"], "ffast-real-mod">, Visibility<[FlangOption, FC1Option]>, Group<f_Group>;
defm reciprocal_math : BoolFOption<"reciprocal-math",
LangOpts<"AllowRecip">, DefaultFalse,
PosFlag<SetTrue, [], [ClangOption, CC1Option, FC1Option, FlangOption],
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 1535f4cebf436..fbaa083d204b8 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -766,6 +766,9 @@ static void addFloatingPointOptions(const Driver &D, const ArgList &Args,
if (ReciprocalMath)
CmdArgs.push_back("-freciprocal-math");
+
+ if (Args.hasArg(options::OPT_ffast_real_mod))
+ CmdArgs.push_back("-ffast-real-mod");
}
static void renderRemarksOptions(const ArgList &Args, ArgStringList &CmdArgs,
diff --git a/flang/include/flang/Support/LangOptions.def b/flang/include/flang/Support/LangOptions.def
index ba72d7b4b7212..e310ecf37a52d 100644
--- a/flang/include/flang/Support/LangOptions.def
+++ b/flang/include/flang/Support/LangOptions.def
@@ -60,7 +60,8 @@ LANGOPT(OpenMPNoThreadState, 1, 0)
LANGOPT(OpenMPNoNestedParallelism, 1, 0)
/// Use SIMD only OpenMP support.
LANGOPT(OpenMPSimd, 1, false)
-
+/// Enable fast MOD operations for REAL
+LANGOPT(FastRealMod, 1, false)
LANGOPT(VScaleMin, 32, 0) ///< Minimum vscale range value
LANGOPT(VScaleMax, 32, 0) ///< Maximum vscale range value
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 6295a58b1bdad..5b3f64971013e 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1424,6 +1424,10 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
opts.setFPContractMode(Fortran::common::LangOptions::FPM_Fast);
}
+ if (args.hasArg(clang::driver::options::OPT_ffast_real_mod)) {
+ opts.FastRealMod = true;
+ }
+
return true;
}
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 3bef6b1c31825..d22124bc0bdeb 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -277,6 +277,14 @@ bool CodeGenAction::beginSourceFileAction() {
ci.getInvocation().getLangOpts().OpenMPVersion);
}
+ if (ci.getInvocation().getLangOpts().FastRealMod) {
+ auto mod = lb.getModule();
+ mod.getOperation()->setAttr(
+ mlir::StringAttr::get(mod.getContext(),
+ llvm::Twine{"fir.fast_real_mod"}),
+ mlir::BoolAttr::get(mod.getContext(), true));
+ }
+
// Create a parse tree and lower it to FIR
parseAndLowerTree(ci, lb);
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index ce1376fd209cc..5e0e4fbf81717 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -7009,8 +7009,30 @@ mlir::Value IntrinsicLibrary::genMergeBits(mlir::Type resultType,
}
// MOD
+static mlir::Value genFastMod(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value a, mlir::Value p) {
+ auto fastmathFlags = mlir::arith::FastMathFlags::contract;
+ auto fastmathAttr =
+ mlir::arith::FastMathFlagsAttr::get(builder.getContext(), fastmathFlags);
+ mlir::Value divResult = mlir::arith::DivFOp::create(builder, loc, a, p, fastmathAttr);
+ mlir::Type intType = builder.getIntegerType(
+ a.getType().getIntOrFloatBitWidth(), /*signed=*/true);
+ mlir::Value intResult = builder.createConvert(loc, intType, divResult);
+ mlir::Value cnvResult = builder.createConvert(loc, a.getType(), intResult);
+ mlir::Value mulResult =
+ mlir::arith::MulFOp::create(builder, loc, cnvResult, p, fastmathAttr);
+ mlir::Value subResult =
+ mlir::arith::SubFOp::create(builder, loc, a, mulResult, fastmathAttr);
+ return subResult;
+}
+
mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
+ auto mod = builder.getModule();
+ bool useFastRealMod = false;
+ if (auto attr = mod->getAttrOfType<mlir::BoolAttr>("fir.fast_real_mod"))
+ useFastRealMod = attr.getValue();
+
assert(args.size() == 2);
if (resultType.isUnsignedInteger()) {
mlir::Type signlessType = mlir::IntegerType::get(
@@ -7022,9 +7044,18 @@ mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
if (mlir::isa<mlir::IntegerType>(resultType))
return mlir::arith::RemSIOp::create(builder, loc, args[0], args[1]);
- // Use runtime.
- return builder.createConvert(
- loc, resultType, fir::runtime::genMod(builder, loc, args[0], args[1]));
+ if (useFastRealMod) {
+ // If fast MOD for REAL has been requested, generate less precise,
+ // but faster code directly.
+ assert(resultType.isFloat() &&
+ "non floating-point type hit for fast real MOD");
+ return builder.createConvert(loc, resultType,
+ genFastMod(builder, loc, args[0], args[1]));
+ } else {
+ // Use runtime.
+ return builder.createConvert(
+ loc, resultType, fir::runtime::genMod(builder, loc, args[0], args[1]));
+ }
}
// MODULO
diff --git a/flang/test/Lower/Intrinsics/fast-real-mod.f90 b/flang/test/Lower/Intrinsics/fast-real-mod.f90
new file mode 100644
index 0000000000000..00607fa5c30d1
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/fast-real-mod.f90
@@ -0,0 +1,75 @@
+! RUN: %flang_fc1 -ffast-real-mod -emit-mlir -o - %s | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
+
+! CHECK: module attributes {{{.*}}fir.fast_real_mod = true{{.*}}}
+
+! CHECK-LABEL: @_QPmod_real4
+subroutine mod_real4(r, a, p)
+ implicit none
+ real(kind=4) :: r, a, p
+! CHECK: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f32
+! CHECK: %[[CV1:.*]] = fir.convert %[[DIV]] : (f32) -> si32
+! CHECK: %[[CV2:.*]] = fir.convert %[[CV1]] : (si32) -> f32
+! CHECK: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f32
+! CHECK: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f32
+! CHECK: fir.store %[[SUB]] to %[[R]] : !fir.ref<f32>
+ r = mod(a, p)
+end subroutine mod_real4
+
+! CHECK-LABEL: @_QPmod_real8
+subroutine mod_real8(r, a, p)
+ implicit none
+ real(kind=8) :: r, a, p
+! CHECK: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f64
+! CHECK: %[[CV1:.*]] = fir.convert %[[DIV]] : (f64) -> si64
+! CHECK: %[[CV2:.*]] = fir.convert %[[CV1]] : (si64) -> f64
+! CHECK: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f64
+! CHECK: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f64
+! CHECK: fir.store %[[SUB]] to %[[R]] : !fir.ref<f64>
+ r = mod(a, p)
+end subroutine mod_real8
+
+! CHECK-LABEL: @_QPmod_real10
+subroutine mod_real10(r, a, p)
+ implicit none
+ real(kind=10) :: r, a, p
+! CHECK-KIND10: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK-KIND10: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK-KIND10: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK-KIND10: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK-KIND10: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK-KIND10: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f80
+! CHECK-KIND10: %[[CV1:.*]] = fir.convert %[[DIV]] : (f80) -> si80
+! CHECK-KIND10: %[[CV2:.*]] = fir.convert %[[CV1]] : (si80) -> f80
+! CHECK-KIND10: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f80
+! CHECK-KIND10: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f80
+! CHECK-KIND10: fir.store %[[SUB]] to %[[R]] : !fir.ref<f80>
+ r = mod(a, p)
+end subroutine mod_real10
+
+! CHECK-LABEL: @_QPmod_real16
+subroutine mod_real16(r, a, p)
+ implicit none
+ real(kind=16) :: r, a, p
+! CHECK-KIND16: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK-KIND16: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK-KIND16: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK-KIND16: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK-KIND16: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK-KIND16: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f128
+! CHECK-KIND16: %[[CV1:.*]] = fir.convert %[[DIV]] : (f128) -> si128
+! CHECK-KIND16: %[[CV2:.*]] = fir.convert %[[CV1]] : (si128) -> f128
+! CHECK-KIND16: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f128
+! CHECK-KIND16: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f128
+! CHECK-KIND16: fir.store %[[SUB]] to %[[R]] : !fir.ref<f128>
+ r = mod(a, p)
+end subroutine mod_real16
|
|
@llvm/pr-subscribers-flang-driver Author: Michael Klemm (mjklemm) ChangesThis patch adds direct code-gen support for a faster MOD intrinsic for REAL types. Flang has maintained and keeps maintaining a high-precision implementation of the MOD intrinsic as part of the Fortran runtime. With the -ffast-real-mod flag, users can opt to avoid calling into the Fortran runtime, but instead trigger code-gen that produces faster code by avoiding the runtime call, at the expense of potentially risking bit cancelation by having the compiler use the MOD formula a specified by ISO Fortran. Full diff: https://github.com/llvm/llvm-project/pull/160660.diff 7 Files Affected:
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index a7c514e809aa9..4dc4acd5603cb 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2750,6 +2750,7 @@ def fno_unsafe_math_optimizations : Flag<["-"], "fno-unsafe-math-optimizations">
Group<f_Group>;
def fassociative_math : Flag<["-"], "fassociative-math">, Visibility<[ClangOption, FlangOption]>, Group<f_Group>;
def fno_associative_math : Flag<["-"], "fno-associative-math">, Visibility<[ClangOption, FlangOption]>, Group<f_Group>;
+def ffast_real_mod : Flag<["-"], "ffast-real-mod">, Visibility<[FlangOption, FC1Option]>, Group<f_Group>;
defm reciprocal_math : BoolFOption<"reciprocal-math",
LangOpts<"AllowRecip">, DefaultFalse,
PosFlag<SetTrue, [], [ClangOption, CC1Option, FC1Option, FlangOption],
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 1535f4cebf436..fbaa083d204b8 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -766,6 +766,9 @@ static void addFloatingPointOptions(const Driver &D, const ArgList &Args,
if (ReciprocalMath)
CmdArgs.push_back("-freciprocal-math");
+
+ if (Args.hasArg(options::OPT_ffast_real_mod))
+ CmdArgs.push_back("-ffast-real-mod");
}
static void renderRemarksOptions(const ArgList &Args, ArgStringList &CmdArgs,
diff --git a/flang/include/flang/Support/LangOptions.def b/flang/include/flang/Support/LangOptions.def
index ba72d7b4b7212..e310ecf37a52d 100644
--- a/flang/include/flang/Support/LangOptions.def
+++ b/flang/include/flang/Support/LangOptions.def
@@ -60,7 +60,8 @@ LANGOPT(OpenMPNoThreadState, 1, 0)
LANGOPT(OpenMPNoNestedParallelism, 1, 0)
/// Use SIMD only OpenMP support.
LANGOPT(OpenMPSimd, 1, false)
-
+/// Enable fast MOD operations for REAL
+LANGOPT(FastRealMod, 1, false)
LANGOPT(VScaleMin, 32, 0) ///< Minimum vscale range value
LANGOPT(VScaleMax, 32, 0) ///< Maximum vscale range value
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 6295a58b1bdad..5b3f64971013e 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1424,6 +1424,10 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
opts.setFPContractMode(Fortran::common::LangOptions::FPM_Fast);
}
+ if (args.hasArg(clang::driver::options::OPT_ffast_real_mod)) {
+ opts.FastRealMod = true;
+ }
+
return true;
}
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 3bef6b1c31825..d22124bc0bdeb 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -277,6 +277,14 @@ bool CodeGenAction::beginSourceFileAction() {
ci.getInvocation().getLangOpts().OpenMPVersion);
}
+ if (ci.getInvocation().getLangOpts().FastRealMod) {
+ auto mod = lb.getModule();
+ mod.getOperation()->setAttr(
+ mlir::StringAttr::get(mod.getContext(),
+ llvm::Twine{"fir.fast_real_mod"}),
+ mlir::BoolAttr::get(mod.getContext(), true));
+ }
+
// Create a parse tree and lower it to FIR
parseAndLowerTree(ci, lb);
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index ce1376fd209cc..5e0e4fbf81717 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -7009,8 +7009,30 @@ mlir::Value IntrinsicLibrary::genMergeBits(mlir::Type resultType,
}
// MOD
+static mlir::Value genFastMod(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value a, mlir::Value p) {
+ auto fastmathFlags = mlir::arith::FastMathFlags::contract;
+ auto fastmathAttr =
+ mlir::arith::FastMathFlagsAttr::get(builder.getContext(), fastmathFlags);
+ mlir::Value divResult = mlir::arith::DivFOp::create(builder, loc, a, p, fastmathAttr);
+ mlir::Type intType = builder.getIntegerType(
+ a.getType().getIntOrFloatBitWidth(), /*signed=*/true);
+ mlir::Value intResult = builder.createConvert(loc, intType, divResult);
+ mlir::Value cnvResult = builder.createConvert(loc, a.getType(), intResult);
+ mlir::Value mulResult =
+ mlir::arith::MulFOp::create(builder, loc, cnvResult, p, fastmathAttr);
+ mlir::Value subResult =
+ mlir::arith::SubFOp::create(builder, loc, a, mulResult, fastmathAttr);
+ return subResult;
+}
+
mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
+ auto mod = builder.getModule();
+ bool useFastRealMod = false;
+ if (auto attr = mod->getAttrOfType<mlir::BoolAttr>("fir.fast_real_mod"))
+ useFastRealMod = attr.getValue();
+
assert(args.size() == 2);
if (resultType.isUnsignedInteger()) {
mlir::Type signlessType = mlir::IntegerType::get(
@@ -7022,9 +7044,18 @@ mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
if (mlir::isa<mlir::IntegerType>(resultType))
return mlir::arith::RemSIOp::create(builder, loc, args[0], args[1]);
- // Use runtime.
- return builder.createConvert(
- loc, resultType, fir::runtime::genMod(builder, loc, args[0], args[1]));
+ if (useFastRealMod) {
+ // If fast MOD for REAL has been requested, generate less precise,
+ // but faster code directly.
+ assert(resultType.isFloat() &&
+ "non floating-point type hit for fast real MOD");
+ return builder.createConvert(loc, resultType,
+ genFastMod(builder, loc, args[0], args[1]));
+ } else {
+ // Use runtime.
+ return builder.createConvert(
+ loc, resultType, fir::runtime::genMod(builder, loc, args[0], args[1]));
+ }
}
// MODULO
diff --git a/flang/test/Lower/Intrinsics/fast-real-mod.f90 b/flang/test/Lower/Intrinsics/fast-real-mod.f90
new file mode 100644
index 0000000000000..00607fa5c30d1
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/fast-real-mod.f90
@@ -0,0 +1,75 @@
+! RUN: %flang_fc1 -ffast-real-mod -emit-mlir -o - %s | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
+
+! CHECK: module attributes {{{.*}}fir.fast_real_mod = true{{.*}}}
+
+! CHECK-LABEL: @_QPmod_real4
+subroutine mod_real4(r, a, p)
+ implicit none
+ real(kind=4) :: r, a, p
+! CHECK: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f32
+! CHECK: %[[CV1:.*]] = fir.convert %[[DIV]] : (f32) -> si32
+! CHECK: %[[CV2:.*]] = fir.convert %[[CV1]] : (si32) -> f32
+! CHECK: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f32
+! CHECK: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f32
+! CHECK: fir.store %[[SUB]] to %[[R]] : !fir.ref<f32>
+ r = mod(a, p)
+end subroutine mod_real4
+
+! CHECK-LABEL: @_QPmod_real8
+subroutine mod_real8(r, a, p)
+ implicit none
+ real(kind=8) :: r, a, p
+! CHECK: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f64
+! CHECK: %[[CV1:.*]] = fir.convert %[[DIV]] : (f64) -> si64
+! CHECK: %[[CV2:.*]] = fir.convert %[[CV1]] : (si64) -> f64
+! CHECK: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f64
+! CHECK: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f64
+! CHECK: fir.store %[[SUB]] to %[[R]] : !fir.ref<f64>
+ r = mod(a, p)
+end subroutine mod_real8
+
+! CHECK-LABEL: @_QPmod_real10
+subroutine mod_real10(r, a, p)
+ implicit none
+ real(kind=10) :: r, a, p
+! CHECK-KIND10: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK-KIND10: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK-KIND10: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK-KIND10: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK-KIND10: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK-KIND10: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f80
+! CHECK-KIND10: %[[CV1:.*]] = fir.convert %[[DIV]] : (f80) -> si80
+! CHECK-KIND10: %[[CV2:.*]] = fir.convert %[[CV1]] : (si80) -> f80
+! CHECK-KIND10: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f80
+! CHECK-KIND10: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f80
+! CHECK-KIND10: fir.store %[[SUB]] to %[[R]] : !fir.ref<f80>
+ r = mod(a, p)
+end subroutine mod_real10
+
+! CHECK-LABEL: @_QPmod_real16
+subroutine mod_real16(r, a, p)
+ implicit none
+ real(kind=16) :: r, a, p
+! CHECK-KIND16: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK-KIND16: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK-KIND16: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK-KIND16: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK-KIND16: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK-KIND16: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f128
+! CHECK-KIND16: %[[CV1:.*]] = fir.convert %[[DIV]] : (f128) -> si128
+! CHECK-KIND16: %[[CV2:.*]] = fir.convert %[[CV1]] : (si128) -> f128
+! CHECK-KIND16: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f128
+! CHECK-KIND16: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f128
+! CHECK-KIND16: fir.store %[[SUB]] to %[[R]] : !fir.ref<f128>
+ r = mod(a, p)
+end subroutine mod_real16
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM overall just some minor comments. Also see the clang-format and CI failures.
| @@ -0,0 +1,75 @@ | |||
| ! RUN: %flang_fc1 -ffast-real-mod -emit-mlir -o - %s | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%} | |||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think using kinds which are not enabled for a particular target is a compile error. Please could you put the checks for kind 10 and kind 16 in separate files and skip the whole thing if the types are not supported.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was trying to follow what mod.f90 did. Alas, I failed to do everything that test did. Now the test in this PR does it the same way. Hope that's OK and an acceptable way how I have addressed your comment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That test is using lines like
integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16)
real(kind16) :: r, a, p
This make sure that on systems that don't support real(16) it will instead use real(4).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just some minor comments.
It would be good to have a test that explicitly checks that the driver passes -ffast-real-mod to fc1.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you for the changes! I have a couple of comments inlined.
| // Use runtime. | ||
| return builder.createConvert( | ||
| loc, resultType, fir::runtime::genMod(builder, loc, args[0], args[1])); | ||
| if (useFastRealMod && resultType.isFloat()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if we can enable the fast MOD under afn FastMathFlag. Is there a reason to control it via a separate option?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was doing that to avoid issues with precision loss that could have come in with the other math functions. After I spoke to the engineers, who are working on the applications, it seems they'd be fine with moving this under the afn flag and do the optimization as part of -ffast-math which is enabled for this application.
However, I'm tempted to keep this as a separate flag and simply enable this MOD optimization it when -ffast-math is present. That makes it more controllable w.r.t. to the other math optimizations. What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It sounds okay to me. I would recommend enabling this optimization whenever afn is set (i.e. under -ffast-math or under -fapprox-func), and allow to override this with -fno-fast-real-mod.
| Group<f_Group>; | ||
| def fassociative_math : Flag<["-"], "fassociative-math">, Visibility<[ClangOption, FlangOption]>, Group<f_Group>; | ||
| def fno_associative_math : Flag<["-"], "fno-associative-math">, Visibility<[ClangOption, FlangOption]>, Group<f_Group>; | ||
| def ffast_real_mod : Flag<["-"], "ffast-real-mod">, Visibility<[FlangOption, FC1Option]>, Group<f_Group>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you please also add -fno-fast-real-mod?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have added it my local copy for now and push it with my next update.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the updates. LGTM once Slava and Tarun are happy
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you!
…vm#160660) This patch adds direct code-gen support for a faster MOD intrinsic for REAL types. Flang has maintained and keeps maintaining a high-precision implementation of the MOD intrinsic as part of the Fortran runtime. With the -ffast-real-mod flag, users can opt to avoid calling into the Fortran runtime, but instead trigger code-gen that produces faster code by avoiding the runtime call, at the expense of potentially risking bit cancelation by having the compiler use the MOD formula a specified by ISO Fortran.
This patch adds direct code-gen support for a faster MOD intrinsic for REAL types. Flang has maintained and keeps maintaining a high-precision implementation of the MOD intrinsic as part of the Fortran runtime. With the -ffast-real-mod flag, users can opt to avoid calling into the Fortran runtime, but instead trigger code-gen that produces faster code by avoiding the runtime call, at the expense of potentially risking bit cancelation by having the compiler use the MOD formula a specified by ISO Fortran.