From 8f9868d88549814a7f81bd75732cc3bd6e875bf9 Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Mon, 15 Sep 2025 16:02:18 +0200
Subject: [PATCH 01/15] Add first rough implementation of -ffast-real-mod

---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 32 +++++++++++++++++--
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index ce1376fd209cc..5f78fe6592dfc 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -7009,8 +7009,24 @@ mlir::Value IntrinsicLibrary::genMergeBits(mlir::Type resultType,
 }
 
 // MOD
+static mlir::Value genFastMod(fir::FirOpBuilder &builder, mlir::Location loc,
+                              mlir::Value a, mlir::Value p) {
+  mlir::Value divResult = mlir::arith::DivFOp::create(builder, loc, a, p);
+  fprintf(stderr, "--> int type width: %d\n", a.getType().getIntOrFloatBitWidth());
+  mlir::Type intType = builder.getIntegerType(
+      a.getType().getIntOrFloatBitWidth(), /*signed=*/true);
+  mlir::Value intResult = builder.createConvert(loc, intType, divResult);
+  mlir::Value cnvResult = builder.createConvert(loc, a.getType(), intResult);
+  mlir::Value mulResult =
+      mlir::arith::MulFOp::create(builder, loc, cnvResult, p);
+  mlir::Value subResult =
+      mlir::arith::SubFOp::create(builder, loc, a, mulResult);
+  return subResult;
+}
+
 mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
                                      llvm::ArrayRef<mlir::Value> args) {
+  bool useFastRealMod = true;
   assert(args.size() == 2);
   if (resultType.isUnsignedInteger()) {
     mlir::Type signlessType = mlir::IntegerType::get(
@@ -7022,9 +7038,19 @@ mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
   if (mlir::isa<mlir::IntegerType>(resultType))
     return mlir::arith::RemSIOp::create(builder, loc, args[0], args[1]);
 
-  // Use runtime.
-  return builder.createConvert(
-      loc, resultType, fir::runtime::genMod(builder, loc, args[0], args[1]));
+  if (useFastRealMod) {
+    // If fast MOD for REAL has been requested, generate less precise,
+    // but faster code directly.
+    assert(resultType.isFloat() &&
+           "non floating-point type hit for fast real MOD");
+    fprintf(stderr, "--> emitting fast mod operation for MOD\n");
+    return builder.createConvert(loc, resultType,
+                                 genFastMod(builder, loc, args[0], args[1]));
+  } else {
+    // Use runtime.
+    return builder.createConvert(
+        loc, resultType, fir::runtime::genMod(builder, loc, args[0], args[1]));
+  }
 }
 
 // MODULO

From ecae88cd454a9bd82b5bdd990dca570ed24fa6f8 Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Mon, 15 Sep 2025 20:04:41 +0200
Subject: [PATCH 02/15] Add command line flag

---
 clang/include/clang/Driver/Options.td         | 2 +-
 clang/lib/Driver/ToolChains/Flang.cpp         | 5 +++++
 flang/include/flang/Support/LangOptions.def   | 3 ++-
 flang/lib/Frontend/CompilerInvocation.cpp     | 6 ++++++
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 7 +++++++
 5 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index a7c514e809aa9..3293a91d107b2 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2750,6 +2750,7 @@ def fno_unsafe_math_optimizations : Flag<["-"], "fno-unsafe-math-optimizations">
   Group<f_Group>;
 def fassociative_math : Flag<["-"], "fassociative-math">, Visibility<[ClangOption, FlangOption]>, Group<f_Group>;
 def fno_associative_math : Flag<["-"], "fno-associative-math">, Visibility<[ClangOption, FlangOption]>, Group<f_Group>;
+def ffast_real_mod : Flag<["-"], "ffast-real-mod">, Visibility<[FlangOption, FC1Option]>, Group<f_Group>;
 defm reciprocal_math : BoolFOption<"reciprocal-math",
   LangOpts<"AllowRecip">, DefaultFalse,
   PosFlag<SetTrue, [], [ClangOption, CC1Option, FC1Option, FlangOption],
@@ -7373,7 +7374,6 @@ def emit_mlir : Flag<["-"], "emit-mlir">, Alias<emit_fir>;
 
 def emit_hlfir : Flag<["-"], "emit-hlfir">, Group<Action_Group>,
   HelpText<"Build the parse tree, then lower it to HLFIR">;
-
 } // let Visibility = [FC1Option]
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 1535f4cebf436..1969bfd08e27c 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -766,6 +766,11 @@ static void addFloatingPointOptions(const Driver &D, const ArgList &Args,
 
   if (ReciprocalMath)
     CmdArgs.push_back("-freciprocal-math");
+
+  if (Args.hasArg(options::OPT_ffast_real_mod)) {
+    fprintf(stderr, "##> -ffast-real-mod: %d\n", options::OPT_ffast_real_mod);
+    CmdArgs.push_back("-ffast-real-mod");
+  }
 }
 
 static void renderRemarksOptions(const ArgList &Args, ArgStringList &CmdArgs,
diff --git a/flang/include/flang/Support/LangOptions.def b/flang/include/flang/Support/LangOptions.def
index ba72d7b4b7212..e310ecf37a52d 100644
--- a/flang/include/flang/Support/LangOptions.def
+++ b/flang/include/flang/Support/LangOptions.def
@@ -60,7 +60,8 @@ LANGOPT(OpenMPNoThreadState, 1, 0)
 LANGOPT(OpenMPNoNestedParallelism, 1, 0)
 /// Use SIMD only OpenMP support.
 LANGOPT(OpenMPSimd, 1, false)
-
+/// Enable fast MOD operations for REAL
+LANGOPT(FastRealMod, 1, false)
 LANGOPT(VScaleMin, 32, 0)  ///< Minimum vscale range value
 LANGOPT(VScaleMax, 32, 0)  ///< Maximum vscale range value
 
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 6295a58b1bdad..e93af6d7b9c38 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1424,6 +1424,12 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
     opts.setFPContractMode(Fortran::common::LangOptions::FPM_Fast);
   }
 
+  if (args.hasArg(clang::driver::options::OPT_ffast_real_mod)) {
+    fprintf(stderr, "$$> FC1: -ffast-real-mod: %d\n", (int) opts.FastRealMod);
+    opts.FastRealMod = true;
+    fprintf(stderr, "$$> FC1: -ffast-real-mod: %d\n", (int) opts.FastRealMod);
+  }
+
   return true;
 }
 
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 5f78fe6592dfc..41d0ef830b2b1 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -46,6 +46,7 @@
 #include "flang/Optimizer/Support/Utils.h"
 #include "flang/Runtime/entry-names.h"
 #include "flang/Runtime/iostat-consts.h"
+#include "flang/Support/LangOptions.h"
 #include "mlir/Dialect/Complex/IR/Complex.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
@@ -7027,6 +7028,11 @@ static mlir::Value genFastMod(fir::FirOpBuilder &builder, mlir::Location loc,
 mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
                                      llvm::ArrayRef<mlir::Value> args) {
   bool useFastRealMod = true;
+  auto mod = builder.getModule();
+  if (auto attr = mod->getAttrOfType<mlir::omp::VersionAttr>("omp.version"))
+      fprintf(stderr, "omp version: %d\n", attr.getVersion());
+
+  fprintf(stderr, "--> -ffast-real-mod: %d\n", (int) useFastRealMod);
   assert(args.size() == 2);
   if (resultType.isUnsignedInteger()) {
     mlir::Type signlessType = mlir::IntegerType::get(
@@ -7048,6 +7054,7 @@ mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
                                  genFastMod(builder, loc, args[0], args[1]));
   } else {
     // Use runtime.
+    fprintf(stderr, "--> emitting slow path MOD\n");
     return builder.createConvert(
         loc, resultType, fir::runtime::genMod(builder, loc, args[0], args[1]));
   }

From 85b14eb1d8889f22487b482af4741bf27d6a9d2a Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Mon, 22 Sep 2025 19:34:48 +0200
Subject: [PATCH 03/15] Pass -ffast-real-mod via MLIR module attribute to
 code-gen

---
 flang/lib/Frontend/FrontendActions.cpp        |  7 +++++++
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 12 ++++++++----
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 3bef6b1c31825..614d2edf606f1 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -277,6 +277,13 @@ bool CodeGenAction::beginSourceFileAction() {
                               ci.getInvocation().getLangOpts().OpenMPVersion);
   }
 
+  if (ci.getInvocation().getLangOpts().FastRealMod) {
+    fprintf(stderr, "YAY!!!!\n");
+    auto mod = lb.getModule();
+    mod.getOperation()->setAttr(mlir::StringAttr::get(mod.getContext(), llvm::Twine{"fir.fast_real_mod"}),
+      mlir::BoolAttr::get(mod.getContext(), true));
+  }
+
   // Create a parse tree and lower it to FIR
   parseAndLowerTree(ci, lb);
 
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 41d0ef830b2b1..64dcebcf021e3 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -7027,12 +7027,16 @@ static mlir::Value genFastMod(fir::FirOpBuilder &builder, mlir::Location loc,
 
 mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
                                      llvm::ArrayRef<mlir::Value> args) {
-  bool useFastRealMod = true;
   auto mod = builder.getModule();
-  if (auto attr = mod->getAttrOfType<mlir::omp::VersionAttr>("omp.version"))
-      fprintf(stderr, "omp version: %d\n", attr.getVersion());
-
+  bool useFastRealMod = false;
+  if (auto attr = mod->getAttrOfType<mlir::BoolAttr>("fir.fast_real_mod")) {
+    fprintf(stderr, "fir.fast_real_mod present: %d\n", (int) attr.getValue());
+    useFastRealMod = attr.getValue();
+  } else {
+    fprintf(stderr, "fir.fast_real_mod not present\n");
+  }
   fprintf(stderr, "--> -ffast-real-mod: %d\n", (int) useFastRealMod);
+
   assert(args.size() == 2);
   if (resultType.isUnsignedInteger()) {
     mlir::Type signlessType = mlir::IntegerType::get(

From d32863a8e54308e07eb023a046f2d75eea61b77a Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Mon, 22 Sep 2025 20:06:46 +0200
Subject: [PATCH 04/15] Clean up code

---
 clang/lib/Driver/ToolChains/Flang.cpp         |  4 +---
 flang/lib/Frontend/CompilerInvocation.cpp     |  2 --
 flang/lib/Frontend/FrontendActions.cpp        |  7 ++++---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 10 +---------
 4 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 1969bfd08e27c..fbaa083d204b8 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -767,10 +767,8 @@ static void addFloatingPointOptions(const Driver &D, const ArgList &Args,
   if (ReciprocalMath)
     CmdArgs.push_back("-freciprocal-math");
 
-  if (Args.hasArg(options::OPT_ffast_real_mod)) {
-    fprintf(stderr, "##> -ffast-real-mod: %d\n", options::OPT_ffast_real_mod);
+  if (Args.hasArg(options::OPT_ffast_real_mod))
     CmdArgs.push_back("-ffast-real-mod");
-  }
 }
 
 static void renderRemarksOptions(const ArgList &Args, ArgStringList &CmdArgs,
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index e93af6d7b9c38..5b3f64971013e 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1425,9 +1425,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
   }
 
   if (args.hasArg(clang::driver::options::OPT_ffast_real_mod)) {
-    fprintf(stderr, "$$> FC1: -ffast-real-mod: %d\n", (int) opts.FastRealMod);
     opts.FastRealMod = true;
-    fprintf(stderr, "$$> FC1: -ffast-real-mod: %d\n", (int) opts.FastRealMod);
   }
 
   return true;
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 614d2edf606f1..d22124bc0bdeb 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -278,10 +278,11 @@ bool CodeGenAction::beginSourceFileAction() {
   }
 
   if (ci.getInvocation().getLangOpts().FastRealMod) {
-    fprintf(stderr, "YAY!!!!\n");
     auto mod = lb.getModule();
-    mod.getOperation()->setAttr(mlir::StringAttr::get(mod.getContext(), llvm::Twine{"fir.fast_real_mod"}),
-      mlir::BoolAttr::get(mod.getContext(), true));
+    mod.getOperation()->setAttr(
+        mlir::StringAttr::get(mod.getContext(),
+                              llvm::Twine{"fir.fast_real_mod"}),
+        mlir::BoolAttr::get(mod.getContext(), true));
   }
 
   // Create a parse tree and lower it to FIR
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 64dcebcf021e3..fbb03bf9f0291 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -7013,7 +7013,6 @@ mlir::Value IntrinsicLibrary::genMergeBits(mlir::Type resultType,
 static mlir::Value genFastMod(fir::FirOpBuilder &builder, mlir::Location loc,
                               mlir::Value a, mlir::Value p) {
   mlir::Value divResult = mlir::arith::DivFOp::create(builder, loc, a, p);
-  fprintf(stderr, "--> int type width: %d\n", a.getType().getIntOrFloatBitWidth());
   mlir::Type intType = builder.getIntegerType(
       a.getType().getIntOrFloatBitWidth(), /*signed=*/true);
   mlir::Value intResult = builder.createConvert(loc, intType, divResult);
@@ -7029,13 +7028,8 @@ mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
                                      llvm::ArrayRef<mlir::Value> args) {
   auto mod = builder.getModule();
   bool useFastRealMod = false;
-  if (auto attr = mod->getAttrOfType<mlir::BoolAttr>("fir.fast_real_mod")) {
-    fprintf(stderr, "fir.fast_real_mod present: %d\n", (int) attr.getValue());
+  if (auto attr = mod->getAttrOfType<mlir::BoolAttr>("fir.fast_real_mod"))
     useFastRealMod = attr.getValue();
-  } else {
-    fprintf(stderr, "fir.fast_real_mod not present\n");
-  }
-  fprintf(stderr, "--> -ffast-real-mod: %d\n", (int) useFastRealMod);
 
   assert(args.size() == 2);
   if (resultType.isUnsignedInteger()) {
@@ -7053,12 +7047,10 @@ mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
     // but faster code directly.
     assert(resultType.isFloat() &&
            "non floating-point type hit for fast real MOD");
-    fprintf(stderr, "--> emitting fast mod operation for MOD\n");
     return builder.createConvert(loc, resultType,
                                  genFastMod(builder, loc, args[0], args[1]));
   } else {
     // Use runtime.
-    fprintf(stderr, "--> emitting slow path MOD\n");
     return builder.createConvert(
         loc, resultType, fir::runtime::genMod(builder, loc, args[0], args[1]));
   }

From d06a1adbde034e502d08ad16c3e858de396cfbea Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Mon, 22 Sep 2025 20:48:21 +0200
Subject: [PATCH 05/15] Add test

---
 flang/test/Lower/Intrinsics/fast-real-mod.f90 | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 flang/test/Lower/Intrinsics/fast-real-mod.f90

diff --git a/flang/test/Lower/Intrinsics/fast-real-mod.f90 b/flang/test/Lower/Intrinsics/fast-real-mod.f90
new file mode 100644
index 0000000000000..26422e305cbe8
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/fast-real-mod.f90
@@ -0,0 +1,57 @@
+! RUN: %flang_fc1 -ffast-real-mod -emit-mlir -o - %s | FileCheck %s
+
+! CHECK: module attributes {{{.*}}fir.fast_real_mod = true{{.*}}}
+
+! CHECK-LABEL: @_QPmod_real4
+subroutine mod_real4(r, a, p)
+    implicit none
+    real(kind=4) :: r, a, p
+! CHECK: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f32
+! CHECK: %[[CV1:.*]] = fir.convert %[[DIV]] : (f32) -> si32
+! CHECK: %[[CV2:.*]] = fir.convert %[[CV1]] : (si32) -> f32
+! CHECK: %[[MUL:.*]] = arith.mulf %8, %5 fastmath<contract> : f32
+! CHECK: %[[SUB:.*]] = arith.subf %4, %9 fastmath<contract> : f32
+! CHECK: fir.store %[[SUB]] to %[[R]] : !fir.ref<f32>
+    r = mod(a, p)
+end subroutine mod_real4
+
+! CHECK-LABEL: @_QPmod_real8
+subroutine mod_real8(r, a, p)
+    implicit none
+    real(kind=8) :: r, a, p
+! CHECK: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f64
+! CHECK: %[[CV1:.*]] = fir.convert %[[DIV]] : (f64) -> si64
+! CHECK: %[[CV2:.*]] = fir.convert %[[CV1]] : (si64) -> f64
+! CHECK: %[[MUL:.*]] = arith.mulf %8, %5 fastmath<contract> : f64
+! CHECK: %[[SUB:.*]] = arith.subf %4, %9 fastmath<contract> : f64
+! CHECK: fir.store %[[SUB]] to %[[R]] : !fir.ref<f64>
+    r = mod(a, p)
+end subroutine mod_real8
+
+! CHECK-LABEL: @_QPmod_real10
+subroutine mod_real10(r, a, p)
+    implicit none
+    real(kind=10) :: r, a, p
+! CHECK: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f80
+! CHECK: %[[CV1:.*]] = fir.convert %[[DIV]] : (f80) -> si80
+! CHECK: %[[CV2:.*]] = fir.convert %[[CV1]] : (si80) -> f80
+! CHECK: %[[MUL:.*]] = arith.mulf %8, %5 fastmath<contract> : f80
+! CHECK: %[[SUB:.*]] = arith.subf %4, %9 fastmath<contract> : f80
+! CHECK: fir.store %[[SUB]] to %[[R]] : !fir.ref<f80>
+    r = mod(a, p)
+end subroutine mod_real10

From 313abd0266b4a643ce71905266ad0ad16a99a73b Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Mon, 22 Sep 2025 21:22:10 +0200
Subject: [PATCH 06/15] Improve test and add kind=16 test

---
 flang/test/Lower/Intrinsics/fast-real-mod.f90 | 42 +++++++++++++------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/flang/test/Lower/Intrinsics/fast-real-mod.f90 b/flang/test/Lower/Intrinsics/fast-real-mod.f90
index 26422e305cbe8..3bdd5930a1706 100644
--- a/flang/test/Lower/Intrinsics/fast-real-mod.f90
+++ b/flang/test/Lower/Intrinsics/fast-real-mod.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -ffast-real-mod -emit-mlir -o - %s | FileCheck %s
+! RUN: %flang_fc1 -ffast-real-mod -emit-mlir -o - %s | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
 
 ! CHECK: module attributes {{{.*}}fir.fast_real_mod = true{{.*}}}
 
@@ -42,16 +42,34 @@ end subroutine mod_real8
 subroutine mod_real10(r, a, p)
     implicit none
     real(kind=10) :: r, a, p
-! CHECK: %[[A:.*]] = fir.declare{{.*}}a"
-! CHECK: %[[P:.*]] = fir.declare{{.*}}p"
-! CHECK: %[[R:.*]] = fir.declare{{.*}}r"
-! CHECK: %[[A_LOAD:.*]] = fir.load %[[A]]
-! CHECK: %[[P_LOAD:.*]] = fir.load %[[P]]
-! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f80
-! CHECK: %[[CV1:.*]] = fir.convert %[[DIV]] : (f80) -> si80
-! CHECK: %[[CV2:.*]] = fir.convert %[[CV1]] : (si80) -> f80
-! CHECK: %[[MUL:.*]] = arith.mulf %8, %5 fastmath<contract> : f80
-! CHECK: %[[SUB:.*]] = arith.subf %4, %9 fastmath<contract> : f80
-! CHECK: fir.store %[[SUB]] to %[[R]] : !fir.ref<f80>
+! CHECK-KIND10: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK-KIND10: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK-KIND10: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK-KIND10: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK-KIND10: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK-KIND10: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f80
+! CHECK-KIND10: %[[CV1:.*]] = fir.convert %[[DIV]] : (f80) -> si80
+! CHECK-KIND10: %[[CV2:.*]] = fir.convert %[[CV1]] : (si80) -> f80
+! CHECK-KIND10: %[[MUL:.*]] = arith.mulf %8, %5 fastmath<contract> : f80
+! CHECK-KIND10: %[[SUB:.*]] = arith.subf %4, %9 fastmath<contract> : f80
+! CHECK-KIND10: fir.store %[[SUB]] to %[[R]] : !fir.ref<f80>
     r = mod(a, p)
 end subroutine mod_real10
+
+! CHECK-LABEL: @_QPmod_real16
+subroutine mod_real16(r, a, p)
+    implicit none
+    real(kind=16) :: r, a, p
+! CHECK-KIND16: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK-KIND16: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK-KIND16: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK-KIND16: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK-KIND16: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK-KIND16: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f128
+! CHECK-KIND16: %[[CV1:.*]] = fir.convert %[[DIV]] : (f128) -> si128
+! CHECK-KIND16: %[[CV2:.*]] = fir.convert %[[CV1]] : (si128) -> f128
+! CHECK-KIND16: %[[MUL:.*]] = arith.mulf %8, %5 fastmath<contract> : f128
+! CHECK-KIND16: %[[SUB:.*]] = arith.subf %4, %9 fastmath<contract> : f128
+! CHECK-KIND16: fir.store %[[SUB]] to %[[R]] : !fir.ref<f128>
+    r = mod(a, p)
+end subroutine mod_real16

From 2572cc07389281d00ed0129d07e84bc9bf4d79b4 Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Tue, 23 Sep 2025 14:30:18 +0200
Subject: [PATCH 07/15] Don't use hard-coded register numbers

---
 flang/test/Lower/Intrinsics/fast-real-mod.f90 | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/flang/test/Lower/Intrinsics/fast-real-mod.f90 b/flang/test/Lower/Intrinsics/fast-real-mod.f90
index 3bdd5930a1706..00607fa5c30d1 100644
--- a/flang/test/Lower/Intrinsics/fast-real-mod.f90
+++ b/flang/test/Lower/Intrinsics/fast-real-mod.f90
@@ -14,8 +14,8 @@ subroutine mod_real4(r, a, p)
 ! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f32
 ! CHECK: %[[CV1:.*]] = fir.convert %[[DIV]] : (f32) -> si32
 ! CHECK: %[[CV2:.*]] = fir.convert %[[CV1]] : (si32) -> f32
-! CHECK: %[[MUL:.*]] = arith.mulf %8, %5 fastmath<contract> : f32
-! CHECK: %[[SUB:.*]] = arith.subf %4, %9 fastmath<contract> : f32
+! CHECK: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f32
+! CHECK: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f32
 ! CHECK: fir.store %[[SUB]] to %[[R]] : !fir.ref<f32>
     r = mod(a, p)
 end subroutine mod_real4
@@ -32,8 +32,8 @@ subroutine mod_real8(r, a, p)
 ! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f64
 ! CHECK: %[[CV1:.*]] = fir.convert %[[DIV]] : (f64) -> si64
 ! CHECK: %[[CV2:.*]] = fir.convert %[[CV1]] : (si64) -> f64
-! CHECK: %[[MUL:.*]] = arith.mulf %8, %5 fastmath<contract> : f64
-! CHECK: %[[SUB:.*]] = arith.subf %4, %9 fastmath<contract> : f64
+! CHECK: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f64
+! CHECK: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f64
 ! CHECK: fir.store %[[SUB]] to %[[R]] : !fir.ref<f64>
     r = mod(a, p)
 end subroutine mod_real8
@@ -50,8 +50,8 @@ subroutine mod_real10(r, a, p)
 ! CHECK-KIND10: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f80
 ! CHECK-KIND10: %[[CV1:.*]] = fir.convert %[[DIV]] : (f80) -> si80
 ! CHECK-KIND10: %[[CV2:.*]] = fir.convert %[[CV1]] : (si80) -> f80
-! CHECK-KIND10: %[[MUL:.*]] = arith.mulf %8, %5 fastmath<contract> : f80
-! CHECK-KIND10: %[[SUB:.*]] = arith.subf %4, %9 fastmath<contract> : f80
+! CHECK-KIND10: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f80
+! CHECK-KIND10: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f80
 ! CHECK-KIND10: fir.store %[[SUB]] to %[[R]] : !fir.ref<f80>
     r = mod(a, p)
 end subroutine mod_real10
@@ -68,8 +68,8 @@ subroutine mod_real16(r, a, p)
 ! CHECK-KIND16: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f128
 ! CHECK-KIND16: %[[CV1:.*]] = fir.convert %[[DIV]] : (f128) -> si128
 ! CHECK-KIND16: %[[CV2:.*]] = fir.convert %[[CV1]] : (si128) -> f128
-! CHECK-KIND16: %[[MUL:.*]] = arith.mulf %8, %5 fastmath<contract> : f128
-! CHECK-KIND16: %[[SUB:.*]] = arith.subf %4, %9 fastmath<contract> : f128
+! CHECK-KIND16: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f128
+! CHECK-KIND16: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f128
 ! CHECK-KIND16: fir.store %[[SUB]] to %[[R]] : !fir.ref<f128>
     r = mod(a, p)
 end subroutine mod_real16

From 7cc56df58bd4283639a9c62159cb57ecab8dd113 Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Tue, 23 Sep 2025 14:47:15 +0200
Subject: [PATCH 08/15] Honor -ffast-math when present

---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index fbb03bf9f0291..1274164f25813 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -7012,15 +7012,18 @@ mlir::Value IntrinsicLibrary::genMergeBits(mlir::Type resultType,
 // MOD
 static mlir::Value genFastMod(fir::FirOpBuilder &builder, mlir::Location loc,
                               mlir::Value a, mlir::Value p) {
-  mlir::Value divResult = mlir::arith::DivFOp::create(builder, loc, a, p);
+  auto fastmathFlags = mlir::arith::FastMathFlags::contract;
+  auto fastmathAttr =
+      mlir::arith::FastMathFlagsAttr::get(builder.getContext(), fastmathFlags);
+  mlir::Value divResult = mlir::arith::DivFOp::create(builder, loc, a, p, fastmathAttr);
   mlir::Type intType = builder.getIntegerType(
       a.getType().getIntOrFloatBitWidth(), /*signed=*/true);
   mlir::Value intResult = builder.createConvert(loc, intType, divResult);
   mlir::Value cnvResult = builder.createConvert(loc, a.getType(), intResult);
   mlir::Value mulResult =
-      mlir::arith::MulFOp::create(builder, loc, cnvResult, p);
+      mlir::arith::MulFOp::create(builder, loc, cnvResult, p, fastmathAttr);
   mlir::Value subResult =
-      mlir::arith::SubFOp::create(builder, loc, a, mulResult);
+      mlir::arith::SubFOp::create(builder, loc, a, mulResult, fastmathAttr);
   return subResult;
 }
 

From ed6885752083177a33929b9ebbe27546819ecba9 Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Thu, 25 Sep 2025 08:45:51 +0200
Subject: [PATCH 09/15] Remove unwanted changes

---
 clang/include/clang/Driver/Options.td         | 1 +
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 3293a91d107b2..4dc4acd5603cb 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -7374,6 +7374,7 @@ def emit_mlir : Flag<["-"], "emit-mlir">, Alias<emit_fir>;
 
 def emit_hlfir : Flag<["-"], "emit-hlfir">, Group<Action_Group>,
   HelpText<"Build the parse tree, then lower it to HLFIR">;
+
 } // let Visibility = [FC1Option]
 
 //===----------------------------------------------------------------------===//
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 1274164f25813..5e0e4fbf81717 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -46,7 +46,6 @@
 #include "flang/Optimizer/Support/Utils.h"
 #include "flang/Runtime/entry-names.h"
 #include "flang/Runtime/iostat-consts.h"
-#include "flang/Support/LangOptions.h"
 #include "mlir/Dialect/Complex/IR/Complex.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"

From afc2063c71f9596dd03e6a1b95a1ced3d5d01561 Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Thu, 25 Sep 2025 13:49:37 +0200
Subject: [PATCH 10/15] Format code

---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 5e0e4fbf81717..dfcf034ae21d6 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -7014,7 +7014,8 @@ static mlir::Value genFastMod(fir::FirOpBuilder &builder, mlir::Location loc,
   auto fastmathFlags = mlir::arith::FastMathFlags::contract;
   auto fastmathAttr =
       mlir::arith::FastMathFlagsAttr::get(builder.getContext(), fastmathFlags);
-  mlir::Value divResult = mlir::arith::DivFOp::create(builder, loc, a, p, fastmathAttr);
+  mlir::Value divResult =
+      mlir::arith::DivFOp::create(builder, loc, a, p, fastmathAttr);
   mlir::Type intType = builder.getIntegerType(
       a.getType().getIntOrFloatBitWidth(), /*signed=*/true);
   mlir::Value intResult = builder.createConvert(loc, intType, divResult);

From 3b392c858592b8cf9f333c3c4cfb361112b528e3 Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Thu, 25 Sep 2025 15:49:08 +0200
Subject: [PATCH 11/15] Follow suit of the test in
 flang/test/Lower/Intrinsics/mod.f90

---
 flang/test/Lower/Intrinsics/fast-real-mod.f90 | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/flang/test/Lower/Intrinsics/fast-real-mod.f90 b/flang/test/Lower/Intrinsics/fast-real-mod.f90
index 00607fa5c30d1..62b2c4d58af02 100644
--- a/flang/test/Lower/Intrinsics/fast-real-mod.f90
+++ b/flang/test/Lower/Intrinsics/fast-real-mod.f90
@@ -41,7 +41,8 @@ end subroutine mod_real8
 ! CHECK-LABEL: @_QPmod_real10
 subroutine mod_real10(r, a, p)
     implicit none
-    real(kind=10) :: r, a, p
+    integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10)
+    real(kind=kind10) :: r, a, p
 ! CHECK-KIND10: %[[A:.*]] = fir.declare{{.*}}a"
 ! CHECK-KIND10: %[[P:.*]] = fir.declare{{.*}}p"
 ! CHECK-KIND10: %[[R:.*]] = fir.declare{{.*}}r"
@@ -59,7 +60,8 @@ end subroutine mod_real10
 ! CHECK-LABEL: @_QPmod_real16
 subroutine mod_real16(r, a, p)
     implicit none
-    real(kind=16) :: r, a, p
+    integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16)
+    real(kind=kind16) :: r, a, p
 ! CHECK-KIND16: %[[A:.*]] = fir.declare{{.*}}a"
 ! CHECK-KIND16: %[[P:.*]] = fir.declare{{.*}}p"
 ! CHECK-KIND16: %[[R:.*]] = fir.declare{{.*}}r"

From 6d5836c0d0175d349d3a8e2926349c9151e9830a Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Thu, 25 Sep 2025 16:54:30 +0200
Subject: [PATCH 12/15] Address reviewer comments

---
 flang/lib/Frontend/CompilerInvocation.cpp     | 3 +--
 flang/lib/Frontend/FrontendActions.cpp        | 2 +-
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 4 +---
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 5b3f64971013e..d876d6a741303 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1424,9 +1424,8 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
     opts.setFPContractMode(Fortran::common::LangOptions::FPM_Fast);
   }
 
-  if (args.hasArg(clang::driver::options::OPT_ffast_real_mod)) {
+  if (args.hasArg(clang::driver::options::OPT_ffast_real_mod))
     opts.FastRealMod = true;
-  }
 
   return true;
 }
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index d22124bc0bdeb..c5e5bc11547a8 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -278,7 +278,7 @@ bool CodeGenAction::beginSourceFileAction() {
   }
 
   if (ci.getInvocation().getLangOpts().FastRealMod) {
-    auto mod = lb.getModule();
+    mlir::ModuleOp mod = lb.getModule();
     mod.getOperation()->setAttr(
         mlir::StringAttr::get(mod.getContext(),
                               llvm::Twine{"fir.fast_real_mod"}),
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index dfcf034ae21d6..591e194ed2891 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -7045,11 +7045,9 @@ mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
   if (mlir::isa<mlir::IntegerType>(resultType))
     return mlir::arith::RemSIOp::create(builder, loc, args[0], args[1]);
 
-  if (useFastRealMod) {
+  if (useFastRealMod && resultType.isFloat()) {
     // If fast MOD for REAL has been requested, generate less precise,
     // but faster code directly.
-    assert(resultType.isFloat() &&
-           "non floating-point type hit for fast real MOD");
     return builder.createConvert(loc, resultType,
                                  genFastMod(builder, loc, args[0], args[1]));
   } else {

From 5c8304dbb8d72800b9d0ab6e5be9856eb2e0f24d Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Thu, 25 Sep 2025 18:25:34 +0200
Subject: [PATCH 13/15] Add Flang driver check for -ffast-real-mod

---
 flang/test/Lower/Intrinsics/fast-real-mod.f90 | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/flang/test/Lower/Intrinsics/fast-real-mod.f90 b/flang/test/Lower/Intrinsics/fast-real-mod.f90
index 62b2c4d58af02..6cb90fe9fe233 100644
--- a/flang/test/Lower/Intrinsics/fast-real-mod.f90
+++ b/flang/test/Lower/Intrinsics/fast-real-mod.f90
@@ -1,5 +1,8 @@
+! RUN: %flang -ffast-real-mod -### -c %s 2>&1 | FileCheck %s -check-prefix CHECK-FAST-REAL-MOD
 ! RUN: %flang_fc1 -ffast-real-mod -emit-mlir -o - %s | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
 
+! CHECK-FAST-REAL-MOD: "-ffast-real-mod"
+
 ! CHECK: module attributes {{{.*}}fir.fast_real_mod = true{{.*}}}
 
 ! CHECK-LABEL: @_QPmod_real4

From 52c48db8b64010d8500d7a3041f2a5b518d520e0 Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Fri, 26 Sep 2025 16:09:20 +0200
Subject: [PATCH 14/15] Add -fno-fast-real-mod

---
 clang/include/clang/Driver/Options.td         | 1 +
 clang/lib/Driver/ToolChains/Flang.cpp         | 2 ++
 flang/lib/Frontend/CompilerInvocation.cpp     | 2 ++
 flang/test/Driver/fast-real-mod.f90           | 9 +++++++++
 flang/test/Lower/Intrinsics/fast-real-mod.f90 | 3 ---
 flang/test/Lower/Intrinsics/mod.f90           | 1 +
 6 files changed, 15 insertions(+), 3 deletions(-)
 create mode 100644 flang/test/Driver/fast-real-mod.f90

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 4dc4acd5603cb..32a36f4f788ac 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2751,6 +2751,7 @@ def fno_unsafe_math_optimizations : Flag<["-"], "fno-unsafe-math-optimizations">
 def fassociative_math : Flag<["-"], "fassociative-math">, Visibility<[ClangOption, FlangOption]>, Group<f_Group>;
 def fno_associative_math : Flag<["-"], "fno-associative-math">, Visibility<[ClangOption, FlangOption]>, Group<f_Group>;
 def ffast_real_mod : Flag<["-"], "ffast-real-mod">, Visibility<[FlangOption, FC1Option]>, Group<f_Group>;
+def fno_fast_real_mod : Flag<["-"], "fno-fast-real-mod">, Visibility<[FlangOption, FC1Option]>, Group<f_Group>;
 defm reciprocal_math : BoolFOption<"reciprocal-math",
   LangOpts<"AllowRecip">, DefaultFalse,
   PosFlag<SetTrue, [], [ClangOption, CC1Option, FC1Option, FlangOption],
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index fbaa083d204b8..19d3d639a71d3 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -769,6 +769,8 @@ static void addFloatingPointOptions(const Driver &D, const ArgList &Args,
 
   if (Args.hasArg(options::OPT_ffast_real_mod))
     CmdArgs.push_back("-ffast-real-mod");
+  if (Args.hasArg(options::OPT_fno_fast_real_mod))
+    CmdArgs.push_back("-fno-fast-real-mod");
 }
 
 static void renderRemarksOptions(const ArgList &Args, ArgStringList &CmdArgs,
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index d876d6a741303..afbb4fe34f536 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1426,6 +1426,8 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
 
   if (args.hasArg(clang::driver::options::OPT_ffast_real_mod))
     opts.FastRealMod = true;
+  if (args.hasArg(clang::driver::options::OPT_fno_fast_real_mod))
+    opts.FastRealMod = false;
 
   return true;
 }
diff --git a/flang/test/Driver/fast-real-mod.f90 b/flang/test/Driver/fast-real-mod.f90
new file mode 100644
index 0000000000000..8184f334c3d85
--- /dev/null
+++ b/flang/test/Driver/fast-real-mod.f90
@@ -0,0 +1,9 @@
+! RUN: %flang -ffast-real-mod -### -c %s 2>&1 | FileCheck %s -check-prefix CHECK-FAST-REAL-MOD
+! RUN: %flang -fno-fast-real-mod -### -c %s 2>&1 | FileCheck %s -check-prefix CHECK-NO-FAST-REAL-MOD
+
+! CHECK-FAST-REAL-MOD: "-ffast-real-mod"
+! CHECK-NO-FAST-REAL-MOD: "-fno-fast-real-mod"
+
+program test
+    ! nothing to be done in here
+end program test
diff --git a/flang/test/Lower/Intrinsics/fast-real-mod.f90 b/flang/test/Lower/Intrinsics/fast-real-mod.f90
index 6cb90fe9fe233..62b2c4d58af02 100644
--- a/flang/test/Lower/Intrinsics/fast-real-mod.f90
+++ b/flang/test/Lower/Intrinsics/fast-real-mod.f90
@@ -1,8 +1,5 @@
-! RUN: %flang -ffast-real-mod -### -c %s 2>&1 | FileCheck %s -check-prefix CHECK-FAST-REAL-MOD
 ! RUN: %flang_fc1 -ffast-real-mod -emit-mlir -o - %s | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
 
-! CHECK-FAST-REAL-MOD: "-ffast-real-mod"
-
 ! CHECK: module attributes {{{.*}}fir.fast_real_mod = true{{.*}}}
 
 ! CHECK-LABEL: @_QPmod_real4
diff --git a/flang/test/Lower/Intrinsics/mod.f90 b/flang/test/Lower/Intrinsics/mod.f90
index 5bc81d923b800..0577168bfbf8e 100644
--- a/flang/test/Lower/Intrinsics/mod.f90
+++ b/flang/test/Lower/Intrinsics/mod.f90
@@ -1,4 +1,5 @@
 ! RUN: bbc -emit-fir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
+! RUN: %flang_fc1 -ffast-real-mod -fno-fast-real-mod -emit-fir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
 
 ! CHECK-LABEL: func @_QPmod_testr4(
 subroutine mod_testr4(r, a, p)

From d7beb16c26818ec2330f5fc9340c826caab3c88c Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Tue, 30 Sep 2025 22:37:12 +0200
Subject: [PATCH 15/15] Put the MOD optimization under AFN and add
 -fno-fast-real-mod

---
 clang/include/clang/Driver/Options.td         |  5 +--
 clang/lib/Driver/ToolChains/Flang.cpp         |  8 ++---
 flang/include/flang/Support/LangOptions.def   |  2 +-
 flang/lib/Frontend/CompilerInvocation.cpp     |  4 +--
 flang/lib/Frontend/FrontendActions.cpp        |  4 +--
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 14 ++++----
 flang/test/Driver/fast-real-mod.f90           |  2 --
 flang/test/Lower/Intrinsics/fast-real-mod.f90 | 34 +++++++++++--------
 flang/test/Lower/Intrinsics/mod.f90           |  1 -
 9 files changed, 38 insertions(+), 36 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 32a36f4f788ac..c86d17cfc4289 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2750,8 +2750,9 @@ def fno_unsafe_math_optimizations : Flag<["-"], "fno-unsafe-math-optimizations">
   Group<f_Group>;
 def fassociative_math : Flag<["-"], "fassociative-math">, Visibility<[ClangOption, FlangOption]>, Group<f_Group>;
 def fno_associative_math : Flag<["-"], "fno-associative-math">, Visibility<[ClangOption, FlangOption]>, Group<f_Group>;
-def ffast_real_mod : Flag<["-"], "ffast-real-mod">, Visibility<[FlangOption, FC1Option]>, Group<f_Group>;
-def fno_fast_real_mod : Flag<["-"], "fno-fast-real-mod">, Visibility<[FlangOption, FC1Option]>, Group<f_Group>;
+def fno_fast_real_mod : Flag<["-"], "fno-fast-real-mod">,
+  Group<f_Group>, Visibility<[FlangOption, FC1Option]>,
+  HelpText<"Disable optimization of MOD for REAL types in presence of -ffast-math">;
 defm reciprocal_math : BoolFOption<"reciprocal-math",
   LangOpts<"AllowRecip">, DefaultFalse,
   PosFlag<SetTrue, [], [ClangOption, CC1Option, FC1Option, FlangOption],
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 19d3d639a71d3..5cea602bbacb8 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -739,6 +739,9 @@ static void addFloatingPointOptions(const Driver &D, const ArgList &Args,
                                          complexRangeKindToStr(Range)));
   }
 
+  if (Args.hasArg(options::OPT_fno_fast_real_mod))
+    CmdArgs.push_back("-fno-fast-real-mod");
+
   if (!HonorINFs && !HonorNaNs && AssociativeMath && ReciprocalMath &&
       ApproxFunc && !SignedZeros &&
       (FPContract == "fast" || FPContract.empty())) {
@@ -766,11 +769,6 @@ static void addFloatingPointOptions(const Driver &D, const ArgList &Args,
 
   if (ReciprocalMath)
     CmdArgs.push_back("-freciprocal-math");
-
-  if (Args.hasArg(options::OPT_ffast_real_mod))
-    CmdArgs.push_back("-ffast-real-mod");
-  if (Args.hasArg(options::OPT_fno_fast_real_mod))
-    CmdArgs.push_back("-fno-fast-real-mod");
 }
 
 static void renderRemarksOptions(const ArgList &Args, ArgStringList &CmdArgs,
diff --git a/flang/include/flang/Support/LangOptions.def b/flang/include/flang/Support/LangOptions.def
index e310ecf37a52d..e7185c836f45b 100644
--- a/flang/include/flang/Support/LangOptions.def
+++ b/flang/include/flang/Support/LangOptions.def
@@ -61,7 +61,7 @@ LANGOPT(OpenMPNoNestedParallelism, 1, 0)
 /// Use SIMD only OpenMP support.
 LANGOPT(OpenMPSimd, 1, false)
 /// Enable fast MOD operations for REAL
-LANGOPT(FastRealMod, 1, false)
+LANGOPT(NoFastRealMod, 1, false)
 LANGOPT(VScaleMin, 32, 0)  ///< Minimum vscale range value
 LANGOPT(VScaleMax, 32, 0)  ///< Maximum vscale range value
 
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index afbb4fe34f536..ba0e294971774 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1424,10 +1424,8 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
     opts.setFPContractMode(Fortran::common::LangOptions::FPM_Fast);
   }
 
-  if (args.hasArg(clang::driver::options::OPT_ffast_real_mod))
-    opts.FastRealMod = true;
   if (args.hasArg(clang::driver::options::OPT_fno_fast_real_mod))
-    opts.FastRealMod = false;
+    opts.NoFastRealMod = true;
 
   return true;
 }
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index c5e5bc11547a8..e9e741f9ad308 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -277,11 +277,11 @@ bool CodeGenAction::beginSourceFileAction() {
                               ci.getInvocation().getLangOpts().OpenMPVersion);
   }
 
-  if (ci.getInvocation().getLangOpts().FastRealMod) {
+  if (ci.getInvocation().getLangOpts().NoFastRealMod) {
     mlir::ModuleOp mod = lb.getModule();
     mod.getOperation()->setAttr(
         mlir::StringAttr::get(mod.getContext(),
-                              llvm::Twine{"fir.fast_real_mod"}),
+                              llvm::Twine{"fir.no_fast_real_mod"}),
         mlir::BoolAttr::get(mod.getContext(), true));
   }
 
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 591e194ed2891..4a326963db69c 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -7030,9 +7030,11 @@ static mlir::Value genFastMod(fir::FirOpBuilder &builder, mlir::Location loc,
 mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
                                      llvm::ArrayRef<mlir::Value> args) {
   auto mod = builder.getModule();
-  bool useFastRealMod = false;
-  if (auto attr = mod->getAttrOfType<mlir::BoolAttr>("fir.fast_real_mod"))
-    useFastRealMod = attr.getValue();
+  bool dontUseFastRealMod = false;
+  bool canUseApprox = mlir::arith::bitEnumContainsAny(
+      builder.getFastMathFlags(), mlir::arith::FastMathFlags::afn);
+  if (auto attr = mod->getAttrOfType<mlir::BoolAttr>("fir.no_fast_real_mod"))
+    dontUseFastRealMod = attr.getValue();
 
   assert(args.size() == 2);
   if (resultType.isUnsignedInteger()) {
@@ -7045,9 +7047,9 @@ mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
   if (mlir::isa<mlir::IntegerType>(resultType))
     return mlir::arith::RemSIOp::create(builder, loc, args[0], args[1]);
 
-  if (useFastRealMod && resultType.isFloat()) {
-    // If fast MOD for REAL has been requested, generate less precise,
-    // but faster code directly.
+  if (resultType.isFloat() && canUseApprox && !dontUseFastRealMod) {
+    // Treat MOD as an approximate function and code-gen inline code
+    // instead of calling into the Fortran runtime library.
     return builder.createConvert(loc, resultType,
                                  genFastMod(builder, loc, args[0], args[1]));
   } else {
diff --git a/flang/test/Driver/fast-real-mod.f90 b/flang/test/Driver/fast-real-mod.f90
index 8184f334c3d85..4ea9b26e64753 100644
--- a/flang/test/Driver/fast-real-mod.f90
+++ b/flang/test/Driver/fast-real-mod.f90
@@ -1,7 +1,5 @@
-! RUN: %flang -ffast-real-mod -### -c %s 2>&1 | FileCheck %s -check-prefix CHECK-FAST-REAL-MOD
 ! RUN: %flang -fno-fast-real-mod -### -c %s 2>&1 | FileCheck %s -check-prefix CHECK-NO-FAST-REAL-MOD
 
-! CHECK-FAST-REAL-MOD: "-ffast-real-mod"
 ! CHECK-NO-FAST-REAL-MOD: "-fno-fast-real-mod"
 
 program test
diff --git a/flang/test/Lower/Intrinsics/fast-real-mod.f90 b/flang/test/Lower/Intrinsics/fast-real-mod.f90
index 62b2c4d58af02..f80f7203ad1a2 100644
--- a/flang/test/Lower/Intrinsics/fast-real-mod.f90
+++ b/flang/test/Lower/Intrinsics/fast-real-mod.f90
@@ -1,6 +1,8 @@
-! RUN: %flang_fc1 -ffast-real-mod -emit-mlir -o - %s | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
+! RUN: %flang_fc1 -ffast-math -emit-mlir -o - %s | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
+! RUN: %flang_fc1 -ffast-math -fno-fast-real-mod -emit-mlir -o - %s | FileCheck %s --check-prefixes=CHECK-NFRM%if target=x86_64{{.*}} %{,CHECK-NFRM-KIND10%}%if flang-supports-f128-math %{,CHECK-NFRM-KIND16%}
 
-! CHECK: module attributes {{{.*}}fir.fast_real_mod = true{{.*}}}
+! TODO: also verify that fir.no_fast_real_mod is absent from the module attributes when -fno-fast-real-mod is not given.
+! CHECK-NFRM: module attributes {{{.*}}fir.no_fast_real_mod = true{{.*}}}
 
 ! CHECK-LABEL: @_QPmod_real4
 subroutine mod_real4(r, a, p)
@@ -11,12 +13,13 @@ subroutine mod_real4(r, a, p)
 ! CHECK: %[[R:.*]] = fir.declare{{.*}}r"
 ! CHECK: %[[A_LOAD:.*]] = fir.load %[[A]]
 ! CHECK: %[[P_LOAD:.*]] = fir.load %[[P]]
-! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f32
+! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<fast> : f32
 ! CHECK: %[[CV1:.*]] = fir.convert %[[DIV]] : (f32) -> si32
 ! CHECK: %[[CV2:.*]] = fir.convert %[[CV1]] : (si32) -> f32
-! CHECK: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f32
-! CHECK: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f32
+! CHECK: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<fast> : f32
+! CHECK: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<fast> : f32
 ! CHECK: fir.store %[[SUB]] to %[[R]] : !fir.ref<f32>
+! CHECK-NFRM: fir.call @_FortranAModReal4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (f32, f32, !fir.ref<i8>, i32) -> f32
     r = mod(a, p)
 end subroutine mod_real4
 
@@ -29,12 +32,13 @@ subroutine mod_real8(r, a, p)
 ! CHECK: %[[R:.*]] = fir.declare{{.*}}r"
 ! CHECK: %[[A_LOAD:.*]] = fir.load %[[A]]
 ! CHECK: %[[P_LOAD:.*]] = fir.load %[[P]]
-! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f64
+! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<fast> : f64
 ! CHECK: %[[CV1:.*]] = fir.convert %[[DIV]] : (f64) -> si64
 ! CHECK: %[[CV2:.*]] = fir.convert %[[CV1]] : (si64) -> f64
-! CHECK: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f64
-! CHECK: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f64
+! CHECK: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<fast> : f64
+! CHECK: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<fast> : f64
 ! CHECK: fir.store %[[SUB]] to %[[R]] : !fir.ref<f64>
+! CHECK-NFRM: fir.call @_FortranAModReal8(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (f64, f64, !fir.ref<i8>, i32) -> f64
     r = mod(a, p)
 end subroutine mod_real8
 
@@ -48,12 +52,13 @@ subroutine mod_real10(r, a, p)
 ! CHECK-KIND10: %[[R:.*]] = fir.declare{{.*}}r"
 ! CHECK-KIND10: %[[A_LOAD:.*]] = fir.load %[[A]]
 ! CHECK-KIND10: %[[P_LOAD:.*]] = fir.load %[[P]]
-! CHECK-KIND10: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f80
+! CHECK-KIND10: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<fast> : f80
 ! CHECK-KIND10: %[[CV1:.*]] = fir.convert %[[DIV]] : (f80) -> si80
 ! CHECK-KIND10: %[[CV2:.*]] = fir.convert %[[CV1]] : (si80) -> f80
-! CHECK-KIND10: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f80
-! CHECK-KIND10: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f80
+! CHECK-KIND10: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<fast> : f80
+! CHECK-KIND10: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<fast> : f80
 ! CHECK-KIND10: fir.store %[[SUB]] to %[[R]] : !fir.ref<f80>
+! CHECK-NFRM-KIND10: fir.call @_FortranAModReal10(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (f80, f80, !fir.ref<i8>, i32) -> f80
     r = mod(a, p)
 end subroutine mod_real10
 
@@ -67,11 +72,12 @@ subroutine mod_real16(r, a, p)
 ! CHECK-KIND16: %[[R:.*]] = fir.declare{{.*}}r"
 ! CHECK-KIND16: %[[A_LOAD:.*]] = fir.load %[[A]]
 ! CHECK-KIND16: %[[P_LOAD:.*]] = fir.load %[[P]]
-! CHECK-KIND16: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<contract> : f128
+! CHECK-KIND16: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath<fast> : f128
 ! CHECK-KIND16: %[[CV1:.*]] = fir.convert %[[DIV]] : (f128) -> si128
 ! CHECK-KIND16: %[[CV2:.*]] = fir.convert %[[CV1]] : (si128) -> f128
-! CHECK-KIND16: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<contract> : f128
-! CHECK-KIND16: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<contract> : f128
+! CHECK-KIND16: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath<fast> : f128
+! CHECK-KIND16: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath<fast> : f128
 ! CHECK-KIND16: fir.store %[[SUB]] to %[[R]] : !fir.ref<f128>
+! CHECK-NFRM-KIND16: fir.call @_FortranAModReal16(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (f128, f128, !fir.ref<i8>, i32) -> f128
     r = mod(a, p)
 end subroutine mod_real16
diff --git a/flang/test/Lower/Intrinsics/mod.f90 b/flang/test/Lower/Intrinsics/mod.f90
index 0577168bfbf8e..5bc81d923b800 100644
--- a/flang/test/Lower/Intrinsics/mod.f90
+++ b/flang/test/Lower/Intrinsics/mod.f90
@@ -1,5 +1,4 @@
 ! RUN: bbc -emit-fir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
-! RUN: %flang_fc1 -ffast-real-mod -fno-fast-real-mod -emit-fir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
 
 ! CHECK-LABEL: func @_QPmod_testr4(
 subroutine mod_testr4(r, a, p)