From 4ae12ca7c2dea0246c887bbacaa6c20a20aaaf36 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzon@farzon.org>
Date: Thu, 29 Aug 2024 15:07:02 -0400
Subject: [PATCH 1/3] [LegacyPM][DirectX] Add the scalarizer pass for DXIL
 legalization

---
 llvm/include/llvm/InitializePasses.h          |  1 +
 llvm/include/llvm/LinkAllPasses.h             |  1 +
 .../llvm/Transforms/Scalar/Scalarizer.h       | 14 ++++++++
 llvm/lib/Target/DirectX/DXILOpLowering.cpp    |  2 ++
 .../Target/DirectX/DirectXTargetMachine.cpp   |  4 +++
 llvm/lib/Transforms/Scalar/Scalar.cpp         |  1 +
 llvm/lib/Transforms/Scalar/Scalarizer.cpp     | 32 ++++++++++++++++++-
 llvm/test/CodeGen/DirectX/sin.ll              | 16 +++++-----
 8 files changed, 62 insertions(+), 9 deletions(-)
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 6605c6fde9251..f8b9f42d9dbec 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -276,6 +276,7 @@ void initializeSafepointIRVerifierPass(PassRegistry &);
 void initializeSelectOptimizePass(PassRegistry &);
 void initializeScalarEvolutionWrapperPassPass(PassRegistry &);
 void initializeScalarizeMaskedMemIntrinLegacyPassPass(PassRegistry &);
+void initializeScalarizerLegacyPassPass(PassRegistry&);
 void initializeScavengerTestPass(PassRegistry &);
 void initializeScopedNoAliasAAWrapperPassPass(PassRegistry &);
 void initializeSeparateConstOffsetFromGEPLegacyPassPass(PassRegistry &);
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index 1da02153d846f..92b59a66567c9 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -130,6 +130,7 @@ struct ForcePassLinking {
     (void)llvm::createLowerAtomicPass();
     (void)llvm::createLoadStoreVectorizerPass();
     (void)llvm::createPartiallyInlineLibCallsPass();
+    (void)llvm::createScalarizerPass();
     (void)llvm::createSeparateConstOffsetFromGEPPass();
     (void)llvm::createSpeculativeExecutionPass();
     (void)llvm::createSpeculativeExecutionIfHasBranchDivergencePass();
diff --git a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h
index 45e25cbf28214..7454f00c2ea35 100644
--- a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h
@@ -19,6 +19,7 @@
 #define LLVM_TRANSFORMS_SCALAR_SCALARIZER_H
 
 #include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
 #include <optional>
 
 namespace llvm {
@@ -50,6 +51,19 @@ class ScalarizerPass : public PassInfoMixin<ScalarizerPass> {
   void setScalarizeLoadStore(bool Value) { Options.ScalarizeLoadStore = Value; }
   void setScalarizeMinBits(unsigned Value) { Options.ScalarizeMinBits = Value; }
 };
+
+/// Create a legacy pass manager instance of the Scalarizer pass
+FunctionPass *createScalarizerPass();
+
+class ScalarizerLegacyPass : public FunctionPass {
+public:
+  static char ID;
+  ScalarizerPassOptions Options;
+  ScalarizerLegacyPass();
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage& AU) const override;
+};
+
 }
 
 #endif /* LLVM_TRANSFORMS_SCALAR_SCALARIZER_H */
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index d98d0bfde04fc..32126612d1481 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -24,6 +24,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Scalar/Scalarizer.h"
 
 #define DEBUG_TYPE "dxil-op-lower"
 
@@ -521,6 +522,7 @@ class DXILOpLoweringLegacy : public ModulePass {
   static char ID; // Pass identification.
   void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
     AU.addRequired<DXILIntrinsicExpansionLegacy>();
+    AU.addRequired<ScalarizerLegacyPass>();
     AU.addRequired<DXILResourceWrapperPass>();
     AU.addPreserved<DXILResourceWrapperPass>();
   }
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index a29fc21042163..531e74eda04bc 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/MC/MCSectionDXContainer.h"
@@ -36,6 +37,7 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Transforms/Scalar/Scalarizer.h"
 #include <optional>
 
 using namespace llvm;
@@ -44,6 +46,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() {
   RegisterTargetMachine<DirectXTargetMachine> X(getTheDirectXTarget());
   auto *PR = PassRegistry::getPassRegistry();
   initializeDXILIntrinsicExpansionLegacyPass(*PR);
+  initializeScalarizerLegacyPassPass(*PR);
   initializeDXILPrepareModulePass(*PR);
   initializeEmbedDXILPassPass(*PR);
   initializeWriteDXILPassPass(*PR);
@@ -83,6 +86,7 @@ class DirectXPassConfig : public TargetPassConfig {
   FunctionPass *createTargetRegisterAllocator(bool) override { return nullptr; }
   void addCodeGenPrepare() override {
     addPass(createDXILIntrinsicExpansionLegacyPass());
+    addPass(createScalarizerPass());
     addPass(createDXILOpLoweringLegacyPass());
     addPass(createDXILFinalizeLinkageLegacyPass());
     addPass(createDXILTranslateMetadataLegacyPass());
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index 7aeee1d31f7e7..fa6e671830d96 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -21,6 +21,7 @@ using namespace llvm;
 void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeConstantHoistingLegacyPassPass(Registry);
   initializeDCELegacyPassPass(Registry);
+  initializeScalarizerLegacyPassPass(Registry);
   initializeGVNLegacyPassPass(Registry);
   initializeEarlyCSELegacyPassPass(Registry);
   initializeEarlyCSEMemSSALegacyPassPass(Registry);
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index 2bed3480da1cd..ad441914428c4 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -36,6 +36,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -339,9 +340,25 @@ class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
   const bool ScalarizeLoadStore;
   const unsigned ScalarizeMinBits;
 };
-
 } // end anonymous namespace
 
+ScalarizerLegacyPass::ScalarizerLegacyPass() : FunctionPass(ID) {
+    Options.ScalarizeVariableInsertExtract = true;
+    Options.ScalarizeLoadStore = true;
+}
+
+void ScalarizerLegacyPass::getAnalysisUsage(AnalysisUsage& AU) const {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+}
+
+char ScalarizerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ScalarizerLegacyPass, "scalarizer",
+                      "Scalarize vector operations", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer",
+                    "Scalarize vector operations", false, false)
+
 Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
                      const VectorSplit &VS, ValueVector *cachePtr)
     : BB(bb), BBI(bbi), V(v), VS(VS), CachePtr(cachePtr) {
@@ -414,6 +431,19 @@ Value *Scatterer::operator[](unsigned Frag) {
   return CV[Frag];
 }
 
+bool ScalarizerLegacyPass::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  ScalarizerVisitor Impl(DT, Options);
+  return Impl.visit(F);
+}
+
+FunctionPass *llvm::createScalarizerPass() {
+  return new ScalarizerLegacyPass();
+}
+
 bool ScalarizerVisitor::visit(Function &F) {
   assert(Gathered.empty() && Scattered.empty());
 
diff --git a/llvm/test/CodeGen/DirectX/sin.ll b/llvm/test/CodeGen/DirectX/sin.ll
index f309a36c6b8e6..79143bfa0a529 100644
--- a/llvm/test/CodeGen/DirectX/sin.ll
+++ b/llvm/test/CodeGen/DirectX/sin.ll
@@ -7,19 +7,19 @@
 ; Function Attrs: noinline nounwind optnone
 define noundef float @sin_float(float noundef %a) #0 {
 entry:
-  %a.addr = alloca float, align 4
-  store float %a, ptr %a.addr, align 4
-  %0 = load float, ptr %a.addr, align 4
-  %1 = call float @llvm.sin.f32(float %0)
+  %1 = call float @llvm.sin.f32(float %a)
   ret float %1
 }
 
 ; Function Attrs: noinline nounwind optnone
 define noundef half @sin_half(half noundef %a) #0 {
 entry:
-  %a.addr = alloca half, align 2
-  store half %a, ptr %a.addr, align 2
-  %0 = load half, ptr %a.addr, align 2
-  %1 = call half @llvm.sin.f16(half %0)
+  %1 = call half @llvm.sin.f16(half %a)
   ret half %1
 }
+
+define noundef <4 x float> @sin_float4(<4 x float> noundef %a) #0 {
+entry:
+  %2 = call <4 x float> @llvm.sin.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}

From c554589afef069a3e957f1316898f1883e0ab636 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzon@farzon.org>
Date: Wed, 4 Sep 2024 12:50:59 -0400
Subject: [PATCH 2/3] Update tests and modify constructor to take options

---
 llvm/include/llvm/InitializePasses.h          |  2 +-
 .../llvm/Transforms/Scalar/Scalarizer.h       | 15 ++----
 llvm/lib/Target/DirectX/DXILOpLowering.cpp    |  2 -
 .../Target/DirectX/DirectXTargetMachine.cpp   |  7 ++-
 llvm/lib/Transforms/Scalar/Scalarizer.cpp     | 27 ++++++----
 llvm/test/CodeGen/DirectX/acos.ll             | 25 ++++++++--
 llvm/test/CodeGen/DirectX/asin.ll             | 25 ++++++++--
 llvm/test/CodeGen/DirectX/atan.ll             | 25 ++++++++--
 llvm/test/CodeGen/DirectX/ceil.ll             | 21 +++++++-
 llvm/test/CodeGen/DirectX/cos.ll              | 21 +++++++-
 llvm/test/CodeGen/DirectX/cosh.ll             | 25 ++++++++--
 llvm/test/CodeGen/DirectX/exp2.ll             | 50 +++++++++++--------
 llvm/test/CodeGen/DirectX/fabs.ll             | 22 +++++++-
 llvm/test/CodeGen/DirectX/floor.ll            | 23 ++++++++-
 llvm/test/CodeGen/DirectX/isinf.ll            | 24 ++++-----
 llvm/test/CodeGen/DirectX/reversebits.ll      | 21 +++++++-
 llvm/test/CodeGen/DirectX/round.ll            | 22 +++++++-
 llvm/test/CodeGen/DirectX/saturate.ll         | 17 +++----
 llvm/test/CodeGen/DirectX/scalar-store.ll     | 17 +++++++
 .../DirectX/scalarization_pass_order.ll       | 45 +++++++++++++++++
 llvm/test/CodeGen/DirectX/sin.ll              | 30 ++++++++---
 llvm/test/CodeGen/DirectX/sinh.ll             | 25 ++++++++--
 llvm/test/CodeGen/DirectX/sqrt.ll             | 25 ++++++++--
 llvm/test/CodeGen/DirectX/tan.ll              | 21 +++++++-
 llvm/test/CodeGen/DirectX/tanh.ll             | 21 +++++++-
 llvm/test/CodeGen/DirectX/trunc.ll            | 21 +++++++-
 llvm/tools/opt/optdriver.cpp                  |  1 +
 27 files changed, 472 insertions(+), 108 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/scalar-store.ll
 create mode 100644 llvm/test/CodeGen/DirectX/scalarization_pass_order.ll

diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index f8b9f42d9dbec..4352099d6dbb9 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -276,7 +276,7 @@ void initializeSafepointIRVerifierPass(PassRegistry &);
 void initializeSelectOptimizePass(PassRegistry &);
 void initializeScalarEvolutionWrapperPassPass(PassRegistry &);
 void initializeScalarizeMaskedMemIntrinLegacyPassPass(PassRegistry &);
-void initializeScalarizerLegacyPassPass(PassRegistry&);
+void initializeScalarizerLegacyPassPass(PassRegistry &);
 void initializeScavengerTestPass(PassRegistry &);
 void initializeScopedNoAliasAAWrapperPassPass(PassRegistry &);
 void initializeSeparateConstOffsetFromGEPLegacyPassPass(PassRegistry &);
diff --git a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h
index 7454f00c2ea35..4d2a1a2f889a3 100644
--- a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h
@@ -19,12 +19,12 @@
 #define LLVM_TRANSFORMS_SCALAR_SCALARIZER_H
 
 #include "llvm/IR/PassManager.h"
-#include "llvm/Pass.h"
 #include <optional>
 
 namespace llvm {
 
 class Function;
+class FunctionPass;
 
 struct ScalarizerPassOptions {
   // These options correspond 1:1 to cl::opt options defined in
@@ -53,17 +53,8 @@ class ScalarizerPass : public PassInfoMixin<ScalarizerPass> {
 };
 
 /// Create a legacy pass manager instance of the Scalarizer pass
-FunctionPass *createScalarizerPass();
-
-class ScalarizerLegacyPass : public FunctionPass {
-public:
-  static char ID;
-  ScalarizerPassOptions Options;
-  ScalarizerLegacyPass();
-  bool runOnFunction(Function &F) override;
-  void getAnalysisUsage(AnalysisUsage& AU) const override;
-};
-
+FunctionPass *createScalarizerPass(
+    const ScalarizerPassOptions &Options = ScalarizerPassOptions());
 }
 
 #endif /* LLVM_TRANSFORMS_SCALAR_SCALARIZER_H */
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 32126612d1481..d98d0bfde04fc 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -24,7 +24,6 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Scalar/Scalarizer.h"
 
 #define DEBUG_TYPE "dxil-op-lower"
 
@@ -522,7 +521,6 @@ class DXILOpLoweringLegacy : public ModulePass {
   static char ID; // Pass identification.
   void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
     AU.addRequired<DXILIntrinsicExpansionLegacy>();
-    AU.addRequired<ScalarizerLegacyPass>();
     AU.addRequired<DXILResourceWrapperPass>();
     AU.addPreserved<DXILResourceWrapperPass>();
   }
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index 531e74eda04bc..f021e24ac7e26 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -26,9 +26,9 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/InitializePasses.h"
 #include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/MC/MCSectionDXContainer.h"
 #include "llvm/MC/SectionKind.h"
 #include "llvm/MC/TargetRegistry.h"
@@ -86,7 +86,10 @@ class DirectXPassConfig : public TargetPassConfig {
   FunctionPass *createTargetRegisterAllocator(bool) override { return nullptr; }
   void addCodeGenPrepare() override {
     addPass(createDXILIntrinsicExpansionLegacyPass());
-    addPass(createScalarizerPass());
+    ScalarizerPassOptions DxilScalarOptions;
+    // The only non-default option we need to set is ScalarizeLoadStore.
+    DxilScalarOptions.ScalarizeLoadStore = true;
+    addPass(createScalarizerPass(DxilScalarOptions));
     addPass(createDXILOpLoweringLegacyPass());
     addPass(createDXILFinalizeLinkageLegacyPass());
     addPass(createDXILTranslateMetadataLegacyPass());
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index ad441914428c4..01d24335df226 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -340,16 +340,25 @@ class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
   const bool ScalarizeLoadStore;
   const unsigned ScalarizeMinBits;
 };
+
+class ScalarizerLegacyPass : public FunctionPass {
+public:
+  static char ID;
+  ScalarizerPassOptions Options;
+  ScalarizerLegacyPass() : FunctionPass(ID), Options() {}
+  ScalarizerLegacyPass(const ScalarizerPassOptions &Options);
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
 } // end anonymous namespace
 
-ScalarizerLegacyPass::ScalarizerLegacyPass() : FunctionPass(ID) {
-    Options.ScalarizeVariableInsertExtract = true;
-    Options.ScalarizeLoadStore = true;
-}
+ScalarizerLegacyPass::ScalarizerLegacyPass(const ScalarizerPassOptions &Options)
+    : FunctionPass(ID), Options(Options) {}
 
-void ScalarizerLegacyPass::getAnalysisUsage(AnalysisUsage& AU) const {
-    AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addPreserved<DominatorTreeWrapperPass>();
+void ScalarizerLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<DominatorTreeWrapperPass>();
+  AU.addPreserved<DominatorTreeWrapperPass>();
 }
 
 char ScalarizerLegacyPass::ID = 0;
@@ -440,8 +449,8 @@ bool ScalarizerLegacyPass::runOnFunction(Function &F) {
   return Impl.visit(F);
 }
 
-FunctionPass *llvm::createScalarizerPass() {
-  return new ScalarizerLegacyPass();
+FunctionPass *llvm::createScalarizerPass(const ScalarizerPassOptions &Options) {
+  return new ScalarizerLegacyPass(Options);
 }
 
 bool ScalarizerVisitor::visit(Function &F) {
diff --git a/llvm/test/CodeGen/DirectX/acos.ll b/llvm/test/CodeGen/DirectX/acos.ll
index cc32182395627..f4a10eb368ebf 100644
--- a/llvm/test/CodeGen/DirectX/acos.ll
+++ b/llvm/test/CodeGen/DirectX/acos.ll
@@ -1,20 +1,39 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for acos are generated for float and half.
 
-define noundef float @tan_float(float noundef %a) {
+define noundef float @acos_float(float noundef %a) {
 entry:
 ; CHECK:call float @dx.op.unary.f32(i32 15, float %{{.*}})
   %elt.acos = call float @llvm.acos.f32(float %a)
   ret float %elt.acos
 }
 
-define noundef half @tan_half(half noundef %a) {
+define noundef half @acos_half(half noundef %a) {
 entry:
 ; CHECK:call half @dx.op.unary.f16(i32 15, half %{{.*}})
   %elt.acos = call half @llvm.acos.f16(half %a)
   ret half %elt.acos
 }
 
+define noundef <4 x float> @acos_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 15, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 15, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 15, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 15, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.acos.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.acos.f16(half)
 declare float @llvm.acos.f32(float)
+declare <4 x float> @llvm.acos.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/asin.ll b/llvm/test/CodeGen/DirectX/asin.ll
index 06e3bab545a6a..bd948f593c24e 100644
--- a/llvm/test/CodeGen/DirectX/asin.ll
+++ b/llvm/test/CodeGen/DirectX/asin.ll
@@ -1,20 +1,39 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for asin are generated for float and half.
 
-define noundef float @tan_float(float noundef %a) {
+define noundef float @asin_float(float noundef %a) {
 entry:
 ; CHECK:call float @dx.op.unary.f32(i32 16, float %{{.*}})
   %elt.asin = call float @llvm.asin.f32(float %a)
   ret float %elt.asin
 }
 
-define noundef half @tan_half(half noundef %a) {
+define noundef half @asin_half(half noundef %a) {
 entry:
 ; CHECK:call half @dx.op.unary.f16(i32 16, half %{{.*}})
   %elt.asin = call half @llvm.asin.f16(half %a)
   ret half %elt.asin
 }
 
+define noundef <4 x float> @asin_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 16, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 16, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 16, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 16, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.asin.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.asin.f16(half)
 declare float @llvm.asin.f32(float)
+declare <4 x float> @llvm.asin.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/atan.ll b/llvm/test/CodeGen/DirectX/atan.ll
index d7c4cd00e286a..58899ab49bdb8 100644
--- a/llvm/test/CodeGen/DirectX/atan.ll
+++ b/llvm/test/CodeGen/DirectX/atan.ll
@@ -1,20 +1,39 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for atan are generated for float and half.
 
-define noundef float @tan_float(float noundef %a) {
+define noundef float @atan_float(float noundef %a) {
 entry:
 ; CHECK:call float @dx.op.unary.f32(i32 17, float %{{.*}})
   %elt.atan = call float @llvm.atan.f32(float %a)
   ret float %elt.atan
 }
 
-define noundef half @tan_half(half noundef %a) {
+define noundef half @atan_half(half noundef %a) {
 entry:
 ; CHECK:call half @dx.op.unary.f16(i32 17, half %{{.*}})
   %elt.atan = call half @llvm.atan.f16(half %a)
   ret half %elt.atan
 }
 
+define noundef <4 x float> @atan_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 17, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 17, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 17, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 17, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.atan.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.atan.f16(half)
 declare float @llvm.atan.f32(float)
+declare <4 x float> @llvm.atan.v4f32(<4 x float>) 
diff --git a/llvm/test/CodeGen/DirectX/ceil.ll b/llvm/test/CodeGen/DirectX/ceil.ll
index 48bc5495a8e05..bd6e747c2fbf5 100644
--- a/llvm/test/CodeGen/DirectX/ceil.ll
+++ b/llvm/test/CodeGen/DirectX/ceil.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for ceil are generated for float and half.
 
@@ -16,5 +16,24 @@ entry:
   ret half %elt.ceil
 }
 
+define noundef <4 x float> @ceil_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 28, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 28, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 28, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 28, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.ceil.f16(half)
 declare float @llvm.ceil.f32(float)
+declare <4 x float> @llvm.ceil.v4f32(<4 x float>) 
diff --git a/llvm/test/CodeGen/DirectX/cos.ll b/llvm/test/CodeGen/DirectX/cos.ll
index 72f4bfca23f9d..85f5db25570b9 100644
--- a/llvm/test/CodeGen/DirectX/cos.ll
+++ b/llvm/test/CodeGen/DirectX/cos.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for cos are generated for float and half.
 
@@ -16,5 +16,24 @@ entry:
   ret half %elt.cos
 }
 
+define noundef <4 x float> @cos_float4(<4 x float> noundef %a) #0 {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.cos.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.cos.f16(half)
 declare float @llvm.cos.f32(float)
+declare <4 x float> @llvm.cos.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/cosh.ll b/llvm/test/CodeGen/DirectX/cosh.ll
index 91aaf893f3997..670a8a3eae086 100644
--- a/llvm/test/CodeGen/DirectX/cosh.ll
+++ b/llvm/test/CodeGen/DirectX/cosh.ll
@@ -1,20 +1,39 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for cosh are generated for float and half.
 
-define noundef float @tan_float(float noundef %a) {
+define noundef float @cosh_float(float noundef %a) {
 entry:
 ; CHECK:call float @dx.op.unary.f32(i32 18, float %{{.*}})
   %elt.cosh = call float @llvm.cosh.f32(float %a)
   ret float %elt.cosh
 }
 
-define noundef half @tan_half(half noundef %a) {
+define noundef half @cosh_half(half noundef %a) {
 entry:
 ; CHECK:call half @dx.op.unary.f16(i32 18, half %{{.*}})
   %elt.cosh = call half @llvm.cosh.f16(half %a)
   ret half %elt.cosh
 }
 
+define noundef <4 x float> @cosh_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 18, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 18, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 18, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 18, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.cosh.v4f32(<4 x float> %a)
+  ret <4 x float> %2
+}
+
 declare half @llvm.cosh.f16(half)
 declare float @llvm.cosh.f32(float)
+declare <4 x float> @llvm.cosh.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/exp2.ll b/llvm/test/CodeGen/DirectX/exp2.ll
index b70b87dedc4d1..6d16af6a5413e 100644
--- a/llvm/test/CodeGen/DirectX/exp2.ll
+++ b/llvm/test/CodeGen/DirectX/exp2.ll
@@ -1,31 +1,39 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.7-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for exp2 are generated for float and half.
-; CHECK:call float @dx.op.unary.f32(i32 21, float %{{.*}})
-; CHECK:call half @dx.op.unary.f16(i32 21, half %{{.*}})
 
-target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
-target triple = "dxil-pc-shadermodel6.7-library"
-
-; Function Attrs: noinline nounwind optnone
-define noundef float @exp2_float(float noundef %a) #0 {
+define noundef float @exp2_float(float noundef %a) {
 entry:
-  %a.addr = alloca float, align 4
-  store float %a, ptr %a.addr, align 4
-  %0 = load float, ptr %a.addr, align 4
-  %elt.exp2 = call float @llvm.exp2.f32(float %0)
+  ; CHECK:call float @dx.op.unary.f32(i32 21, float %{{.*}})
+  %elt.exp2 = call float @llvm.exp2.f32(float %a)
   ret float %elt.exp2
 }
 
-; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
-declare float @llvm.exp2.f32(float) #1
-
-; Function Attrs: noinline nounwind optnone
-define noundef half @exp2_half(half noundef %a) #0 {
+define noundef half @exp2_half(half noundef %a) {
 entry:
-  %a.addr = alloca half, align 2
-  store half %a, ptr %a.addr, align 2
-  %0 = load half, ptr %a.addr, align 2
-  %elt.exp2 = call half @llvm.exp2.f16(half %0)
+  ; CHECK:call half @dx.op.unary.f16(i32 21, half %{{.*}})
+  %elt.exp2 = call half @llvm.exp2.f16(half %a)
   ret half %elt.exp2
 }
+
+define noundef <4 x float> @exp2_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 21, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 21, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 21, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 21, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
+declare float @llvm.exp2.f32(float)
+declare half @llvm.exp2.f16(half)
+declare <4 x float> @llvm.exp2.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/fabs.ll b/llvm/test/CodeGen/DirectX/fabs.ll
index becbdf8d68aeb..6d903f1c927ac 100644
--- a/llvm/test/CodeGen/DirectX/fabs.ll
+++ b/llvm/test/CodeGen/DirectX/fabs.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for abs are generated for float, half, and double.
 
@@ -27,6 +27,26 @@ entry:
   ret double %elt.abs
 }
 
+; CHECK-LABEL: fabs_float4
+define noundef <4 x float> @fabs_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 6, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 6, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 6, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 6, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.fabs.f16(half)
 declare float @llvm.fabs.f32(float)
 declare double @llvm.fabs.f64(double)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/floor.ll b/llvm/test/CodeGen/DirectX/floor.ll
index f79f160e51e3b..8ad81e1459a5b 100644
--- a/llvm/test/CodeGen/DirectX/floor.ll
+++ b/llvm/test/CodeGen/DirectX/floor.ll
@@ -2,19 +2,38 @@
 
 ; Make sure dxil operation function calls for floor are generated for float and half.
 
-define noundef float @floor_float(float noundef %a) #0 {
+define noundef float @floor_float(float noundef %a) {
 entry:
 ; CHECK:call float @dx.op.unary.f32(i32 27, float %{{.*}})
   %elt.floor = call float @llvm.floor.f32(float %a)
   ret float %elt.floor
 }
 
-define noundef half @floor_half(half noundef %a) #0 {
+define noundef half @floor_half(half noundef %a) {
 entry:
 ; CHECK:call half @dx.op.unary.f16(i32 27, half %{{.*}})
   %elt.floor = call half @llvm.floor.f16(half %a)
   ret half %elt.floor
 }
 
+define noundef <4 x float> @floor_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 27, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 27, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 27, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 27, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.floor.v4f32(<4 x float> %a)
+  ret <4 x float> %2
+}
+
 declare half @llvm.floor.f16(half)
 declare float @llvm.floor.f32(float)
+declare <4 x float> @llvm.floor.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/isinf.ll b/llvm/test/CodeGen/DirectX/isinf.ll
index 295776b089347..03a00c40498d5 100644
--- a/llvm/test/CodeGen/DirectX/isinf.ll
+++ b/llvm/test/CodeGen/DirectX/isinf.ll
@@ -1,25 +1,21 @@
 ; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for isinf are generated for float and half.
-; CHECK: call i1 @dx.op.isSpecialFloat.f32(i32 9, float %{{.*}})
-; CHECK: call i1 @dx.op.isSpecialFloat.f16(i32 9, half %{{.*}})
 
-; Function Attrs: noinline nounwind optnone
-define noundef i1 @isinf_float(float noundef %a) #0 {
+define noundef i1 @isinf_float(float noundef %a) {
 entry:
-  %a.addr = alloca float, align 4
-  store float %a, ptr %a.addr, align 4
-  %0 = load float, ptr %a.addr, align 4
-  %dx.isinf = call i1 @llvm.dx.isinf.f32(float %0)
+  ; CHECK: call i1 @dx.op.isSpecialFloat.f32(i32 9, float %{{.*}})
+  %dx.isinf = call i1 @llvm.dx.isinf.f32(float %a)
   ret i1 %dx.isinf
 }
 
-; Function Attrs: noinline nounwind optnone
-define noundef i1 @isinf_half(half noundef %p0) #0 {
+define noundef i1 @isinf_half(half noundef %a) {
 entry:
-  %p0.addr = alloca half, align 2
-  store half %p0, ptr %p0.addr, align 2
-  %0 = load half, ptr %p0.addr, align 2
-  %dx.isinf = call i1 @llvm.dx.isinf.f16(half %0)
+  ; CHECK: call i1 @dx.op.isSpecialFloat.f16(i32 9, half %{{.*}})
+  %dx.isinf = call i1 @llvm.dx.isinf.f16(half %a)
   ret i1 %dx.isinf
 }
+
+
+declare i1 @llvm.dx.isinf.f16(half)
+declare i1 @llvm.dx.isinf.f32(float)
diff --git a/llvm/test/CodeGen/DirectX/reversebits.ll b/llvm/test/CodeGen/DirectX/reversebits.ll
index 1ade57b40100f..b5530d0850e66 100644
--- a/llvm/test/CodeGen/DirectX/reversebits.ll
+++ b/llvm/test/CodeGen/DirectX/reversebits.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for reversebits are generated for all integer types.
 
@@ -26,6 +26,25 @@ entry:
   ret i64 %elt.bitreverse
 }
 
+define noundef <4 x i32> @round_int324(<4 x i32> noundef %a) #0 {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x i32> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call i32 @dx.op.unary.i32(i32 30, i32 [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x i32> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call i32 @dx.op.unary.i32(i32 30, i32 [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x i32> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call i32 @dx.op.unary.i32(i32 30, i32 [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x i32> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call i32 @dx.op.unary.i32(i32 30, i32 [[ee3]])
+  ; CHECK: insertelement <4 x i32> poison, i32 [[ie0]], i64 0
+  ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie1]], i64 1
+  ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie2]], i64 2
+  ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie3]], i64 3
+  %2 = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) 
+  ret <4 x i32> %2
+}
+
 declare i16 @llvm.bitreverse.i16(i16)
 declare i32 @llvm.bitreverse.i32(i32)
 declare i64 @llvm.bitreverse.i64(i64)
+declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>)
diff --git a/llvm/test/CodeGen/DirectX/round.ll b/llvm/test/CodeGen/DirectX/round.ll
index db953fb29c204..b08cbac5f42e9 100644
--- a/llvm/test/CodeGen/DirectX/round.ll
+++ b/llvm/test/CodeGen/DirectX/round.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for round are generated for float and half.
 
@@ -18,5 +18,25 @@ entry:
   ret float %elt.roundeven
 }
 
+define noundef <4 x float> @round_float4(<4 x float> noundef %a) #0 {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 26, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 26, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 26, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 26, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
+
 declare half @llvm.roundeven.f16(half)
 declare float @llvm.roundeven.f32(float)
+declare <4 x float> @llvm.roundeven.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/saturate.ll b/llvm/test/CodeGen/DirectX/saturate.ll
index a8557351756f2..404cab7b665d0 100644
--- a/llvm/test/CodeGen/DirectX/saturate.ll
+++ b/llvm/test/CodeGen/DirectX/saturate.ll
@@ -2,7 +2,7 @@
 ; Make sure the intrinsic dx.saturate is to appropriate DXIL op for half/float/double data types.
 
 ; CHECK-LABEL: test_saturate_half
-define noundef half @test_saturate_half(half noundef %p0) #0 {
+define noundef half @test_saturate_half(half noundef %p0) {
 entry:
   ; CHECK: call half @dx.op.unary.f16(i32 7, half %p0)
   %hlsl.saturate = call half @llvm.dx.saturate.f16(half %p0)
@@ -10,11 +10,8 @@ entry:
   ret half %hlsl.saturate
 }
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn
-declare half @llvm.dx.saturate.f16(half) #1
-
 ; CHECK-LABEL: test_saturate_float
-define noundef float @test_saturate_float(float noundef %p0) #0 {
+define noundef float @test_saturate_float(float noundef %p0) {
 entry:
   ; CHECK: call float @dx.op.unary.f32(i32 7, float %p0)
   %hlsl.saturate = call float @llvm.dx.saturate.f32(float %p0)
@@ -22,11 +19,8 @@ entry:
   ret float %hlsl.saturate
 }
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn
-declare float @llvm.dx.saturate.f32(float) #1
-
 ; CHECK-LABEL: test_saturate_double
-define noundef double @test_saturate_double(double noundef %p0) #0 {
+define noundef double @test_saturate_double(double noundef %p0) {
 entry:
   ; CHECK: call double @dx.op.unary.f64(i32 7, double %p0)
   %hlsl.saturate = call double @llvm.dx.saturate.f64(double %p0)
@@ -34,6 +28,7 @@ entry:
   ret double %hlsl.saturate
 }
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn
-declare double @llvm.dx.saturate.f64(double) #1
+declare half @llvm.dx.saturate.f16(half)
+declare float @llvm.dx.saturate.f32(float)
+declare double @llvm.dx.saturate.f64(double)
 
diff --git a/llvm/test/CodeGen/DirectX/scalar-store.ll b/llvm/test/CodeGen/DirectX/scalar-store.ll
new file mode 100644
index 0000000000000..b970a2842e5a8
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/scalar-store.ll
@@ -0,0 +1,17 @@
+; RUN: opt -S -scalarizer -scalarize-load-store -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: llc %s -mtriple=dxil-pc-shadermodel6.3-library --filetype=asm -o - | FileCheck %s
+
+@"sharedData" = local_unnamed_addr addrspace(3) global [2 x <3 x float>] zeroinitializer, align 16 
+; CHECK-LABEL: store_test
+define void @store_test () local_unnamed_addr {
+    ; CHECK: store float 1.000000e+00, ptr addrspace(3) {{.*}}, align {{.*}} 
+    ; CHECK: store float 2.000000e+00, ptr addrspace(3) {{.*}}, align {{.*}}
+    ; CHECK: store float 3.000000e+00, ptr addrspace(3) {{.*}}, align {{.*}} 
+    ; CHECK: store float 2.000000e+00, ptr addrspace(3) {{.*}}, align {{.*}} 
+    ; CHECK: store float 4.000000e+00, ptr addrspace(3) {{.*}}, align {{.*}} 
+    ; CHECK: store float 6.000000e+00, ptr addrspace(3) {{.*}}, align {{.*}} 
+
+    store <3 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, ptr addrspace(3) @"sharedData", align 16 
+    store <3 x float> <float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>, ptr addrspace(3)   getelementptr inbounds (i8, ptr addrspace(3) @"sharedData", i32 16), align 16 
+    ret void
+ } 
diff --git a/llvm/test/CodeGen/DirectX/scalarization_pass_order.ll b/llvm/test/CodeGen/DirectX/scalarization_pass_order.ll
new file mode 100644
index 0000000000000..f33c2a7ccdcef
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/scalarization_pass_order.ll
@@ -0,0 +1,45 @@
+; RUN: llc -mtriple=dxil-pc-shadermodel6.3-library -debug-pass=Structure < %s -o /dev/null 2>&1 | \
+; RUN:     grep -v "Verify generated machine code" | FileCheck %s
+; RUN: llc %s -mtriple=dxil-pc-shadermodel6.3-library --filetype=asm -o - | FileCheck %s --check-prefixes=CHECKIR
+; CHECK-LABEL: Pass Arguments:
+; CHECK-NEXT: Target Library Information
+; CHECK-NEXT: ModulePass Manager
+; CHECK-NEXT:   DXIL Intrinsic Expansion
+; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Dominator Tree Construction
+; CHECK-NEXT:     Scalarize vector operations
+; CHECK-NEXT:   DXIL Intrinsic Expansion
+; CHECK-NEXT:   DXIL Resource analysis
+; CHECK-NEXT:   DXIL Op Lowering
+; CHECK-NEXT:   DXIL Finalize Linkage
+; CHECK-NEXT:   DXIL Resource analysis
+; CHECK-NEXT:   DXIL resource Information
+; CHECK-NEXT:   DXIL Shader Flag Analysis
+; CHECK-NEXT:   DXIL Translate Metadata
+; CHECK-NEXT:   DXIL Prepare Module
+; CHECK-NEXT:   DXIL Resource analysis
+; CHECK-NEXT:   DXIL Metadata Pretty Printer
+; CHECK-NEXT:   Print Module IR
+; CHECKIR: target triple = "dxilv1.3-pc-shadermodel6.3-library"
+; CHECKIR-LABEL: cos_sin_float_test
+define noundef <4 x float> @cos_sin_float_test(<4 x float> noundef %a) {
+    ; CHECKIR: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+    ; CHECKIR: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee0]])
+    ; CHECKIR: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+    ; CHECKIR: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee1]])
+    ; CHECKIR: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+    ; CHECKIR: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee2]])
+    ; CHECKIR: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+    ; CHECKIR: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee3]])
+    ; CHECKIR: [[ie4:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie0]])
+    ; CHECKIR: [[ie5:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie1]])
+    ; CHECKIR: [[ie6:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie2]])
+    ; CHECKIR: [[ie7:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie3]])
+    ; CHECKIR: insertelement <4 x float> poison, float [[ie4]], i64 0
+    ; CHECKIR: insertelement <4 x float> %{{.*}}, float [[ie5]], i64 1
+    ; CHECKIR: insertelement <4 x float> %{{.*}}, float [[ie6]], i64 2
+    ; CHECKIR: insertelement <4 x float> %{{.*}}, float [[ie7]], i64 3
+    %2 = tail call <4 x float> @llvm.sin.v4f32(<4 x float> %a) 
+    %3 = tail call <4 x float> @llvm.cos.v4f32(<4 x float> %2) 
+    ret <4 x float> %3 
+} 
diff --git a/llvm/test/CodeGen/DirectX/sin.ll b/llvm/test/CodeGen/DirectX/sin.ll
index 79143bfa0a529..a0b0d2675e3b7 100644
--- a/llvm/test/CodeGen/DirectX/sin.ll
+++ b/llvm/test/CodeGen/DirectX/sin.ll
@@ -1,25 +1,39 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for sin are generated for float and half.
-; CHECK:call float @dx.op.unary.f32(i32 13, float %{{.*}})
-; CHECK:call half @dx.op.unary.f16(i32 13, half %{{.*}})
 
-; Function Attrs: noinline nounwind optnone
-define noundef float @sin_float(float noundef %a) #0 {
+define noundef float @sin_float(float noundef %a) {
 entry:
+  ; CHECK:call float @dx.op.unary.f32(i32 13, float %{{.*}})
   %1 = call float @llvm.sin.f32(float %a)
   ret float %1
 }
 
-; Function Attrs: noinline nounwind optnone
-define noundef half @sin_half(half noundef %a) #0 {
+define noundef half @sin_half(half noundef %a) {
 entry:
+  ; CHECK:call half @dx.op.unary.f16(i32 13, half %{{.*}})
   %1 = call half @llvm.sin.f16(half %a)
   ret half %1
 }
 
-define noundef <4 x float> @sin_float4(<4 x float> noundef %a) #0 {
+define noundef <4 x float> @sin_float4(<4 x float> noundef %a) {
 entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
   %2 = call <4 x float> @llvm.sin.v4f32(<4 x float> %a) 
   ret <4 x float> %2
 }
+
+declare half @llvm.sin.f16(half)
+declare float @llvm.sin.f32(float)
+declare <4 x float> @llvm.sin.v4f32(<4 x float>)
\ No newline at end of file
diff --git a/llvm/test/CodeGen/DirectX/sinh.ll b/llvm/test/CodeGen/DirectX/sinh.ll
index d4d3eda9eccb6..deba726e8d9ad 100644
--- a/llvm/test/CodeGen/DirectX/sinh.ll
+++ b/llvm/test/CodeGen/DirectX/sinh.ll
@@ -1,20 +1,39 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for sinh are generated for float and half.
 
-define noundef float @tan_float(float noundef %a) {
+define noundef float @sinh_float(float noundef %a) {
 entry:
 ; CHECK:call float @dx.op.unary.f32(i32 19, float %{{.*}})
   %elt.sinh = call float @llvm.sinh.f32(float %a)
   ret float %elt.sinh
 }
 
-define noundef half @tan_half(half noundef %a) {
+define noundef half @sinh_half(half noundef %a) {
 entry:
 ; CHECK:call half @dx.op.unary.f16(i32 19, half %{{.*}})
   %elt.sinh = call half @llvm.sinh.f16(half %a)
   ret half %elt.sinh
 }
 
+define noundef <4 x float> @sinh_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 19, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 19, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 19, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 19, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.sinh.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.sinh.f16(half)
 declare float @llvm.sinh.f32(float)
+declare <4 x float> @llvm.sinh.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/sqrt.ll b/llvm/test/CodeGen/DirectX/sqrt.ll
index 792fbc8d0614d..e2955b4efa2ec 100644
--- a/llvm/test/CodeGen/DirectX/sqrt.ll
+++ b/llvm/test/CodeGen/DirectX/sqrt.ll
@@ -1,20 +1,39 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for sqrt are generated for float and half.
 
-define noundef float @sqrt_float(float noundef %a) #0 {
+define noundef float @sqrt_float(float noundef %a) {
 entry:
 ; CHECK:call float @dx.op.unary.f32(i32 24, float %{{.*}})
   %elt.sqrt = call float @llvm.sqrt.f32(float %a)
   ret float %elt.sqrt
 }
 
-define noundef half @sqrt_half(half noundef %a) #0 {
+define noundef half @sqrt_half(half noundef %a) {
 entry:
 ; CHECK:call half @dx.op.unary.f16(i32 24, half %{{.*}})
   %elt.sqrt = call half @llvm.sqrt.f16(half %a)
   ret half %elt.sqrt
 }
 
+define noundef <4 x float> @sqrt_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 24, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 24, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 24, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 24, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.sqrt.f16(half)
 declare float @llvm.sqrt.f32(float)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/tan.ll b/llvm/test/CodeGen/DirectX/tan.ll
index 6f7beb592339a..cf6965a95c04e 100644
--- a/llvm/test/CodeGen/DirectX/tan.ll
+++ b/llvm/test/CodeGen/DirectX/tan.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for tan are generated for float and half.
 
@@ -16,5 +16,24 @@ entry:
   ret half %elt.tan
 }
 
+define noundef <4 x float> @tan_float4(<4 x float> noundef %a) #0 {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 14, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 14, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 14, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 14, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.tan.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.tan.f16(half)
 declare float @llvm.tan.f32(float)
+declare <4 x float> @llvm.tan.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/tanh.ll b/llvm/test/CodeGen/DirectX/tanh.ll
index e6642d9a74c8a..54ec6f29fa0c3 100644
--- a/llvm/test/CodeGen/DirectX/tanh.ll
+++ b/llvm/test/CodeGen/DirectX/tanh.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for tanh are generated for float and half.
 
@@ -16,5 +16,24 @@ entry:
   ret half %elt.tanh
 }
 
+define noundef <4 x float> @tanh_float4(<4 x float> noundef %a) #0 {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 20, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 20, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 20, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 20, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.tanh.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.tanh.f16(half)
 declare float @llvm.tanh.f32(float)
+declare <4 x float> @llvm.tanh.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/trunc.ll b/llvm/test/CodeGen/DirectX/trunc.ll
index f00b737da4dbb..6d9c222595c44 100644
--- a/llvm/test/CodeGen/DirectX/trunc.ll
+++ b/llvm/test/CodeGen/DirectX/trunc.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for trunc are generated for float and half.
 
@@ -16,5 +16,24 @@ entry:
   ret half %elt.trunc
 }
 
+define noundef <4 x float> @trunc_float4(<4 x float> noundef %a) #0 {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 29, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 29, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 29, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 29, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.trunc.f16(half)
 declare float @llvm.trunc.f32(float)
+declare <4 x float> @llvm.trunc.v4f32(<4 x float>)
diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp
index 1bdfa71830ba2..c5bc7b43e0331 100644
--- a/llvm/tools/opt/optdriver.cpp
+++ b/llvm/tools/opt/optdriver.cpp
@@ -375,6 +375,7 @@ static bool shouldPinPassToLegacyPM(StringRef Pass) {
       "fix-irreducible",
       "expand-large-fp-convert",
       "callbrprepare",
+      "scalarizer",
   };
   for (const auto &P : PassNamePrefix)
     if (Pass.starts_with(P))

From 070ee3688125e92d150707af0341ac6df47871d8 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzon@farzon.org>
Date: Fri, 6 Sep 2024 13:42:24 -0400
Subject: [PATCH 3/3] address pr comments

---
 .../Target/DirectX/DirectXTargetMachine.cpp   |  1 -
 llvm/test/CodeGen/DirectX/llc-pipeline.ll     | 25 +++++++++++
 .../DirectX/scalarization_pass_order.ll       | 45 -------------------
 .../CodeGen/DirectX/scalarize-two-calls.ll    | 25 +++++++++++
 llvm/test/CodeGen/DirectX/sin.ll              |  2 +-
 5 files changed, 51 insertions(+), 47 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/llc-pipeline.ll
 delete mode 100644 llvm/test/CodeGen/DirectX/scalarization_pass_order.ll
 create mode 100644 llvm/test/CodeGen/DirectX/scalarize-two-calls.ll

diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index f021e24ac7e26..606022a9835f0 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -87,7 +87,6 @@ class DirectXPassConfig : public TargetPassConfig {
   void addCodeGenPrepare() override {
     addPass(createDXILIntrinsicExpansionLegacyPass());
     ScalarizerPassOptions DxilScalarOptions;
-    // The only non-default option we need to set is ScalarizeLoadStore.
     DxilScalarOptions.ScalarizeLoadStore = true;
     addPass(createScalarizerPass(DxilScalarOptions));
     addPass(createDXILOpLoweringLegacyPass());
diff --git a/llvm/test/CodeGen/DirectX/llc-pipeline.ll b/llvm/test/CodeGen/DirectX/llc-pipeline.ll
new file mode 100644
index 0000000000000..36610bef719bf
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/llc-pipeline.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=dxil-pc-shadermodel6.3-library -debug-pass=Structure < %s -o /dev/null 2>&1 | \
+; RUN:     grep -v "Verify generated machine code" | FileCheck %s
+
+; REQUIRES: asserts
+
+; CHECK-LABEL: Pass Arguments:
+; CHECK-NEXT: Target Library Information
+; CHECK-NEXT: ModulePass Manager
+; CHECK-NEXT:   DXIL Intrinsic Expansion
+; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Dominator Tree Construction
+; CHECK-NEXT:     Scalarize vector operations
+; CHECK-NEXT:   DXIL Intrinsic Expansion
+; CHECK-NEXT:   DXIL Resource analysis
+; CHECK-NEXT:   DXIL Op Lowering
+; CHECK-NEXT:   DXIL Finalize Linkage
+; CHECK-NEXT:   DXIL Resource analysis
+; CHECK-NEXT:   DXIL resource Information
+; CHECK-NEXT:   DXIL Shader Flag Analysis
+; CHECK-NEXT:   DXIL Translate Metadata
+; CHECK-NEXT:   DXIL Prepare Module
+; CHECK-NEXT:   DXIL Resource analysis
+; CHECK-NEXT:   DXIL Metadata Pretty Printer
+; CHECK-NEXT:   Print Module IR
+ 
diff --git a/llvm/test/CodeGen/DirectX/scalarization_pass_order.ll b/llvm/test/CodeGen/DirectX/scalarization_pass_order.ll
deleted file mode 100644
index f33c2a7ccdcef..0000000000000
--- a/llvm/test/CodeGen/DirectX/scalarization_pass_order.ll
+++ /dev/null
@@ -1,45 +0,0 @@
-; RUN: llc -mtriple=dxil-pc-shadermodel6.3-library -debug-pass=Structure < %s -o /dev/null 2>&1 | \
-; RUN:     grep -v "Verify generated machine code" | FileCheck %s
-; RUN: llc %s -mtriple=dxil-pc-shadermodel6.3-library --filetype=asm -o - | FileCheck %s --check-prefixes=CHECKIR
-; CHECK-LABEL: Pass Arguments:
-; CHECK-NEXT: Target Library Information
-; CHECK-NEXT: ModulePass Manager
-; CHECK-NEXT:   DXIL Intrinsic Expansion
-; CHECK-NEXT:   FunctionPass Manager
-; CHECK-NEXT:     Dominator Tree Construction
-; CHECK-NEXT:     Scalarize vector operations
-; CHECK-NEXT:   DXIL Intrinsic Expansion
-; CHECK-NEXT:   DXIL Resource analysis
-; CHECK-NEXT:   DXIL Op Lowering
-; CHECK-NEXT:   DXIL Finalize Linkage
-; CHECK-NEXT:   DXIL Resource analysis
-; CHECK-NEXT:   DXIL resource Information
-; CHECK-NEXT:   DXIL Shader Flag Analysis
-; CHECK-NEXT:   DXIL Translate Metadata
-; CHECK-NEXT:   DXIL Prepare Module
-; CHECK-NEXT:   DXIL Resource analysis
-; CHECK-NEXT:   DXIL Metadata Pretty Printer
-; CHECK-NEXT:   Print Module IR
-; CHECKIR: target triple = "dxilv1.3-pc-shadermodel6.3-library"
-; CHECKIR-LABEL: cos_sin_float_test
-define noundef <4 x float> @cos_sin_float_test(<4 x float> noundef %a) {
-    ; CHECKIR: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
-    ; CHECKIR: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee0]])
-    ; CHECKIR: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
-    ; CHECKIR: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee1]])
-    ; CHECKIR: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
-    ; CHECKIR: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee2]])
-    ; CHECKIR: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
-    ; CHECKIR: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee3]])
-    ; CHECKIR: [[ie4:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie0]])
-    ; CHECKIR: [[ie5:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie1]])
-    ; CHECKIR: [[ie6:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie2]])
-    ; CHECKIR: [[ie7:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie3]])
-    ; CHECKIR: insertelement <4 x float> poison, float [[ie4]], i64 0
-    ; CHECKIR: insertelement <4 x float> %{{.*}}, float [[ie5]], i64 1
-    ; CHECKIR: insertelement <4 x float> %{{.*}}, float [[ie6]], i64 2
-    ; CHECKIR: insertelement <4 x float> %{{.*}}, float [[ie7]], i64 3
-    %2 = tail call <4 x float> @llvm.sin.v4f32(<4 x float> %a) 
-    %3 = tail call <4 x float> @llvm.cos.v4f32(<4 x float> %2) 
-    ret <4 x float> %3 
-} 
diff --git a/llvm/test/CodeGen/DirectX/scalarize-two-calls.ll b/llvm/test/CodeGen/DirectX/scalarize-two-calls.ll
new file mode 100644
index 0000000000000..a14c1de5cc420
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/scalarize-two-calls.ll
@@ -0,0 +1,31 @@
+; RUN: llc %s -mtriple=dxil-pc-shadermodel6.3-library --filetype=asm -o - | FileCheck %s
+
+; Verify that two chained vector intrinsic calls are both scalarized before
+; DXIL op lowering: llvm.sin on <4 x float> %a must become four per-element
+; dx.op.unary.f32 calls with opcode 13, and llvm.cos on that result must become
+; four more calls with opcode 12 that consume those scalar results directly.
+; The <4 x float> return value is then reassembled with insertelement.
+
+; CHECK: target triple = "dxilv1.3-pc-shadermodel6.3-library"
+; CHECK-LABEL: cos_sin_float_test
+define noundef <4 x float> @cos_sin_float_test(<4 x float> noundef %a) {
+    ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+    ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee0]])
+    ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+    ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee1]])
+    ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+    ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee2]])
+    ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+    ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee3]])
+    ; CHECK: [[ie4:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie0]])
+    ; CHECK: [[ie5:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie1]])
+    ; CHECK: [[ie6:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie2]])
+    ; CHECK: [[ie7:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie3]])
+    ; CHECK: insertelement <4 x float> poison, float [[ie4]], i64 0
+    ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie5]], i64 1
+    ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie6]], i64 2
+    ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie7]], i64 3
+    %2 = tail call <4 x float> @llvm.sin.v4f32(<4 x float> %a)
+    %3 = tail call <4 x float> @llvm.cos.v4f32(<4 x float> %2)
+    ret <4 x float> %3
+}
diff --git a/llvm/test/CodeGen/DirectX/sin.ll b/llvm/test/CodeGen/DirectX/sin.ll
index a0b0d2675e3b7..ac6b217be80e7 100644
--- a/llvm/test/CodeGen/DirectX/sin.ll
+++ b/llvm/test/CodeGen/DirectX/sin.ll
@@ -36,4 +36,4 @@ entry:
 
 declare half @llvm.sin.f16(half)
 declare float @llvm.sin.f32(float)
-declare <4 x float> @llvm.sin.v4f32(<4 x float>)
\ No newline at end of file
+declare <4 x float> @llvm.sin.v4f32(<4 x float>)