[Clang][AArch64][SVE2p3][SME2p3] Add intrinsics for v9.7a add/add-and-subtract/subtract pairwise operations#187527
[Clang][AArch64][SVE2p3][SME2p3] Add intrinsics for v9.7a add/add-and-subtract/subtract pairwise operations#187527
Conversation
|
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-ir Author: Amilendra Kodithuwakku (amilendra) Changes: Add the following new clang intrinsics based on the ACLE specification ARM-software/acle#428 (Add alpha support for 9.7 data processing intrinsics)
Patch is 72.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/187527.diff 7 Files Affected:
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index be3cd8a76503b..5bc48c7bde799 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1421,6 +1421,17 @@ defm SVMINP_S : SInstPairwise<"svminp", "csli", "aarch64_sve_sminp", [
defm SVMINP_U : SInstPairwise<"svminp", "UcUsUiUl", "aarch64_sve_uminp", [VerifyRuntimeMode]>;
}
+////////////////////////////////////////////////////////////////////////////////
+// SVE2.3 - Add pairwise within quadword vector segments
+
+let SVETargetGuard = "sve2p3|sme2p3", SMETargetGuard = "sve2p3|sme2p3" in {
+def SVADDQP : SInst<"svaddqp[_{d}]", "ddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_addqp",
+ [VerifyRuntimeMode]>;
+def SVADDSUBP : SInst<"svaddsubp[_{d}]", "ddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_addsubp",
+ [VerifyRuntimeMode]>;
+def SVSUBP : SInst<"svsubp[_{d}]", "dPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_subp", [VerifyRuntimeMode]>;
+}
+
////////////////////////////////////////////////////////////////////////////////
// SVE2 - Widening pairwise arithmetic
diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_addqp.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_addqp.c
new file mode 100644
index 0000000000000..50eb8515f04e1
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_addqp.c
@@ -0,0 +1,262 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sme2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sme2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#if defined(__ARM_FEATURE_SME) && defined(__ARM_FEATURE_SVE)
+#define ATTR __arm_streaming_compatible
+#elif defined(__ARM_FEATURE_SME)
+#define ATTR __arm_streaming
+#else
+#define ATTR
+#endif
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1,A2) A1##A2
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svaddqp_s8(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CHECK-NEXT: store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 16 x i8> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.addqp.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svaddqp_s8u10__SVInt8_tS_(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CPP-CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CPP-CHECK-NEXT: store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 16 x i8> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.addqp.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP2]]
+//
+svint8_t test_svaddqp_s8(svint8_t zn, svint8_t zm) ATTR
+{
+ return SVE_ACLE_FUNC(svaddqp,_s8)(zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svaddqp_u8(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CHECK-NEXT: store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 16 x i8> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.addqp.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svaddqp_u8u11__SVUint8_tS_(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CPP-CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CPP-CHECK-NEXT: store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 16 x i8> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.addqp.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP2]]
+//
+svuint8_t test_svaddqp_u8(svuint8_t zn, svuint8_t zm) ATTR
+{
+ return SVE_ACLE_FUNC(svaddqp,_u8)(zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svaddqp_s16(
+// CHECK-SAME: <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT: store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 8 x i16> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.addqp.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z16test_svaddqp_s16u11__SVInt16_tS_(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT: store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 8 x i16> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.addqp.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+svint16_t test_svaddqp_s16(svint16_t zn, svint16_t zm)ATTR
+{
+ return SVE_ACLE_FUNC(svaddqp,_s16)(zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svaddqp_u16(
+// CHECK-SAME: <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT: store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 8 x i16> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.addqp.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z16test_svaddqp_u16u12__SVUint16_tS_(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT: store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 8 x i16> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.addqp.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+svuint16_t test_svaddqp_u16(svuint16_t zn, svuint16_t zm) ATTR
+{
+ return SVE_ACLE_FUNC(svaddqp,_u16)(zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svaddqp_s32(
+// CHECK-SAME: <vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT: store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 4 x i32> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.addqp.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z16test_svaddqp_s32u11__SVInt32_tS_(
+// CPP-CHECK-SAME: <vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT: store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 4 x i32> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.addqp.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+svint32_t test_svaddqp_s32(svint32_t zn, svint32_t zm) ATTR
+{
+ return SVE_ACLE_FUNC(svaddqp,_s32)(zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svaddqp_u32(
+// CHECK-SAME: <vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT: store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 4 x i32> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.addqp.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z16test_svaddqp_u32u12__SVUint32_tS_(
+// CPP-CHECK-SAME: <vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT: store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 4 x i32> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.addqp.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+svuint32_t test_svaddqp_u32(svuint32_t zn, svuint32_t zm) ATTR
+{
+ return SVE_ACLE_FUNC(svaddqp,_u32)(zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svaddqp_s64(
+// CHECK-SAME: <vscale x 2 x i64> [[ZN:%.*]], <vscale x 2 x i64> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CHECK-NEXT: store <vscale x 2 x i64> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 2 x i64> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 2 x i64>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.addqp.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z16test_svaddqp_s64u11__SVInt64_tS_(
+// CPP-CHECK-SAME: <vscale x 2 x i64> [[ZN:%.*]], <vscale x 2 x i64> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CPP-CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CPP-CHECK-NEXT: store <vscale x 2 x i64> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 2 x i64> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 2 x i64>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.addqp.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+svint64_t test_svaddqp_s64(svint64_t zn, svint64_t zm) ATTR
+{
+ return SVE_ACLE_FUNC(svaddqp,_s64)(zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svaddqp_u64(
+// CHECK-SAME: <vscale x 2 x i64> [[ZN:%.*]], <vscale x 2 x i64> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CHECK-NEXT: store <vscale x 2 x i64> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 2 x i64> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 2 x i64>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.addqp.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z16test_svaddqp_u64u12__SVUint64_tS_(
+// CPP-CHECK-SAME: <vscale x 2 x i64> [[ZN:%.*]], <vscale x 2 x i64> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CPP-CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CPP-CHECK-NEXT: store <vscale x 2 x i64> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 2 x i64> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 2 x i64>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.addqp.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+//
+svuint64_t test_svaddqp_u64(svuint64_t zn, svuint64_t zm) ATTR
+{
+ return SVE_ACLE_FUNC(svaddqp,_u64)(zn, zm);
+}
diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_addsubp.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_addsubp.c
new file mode 100644
index 0000000000000..afea0a51cb910
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_addsubp.c
@@ -0,0 +1,262 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme ...
[truncated]
|
clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_addqp.c
Outdated
Show resolved
Hide resolved
|
We need to add IR tests to this patch as well. |
2288965 to
a14c847
Compare
CarolineConcatto
left a comment
There was a problem hiding this comment.
I left a comment in the ACLE PR, but I think SUBP (see https://developer.arm.com/documentation/ddi0602/2025-09/SVE-Instructions/SUBPT--predicated---Subtract-checked-pointer-vectors--predicated--?lang=en)
should have the following prototypes:
_m, _x, and _z, like we see for svadd:
https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiesreturnbasetype=[%5Buint,int%5D]&q=svadd
This is because it should allow zeroing, merging, or leaving as don't-care the non-active lanes.
clang/include/clang/Basic/arm_sve.td
Outdated
| [VerifyRuntimeMode]>; | ||
| def SVADDSUBP : SInst<"svaddsubp[_{d}]", "ddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_addsubp", | ||
| [VerifyRuntimeMode]>; | ||
| def SVSUBP : SInst<"svsubp[_{d}]", "dPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_subp", [VerifyRuntimeMode]>; |
There was a problem hiding this comment.
This needs to be updated according to the latest change in the ACLE
a14c847 to
2c18705
Compare
clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_addqp.c
Outdated
Show resolved
Hide resolved
There was a problem hiding this comment.
It looks like all three test files are using the same RUN lines & macro. Would it be worth squashing them into one acle_sve2p1_addsubp.c file?
There was a problem hiding this comment.
I think having separate files is more extensible and easier to understand.
I think that would also require unifying the SVE_ACLE_FUNC macro across the files.
subp: #define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
addqp and addsubp: #define SVE_ACLE_FUNC(A1,A2_UNUSED) A1
…-subtract/subtract pairwise operations Add the following new clang intrinsics based on the ACLE specification ARM-software/acle#428 (Add alpha support for 9.7 data processing intrinsics) - ADDQP (Add pairwise within quadword vector segments) - svint8_t svaddqp_s8(svint8_t, svint8_t) / svint8_t svaddqp(svint8_t, svint8_t) - svuint8_t svaddqp_u8(svuint8_t, svuint8_t) / svuint8_t svaddqp(svuint8_t, svuint8_t) - svint16_t svaddqp_s16(svint16_t, svint16_t) / svint16_t svaddqp(svint16_t, svint16_t) - svuint16_t svaddqp_u16(svuint16_t, svuint16_t) / svuint16_t svaddqp(svuint16_t, svuint16_t) - svint32_t svaddqp_s32(svint32_t, svint32_t) / svint32_t svaddqp(svint32_t, svint32_t) - svuint32_t svaddqp_u32(svuint32_t, svuint32_t) / svuint32_t svaddqp(svuint32_t, svuint32_t) - svint64_t svaddqp_s64(svint64_t, svint64_t) / svint64_t svaddqp(svint64_t, svint64_t) - svuint64_t svaddqp_u64(svuint64_t, svuint64_t) / svuint64_t svaddqp(svuint64_t, svuint64_t) - ADDSUBP (Add and subtract pairwise) - svint8_t svaddsubp_s8(svint8_t, svint8_t) / svint8_t svaddsubp(svint8_t, svint8_t) - svuint8_t svaddsubp_u8(svuint8_t, svuint8_t) / svuint8_t svaddsubp(svuint8_t, svuint8_t) - svint16_t svaddsubp_s16(svint16_t, svint16_t) / svint16_t svaddsubp(svint16_t, svint16_t) - svuint16_t svaddsubp_u16(svuint16_t, svuint16_t) / svuint16_t svaddsubp(svuint16_t, svuint16_t) - svint32_t svaddsubp_s32(svint32_t, svint32_t) / svint32_t svaddsubp(svint32_t, svint32_t) - svuint32_t svaddsubp_u32(svuint32_t, svuint32_t) / svuint32_t svaddsubp(svuint32_t, svuint32_t) - svint64_t svaddsubp_s64(svint64_t, svint64_t) / svint64_t svaddsubp(svint64_t, svint64_t) - svuint64_t svaddsubp_u64(svuint64_t, svuint64_t) / svuint64_t svaddsubp(svuint64_t, svuint64_t) - SUBP (Subtract pairwise) - svint8_t svsubp_s8(svbool_t, svint8_t, svint8_t) / svint8_t svsubp(svbool_t, svint8_t, svint8_t) - svuint8_t svsubp_u8(svbool_t, svuint8_t, svuint8_t) / svuint8_t svsubp(svbool_t, svuint8_t, svuint8_t) - svint16_t 
svsubp_s16(svbool_t, svint16_t, svint16_t) / svint16_t svsubp(svbool_t, svint16_t, svint16_t) - svuint16_t svsubp_u16(svbool_t, svuint16_t, svuint16_t) / svuint16_t svsubp(svbool_t, svuint16_t, svuint16_t) - svint32_t svsubp_s32(svbool_t, svint32_t, svint32_t) / svint32_t svsubp(svbool_t, svint32_t, svint32_t) - svuint32_t svsubp_u32(svbool_t, svuint32_t, svuint32_t) / svuint32_t svsubp(svbool_t, svuint32_t, svuint32_t) - svint64_t svsubp_s64(svbool_t, svint64_t, svint64_t) / svint64_t svsubp(svbool_t, svint64_t, svint64_t) - svuint64_t svsubp_u64(svbool_t, svuint64_t, svuint64_t) / svuint64_t svsubp(svbool_t, svuint64_t, svuint64_t)
2c18705 to
25b11e0
Compare
| ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p3 -enable-subreg-liveness -force-streaming -verify-machineinstrs < %s | FileCheck %s | ||
| ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme,+sve2p3 -enable-subreg-liveness -force-streaming -verify-machineinstrs < %s | FileCheck %s |
There was a problem hiding this comment.
-enable-subreg-liveness can be removed from the -force-streaming tests here and in the other files below.
Add the following new clang intrinsics based on the ACLE specification ARM-software/acle#428 (Add alpha support for 9.7 data processing intrinsics)
ADDQP (Add pairwise within quadword vector segments)
ADDSUBP (Add and subtract pairwise)
SUBP (Subtract pairwise)