…signed/unsigned absolute difference sum and accumulate long ops

Add the following new clang intrinsics based on the ACLE specification ARM-software/acle#428 (Add alpha support for 9.7 data processing intrinsics).

SABAL (Two-way signed absolute difference sum and accumulate long)
- svint16_t svabal[_s16](svint16_t, svint8_t, svint8_t) / svint16_t svabal[_n_s16](svint16_t, svint8_t, int8_t)
- svint32_t svabal[_s32](svint32_t, svint16_t, svint16_t) / svint32_t svabal[_n_s32](svint32_t, svint16_t, int16_t)
- svint64_t svabal[_s64](svint64_t, svint32_t, svint32_t) / svint64_t svabal[_n_s64](svint64_t, svint32_t, int32_t)

UABAL (Two-way unsigned absolute difference sum and accumulate long)
- svuint16_t svabal[_u16](svuint16_t, svuint8_t, svuint8_t) / svuint16_t svabal[_n_u16](svuint16_t, svuint8_t, uint8_t)
- svuint32_t svabal[_u32](svuint32_t, svuint16_t, svuint16_t) / svuint32_t svabal[_n_u32](svuint32_t, svuint16_t, uint16_t)
- svuint64_t svabal[_u64](svuint64_t, svuint32_t, svuint32_t) / svuint64_t svabal[_n_u64](svuint64_t, svuint32_t, uint32_t)
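For readers new to the ACLE naming convention, a minimal usage sketch (assuming a compiler with +sve2p3 support and the updated <arm_sve.h>; the helper names are hypothetical, not part of this patch). The brackets in svabal[_s16] mark the part of the name that is dropped in the overloaded form, so both svabal_s16(...) and svabal(...) resolve to the same builtin.

#include <arm_sve.h>

// Each 16-bit accumulator lane sums the absolute differences of the two
// corresponding 8-bit lanes (my reading of "two-way ... accumulate long"):
//   zda[i] += |zn[2*i] - zm[2*i]| + |zn[2*i+1] - zm[2*i+1]|
svint16_t widen_abs_diff_acc(svint16_t zda, svint8_t zn, svint8_t zm) {
  return svabal_s16(zda, zn, zm);     // vector-vector form
}

svint16_t widen_abs_diff_acc_n(svint16_t zda, svint8_t zn, int8_t zm) {
  return svabal_n_s16(zda, zn, zm);   // _n form: the scalar zm is splatted
}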
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-aarch64

Author: Amilendra Kodithuwakku (amilendra)

Changes

Add the following new clang intrinsics based on the ACLE specification ARM-software/acle#428 (Add alpha support for 9.7 data processing intrinsics).

SABAL (Two-way signed absolute difference sum and accumulate long)
UABAL (Two-way unsigned absolute difference sum and accumulate long)
Patch is 53.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/188972.diff

8 Files Affected:
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index be3cd8a76503b..8ada5878f9852 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1342,6 +1342,17 @@ defm SVRECPE : SInstZPZ<"svrecpe", "Ui", "aarch64_sve_urecpe">;
defm SVRSQRTE : SInstZPZ<"svrsqrte", "Ui", "aarch64_sve_ursqrte">;
}
+////////////////////////////////////////////////////////////////////////////////
+// SVE2.3 - Two-way signed/unsigned absolute difference sum and accumulate long
+
+let SVETargetGuard = "sve2p3|sme2p3", SMETargetGuard = "sve2p3|sme2p3" in {
+ def SVABAL_S : SInst<"svabal[_{d}]", "ddhh", "sil" , MergeNone, "aarch64_sve_sabal", [VerifyRuntimeMode]>;
+ def SVABAL_S_N : SInst<"svabal[_n_{d}]", "ddhR", "sil" , MergeNone, "aarch64_sve_sabal", [VerifyRuntimeMode]>;
+
+ def SVABAL_U : SInst<"svabal[_{d}]", "ddhh", "UsUiUl", MergeNone, "aarch64_sve_uabal", [VerifyRuntimeMode]>;
+ def SVABAL_U_N : SInst<"svabal[_n_{d}]", "ddhR", "UsUiUl", MergeNone, "aarch64_sve_uabal", [VerifyRuntimeMode]>;
+}
+
//------------------------------------------------------------------------------
multiclass SInstZPZxZ<string name, string types, string pat_v, string pat_n, string m_intrinsic, string x_intrinsic, list<FlagType> flags=[]> {
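A note on the prototype strings above (my reading of the existing SveEmitter modifier conventions, which these definitions reuse rather than change): "d" denotes a vector of the default element type from the type list, "h" a vector with half-width elements, and "R" a scalar of the half-width element type. For the "s" (int16) instantiation, "ddhh" therefore yields svint16_t svabal(svint16_t, svint8_t, svint8_t), and "ddhR" the _n variant taking an int8_t scalar, matching the signatures listed in the description.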
diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_svabal.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_svabal.c
new file mode 100644
index 0000000000000..8519b70bc6260
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_svabal.c
@@ -0,0 +1,479 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sme2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2p3 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+
+#include <arm_sve.h>
+
+#if defined(__ARM_FEATURE_SME) && defined(__ARM_FEATURE_SVE)
+#define ATTR __arm_streaming_compatible
+#elif defined(__ARM_FEATURE_SME)
+#define ATTR __arm_streaming
+#else
+#define ATTR
+#endif
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svabal_s16(
+// CHECK-SAME: <vscale x 8 x i16> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ZDA_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CHECK-NEXT: store <vscale x 8 x i16> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 16 x i8> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 16 x i8>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.sabal.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z15test_svabal_s16u11__SVInt16_tu10__SVInt8_tS0_(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[ZDA_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CPP-CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CPP-CHECK-NEXT: store <vscale x 8 x i16> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 16 x i8> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 16 x i8>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.sabal.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP3]]
+//
+svint16_t test_svabal_s16(svint16_t zda, svint8_t zn, svint8_t zm) ATTR
+{
+ return SVE_ACLE_FUNC(svabal,,_s16)(zda, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svabal_n_s16(
+// CHECK-SAME: <vscale x 8 x i16> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], i8 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ZDA_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT: store <vscale x 8 x i16> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: store i8 [[ZM]], ptr [[ZM_ADDR]], align 1
+// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[ZM_ADDR]], align 1
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP2]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.sabal.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z17test_svabal_n_s16u11__SVInt16_tu10__SVInt8_ta(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], i8 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[ZDA_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+// CPP-CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca i8, align 1
+// CPP-CHECK-NEXT: store <vscale x 8 x i16> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 16 x i8> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: store i8 [[ZM]], ptr [[ZM_ADDR]], align 1
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i8>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[ZM_ADDR]], align 1
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP2]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.sabal.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP3]]
+//
+svint16_t test_svabal_n_s16(svint16_t zda, svint8_t zn, int8_t zm) ATTR
+{
+ return SVE_ACLE_FUNC(svabal,_n,_s16)(zda, zn, zm);
+}
+
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svabal_s32(
+// CHECK-SAME: <vscale x 4 x i32> [[ZDA:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ZDA_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT: store <vscale x 4 x i32> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 8 x i16> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 8 x i16>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sabal.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z15test_svabal_s32u11__SVInt32_tu11__SVInt16_tS0_(
+// CPP-CHECK-SAME: <vscale x 4 x i32> [[ZDA:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[ZDA_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT: store <vscale x 4 x i32> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 8 x i16> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 8 x i16>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sabal.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP3]]
+//
+svint32_t test_svabal_s32(svint32_t zda, svint16_t zn, svint16_t zm) ATTR
+{
+ return SVE_ACLE_FUNC(svabal,,_s32)(zda, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svabal_n_s32(
+// CHECK-SAME: <vscale x 4 x i32> [[ZDA:%.*]], <vscale x 8 x i16> [[ZN:%.*]], i16 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ZDA_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT: store <vscale x 4 x i32> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: store i16 [[ZM]], ptr [[ZM_ADDR]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ZM_ADDR]], align 2
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP2]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sabal.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z17test_svabal_n_s32u11__SVInt32_tu11__SVInt16_ts(
+// CPP-CHECK-SAME: <vscale x 4 x i32> [[ZDA:%.*]], <vscale x 8 x i16> [[ZN:%.*]], i16 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[ZDA_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 8 x i16>, align 16
+// CPP-CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca i16, align 2
+// CPP-CHECK-NEXT: store <vscale x 4 x i32> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 8 x i16> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: store i16 [[ZM]], ptr [[ZM_ADDR]], align 2
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 8 x i16>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ZM_ADDR]], align 2
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP2]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sabal.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP3]]
+//
+svint32_t test_svabal_n_s32(svint32_t zda, svint16_t zn, int16_t zm) ATTR
+{
+ return SVE_ACLE_FUNC(svabal,_n,_s32)(zda, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svabal_s64(
+// CHECK-SAME: <vscale x 2 x i64> [[ZDA:%.*]], <vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ZDA_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT: store <vscale x 2 x i64> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 4 x i32> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 4 x i32>, ptr [[ZM_ADDR]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.sabal.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z15test_svabal_s64u11__SVInt64_tu11__SVInt32_tS0_(
+// CPP-CHECK-SAME: <vscale x 2 x i64> [[ZDA:%.*]], <vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[ZDA_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CPP-CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT: store <vscale x 2 x i64> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 4 x i32> [[ZM]], ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 4 x i32>, ptr [[ZM_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.sabal.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP3]]
+//
+svint64_t test_svabal_s64(svint64_t zda, svint32_t zn, svint32_t zm) ATTR
+{
+ return SVE_ACLE_FUNC(svabal,,_s64)(zda, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svabal_n_s64(
+// CHECK-SAME: <vscale x 2 x i64> [[ZDA:%.*]], <vscale x 4 x i32> [[ZN:%.*]], i32 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ZDA_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store <vscale x 2 x i64> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT: store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: store i32 [[ZM]], ptr [[ZM_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZDA_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ZM_ADDR]], align 4
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP2]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.sabal.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP3]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z17test_svabal_n_s64u11__SVInt64_tu11__SVInt32_ti(
+// CPP-CHECK-SAME: <vscale x 2 x i64> [[ZDA:%.*]], <vscale x 4 x i32> [[ZN:%.*]], i32 noundef [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[ZDA_ADDR:%.*]] = alloca <vscale x 2 x i64>, align 16
+// CPP-CHECK-NEXT: [[ZN_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
+// CPP-CHECK-NEXT: [[ZM_ADDR:%.*]] = alloca i32, align 4
+// CPP-CHECK-NEXT: store <vscale x 2 x i64> [[ZDA]], ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT: store <vscale x 4 x i32> [[ZN]], ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: store i32 [[ZM]], ptr [[ZM_ADDR]], align 4
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[ZDA_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[ZN_ADDR]], align 16
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ZM_ADDR]], align 4
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale...
[truncated]
A review comment on these lines of the test:

#ifdef SVE_OVERLOADED_FORMS
// A simple used,unused... macro, long enough to represent any SVE builtin.
#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3_UNUSED) A1
I believe the way you did it works, but I would probably do something like:

#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1
#else
#define SVE_ACLE_FUNC(A1,A2) A1##A2

And below I would put the _n and _{d} suffixes together, like (svabal,_n_s16).
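Spelled out, the suggested variant would look like this (a sketch of the reviewer's suggestion, not code from the patch):

#ifdef SVE_OVERLOADED_FORMS
// Overloaded forms: keep only the base name.
#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1
#else
// Non-overloaded forms: paste the full suffix onto the base name.
#define SVE_ACLE_FUNC(A1,A2) A1##A2
#endif

Call sites then pass the whole suffix as one argument, e.g.
  return SVE_ACLE_FUNC(svabal,_s16)(zda, zn, zm);    // svabal_s16, or svabal when overloaded
  return SVE_ACLE_FUNC(svabal,_n_s16)(zda, zn, zm);  // svabal_n_s16, or svabal when overloaded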