[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH 3/4] target/arm: Convert PMULL.64 to gvec
From: Alex Bennée
Subject: Re: [PATCH 3/4] target/arm: Convert PMULL.64 to gvec
Date: Fri, 18 Oct 2019 14:40:15 +0100
User-agent: mu4e 1.3.5; emacs 27.0.50
Alex Bennée <address@hidden> writes:
> Richard Henderson <address@hidden> writes:
>
>> The gvec form will be needed for implementing SVE2.
>
> Hmm I'm seeing a failure against:
>
> aarch32-all-v80/insn_VMULL__INC.risu.bin
I take it back: after monkey-patching cortex-a53 into qemu-arm, it
passes.
Reviewed-by: Alex Bennée <address@hidden>
Tested-by: Alex Bennée <address@hidden>
>
> From:
>
> https://fileserver.linaro.org/owncloud/index.php/s/hvEXM2eJ3uZVhlH
>
> https://fileserver.linaro.org/owncloud/index.php/s/hvEXM2eJ3uZVhlH/download?path=%2F&files=aarch32-all-v80.tar.xz
>
> And some others. But this seems to be broken in master as well so I
> don't know if this is a regression or because I have my -cpu wrong for
> qemu-arm for something recorded on a cortex-a53 in aarch32.
>
>>
>> Signed-off-by: Richard Henderson <address@hidden>
>> ---
>> target/arm/helper.h | 4 +---
>> target/arm/neon_helper.c | 30 ------------------------------
>> target/arm/translate-a64.c | 28 +++-------------------------
>> target/arm/translate.c | 16 ++--------------
>> target/arm/vec_helper.c | 33 +++++++++++++++++++++++++++++++++
>> 5 files changed, 39 insertions(+), 72 deletions(-)
>>
>> diff --git a/target/arm/helper.h b/target/arm/helper.h
>> index 800446e537..d954399b7e 100644
>> --- a/target/arm/helper.h
>> +++ b/target/arm/helper.h
>> @@ -555,9 +555,6 @@ DEF_HELPER_FLAGS_3(crc32, TCG_CALL_NO_RWG_SE, i32, i32,
>> i32, i32)
>> DEF_HELPER_FLAGS_3(crc32c, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32)
>> DEF_HELPER_2(dc_zva, void, env, i64)
>>
>> -DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64)
>> -DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64)
>> -
>> DEF_HELPER_FLAGS_5(gvec_qrdmlah_s16, TCG_CALL_NO_RWG,
>> void, ptr, ptr, ptr, ptr, i32)
>> DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s16, TCG_CALL_NO_RWG,
>> @@ -689,6 +686,7 @@ DEF_HELPER_FLAGS_4(gvec_ushl_b, TCG_CALL_NO_RWG, void,
>> ptr, ptr, ptr, i32)
>> DEF_HELPER_FLAGS_4(gvec_ushl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
>>
>> DEF_HELPER_FLAGS_4(gvec_pmul_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
>> +DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
>>
>> #ifdef TARGET_AARCH64
>> #include "helper-a64.h"
>> diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c
>> index 9e7a9a1ac5..6a107da0e1 100644
>> --- a/target/arm/neon_helper.c
>> +++ b/target/arm/neon_helper.c
>> @@ -2152,33 +2152,3 @@ void HELPER(neon_zip16)(void *vd, void *vm)
>> rm[0] = m0;
>> rd[0] = d0;
>> }
>> -
>> -/* Helper function for 64 bit polynomial multiply case:
>> - * perform PolynomialMult(op1, op2) and return either the top or
>> - * bottom half of the 128 bit result.
>> - */
>> -uint64_t HELPER(neon_pmull_64_lo)(uint64_t op1, uint64_t op2)
>> -{
>> - int bitnum;
>> - uint64_t res = 0;
>> -
>> - for (bitnum = 0; bitnum < 64; bitnum++) {
>> - if (op1 & (1ULL << bitnum)) {
>> - res ^= op2 << bitnum;
>> - }
>> - }
>> - return res;
>> -}
>> -uint64_t HELPER(neon_pmull_64_hi)(uint64_t op1, uint64_t op2)
>> -{
>> - int bitnum;
>> - uint64_t res = 0;
>> -
>> - /* bit 0 of op1 can't influence the high 64 bits at all */
>> - for (bitnum = 1; bitnum < 64; bitnum++) {
>> - if (op1 & (1ULL << bitnum)) {
>> - res ^= op2 >> (64 - bitnum);
>> - }
>> - }
>> - return res;
>> -}
>> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
>> index 04e25cfe06..12588d18df 100644
>> --- a/target/arm/translate-a64.c
>> +++ b/target/arm/translate-a64.c
>> @@ -10598,30 +10598,6 @@ static void handle_3rd_narrowing(DisasContext *s,
>> int is_q, int is_u, int size,
>> clear_vec_high(s, is_q, rd);
>> }
>>
>> -static void handle_pmull_64(DisasContext *s, int is_q, int rd, int rn, int
>> rm)
>> -{
>> - /* PMULL of 64 x 64 -> 128 is an odd special case because it
>> - * is the only three-reg-diff instruction which produces a
>> - * 128-bit wide result from a single operation. However since
>> - * it's possible to calculate the two halves more or less
>> - * separately we just use two helper calls.
>> - */
>> - TCGv_i64 tcg_op1 = tcg_temp_new_i64();
>> - TCGv_i64 tcg_op2 = tcg_temp_new_i64();
>> - TCGv_i64 tcg_res = tcg_temp_new_i64();
>> -
>> - read_vec_element(s, tcg_op1, rn, is_q, MO_64);
>> - read_vec_element(s, tcg_op2, rm, is_q, MO_64);
>> - gen_helper_neon_pmull_64_lo(tcg_res, tcg_op1, tcg_op2);
>> - write_vec_element(s, tcg_res, rd, 0, MO_64);
>> - gen_helper_neon_pmull_64_hi(tcg_res, tcg_op1, tcg_op2);
>> - write_vec_element(s, tcg_res, rd, 1, MO_64);
>> -
>> - tcg_temp_free_i64(tcg_op1);
>> - tcg_temp_free_i64(tcg_op2);
>> - tcg_temp_free_i64(tcg_res);
>> -}
>> -
>> /* AdvSIMD three different
>> * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0
>> * +---+---+---+-----------+------+---+------+--------+-----+------+------+
>> @@ -10686,7 +10662,9 @@ static void disas_simd_three_reg_diff(DisasContext
>> *s, uint32_t insn)
>> if (!fp_access_check(s)) {
>> return;
>> }
>> - handle_pmull_64(s, is_q, rd, rn, rm);
>> + /* The Q field specifies lo/hi half input for this insn. */
>> + gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
>> + gen_helper_gvec_pmull_q);
>> return;
>> }
>> goto is_widening;
>> diff --git a/target/arm/translate.c b/target/arm/translate.c
>> index b66a2f6b71..4e34249672 100644
>> --- a/target/arm/translate.c
>> +++ b/target/arm/translate.c
>> @@ -5877,23 +5877,11 @@ static int disas_neon_data_insn(DisasContext *s,
>> uint32_t insn)
>> * outside the loop below as it only performs a single pass.
>> */
>> if (op == 14 && size == 2) {
>> - TCGv_i64 tcg_rn, tcg_rm, tcg_rd;
>> -
>> if (!dc_isar_feature(aa32_pmull, s)) {
>> return 1;
>> }
>> - tcg_rn = tcg_temp_new_i64();
>> - tcg_rm = tcg_temp_new_i64();
>> - tcg_rd = tcg_temp_new_i64();
>> - neon_load_reg64(tcg_rn, rn);
>> - neon_load_reg64(tcg_rm, rm);
>> - gen_helper_neon_pmull_64_lo(tcg_rd, tcg_rn, tcg_rm);
>> - neon_store_reg64(tcg_rd, rd);
>> - gen_helper_neon_pmull_64_hi(tcg_rd, tcg_rn, tcg_rm);
>> - neon_store_reg64(tcg_rd, rd + 1);
>> - tcg_temp_free_i64(tcg_rn);
>> - tcg_temp_free_i64(tcg_rm);
>> - tcg_temp_free_i64(tcg_rd);
>> + tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
>> + 0, gen_helper_gvec_pmull_q);
>> return 0;
>> }
>>
>> diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
>> index d401282c6f..5c1074374e 100644
>> --- a/target/arm/vec_helper.c
>> +++ b/target/arm/vec_helper.c
>> @@ -1164,3 +1164,36 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void
>> *vm, uint32_t desc)
>> }
>> clear_tail(d, opr_sz, simd_maxsz(desc));
>> }
>> +
>> +/*
>> + * 64x64->128 polynomial multiply.
>> + * Because the lanes are not accessed in strict columns,
>> + * this probably cannot be turned into a generic helper.
>> + */
>> +void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
>> +{
>> + intptr_t i, j, opr_sz = simd_oprsz(desc);
>> + intptr_t hi = simd_data(desc);
>> + uint64_t *d = vd, *n = vn, *m = vm;
>> +
>> + for (i = 0; i < opr_sz / 8; i += 2) {
>> + uint64_t nn = n[i + hi];
>> + uint64_t mm = m[i + hi];
>> + uint64_t rhi = 0;
>> + uint64_t rlo = 0;
>> +
>> + /* Bit 0 can only influence the low 64-bit result. */
>> + if (nn & 1) {
>> + rlo = mm;
>> + }
>> +
>> + for (j = 1; j < 64; ++j) {
>> + uint64_t mask = -((nn >> j) & 1);
>> + rlo ^= (mm << j) & mask;
>> + rhi ^= (mm >> (64 - j)) & mask;
>> + }
>> + d[i] = rlo;
>> + d[i + 1] = rhi;
>> + }
>> + clear_tail(d, opr_sz, simd_maxsz(desc));
>> +}
--
Alex Bennée
- [PATCH 0/4] target/arm vector improvements, Richard Henderson, 2019/10/17
- [PATCH 2/4] target/arm: Convert PMUL.8 to gvec, Richard Henderson, 2019/10/17
- [PATCH 1/4] target/arm: Vectorize USHL and SSHL, Richard Henderson, 2019/10/17
- [PATCH 3/4] target/arm: Convert PMULL.64 to gvec, Richard Henderson, 2019/10/17
- [PATCH 4/4] target/arm: Convert PMULL.8 to gvec, Richard Henderson, 2019/10/17
- Re: [PATCH 0/4] target/arm vector improvements, no-reply, 2019/10/17
- Re: [PATCH 0/4] target/arm vector improvements, Alex Bennée, 2019/10/18