qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 03/10] hw/rdma: Warn when too many consecutive p


From: Yuval Shaia
Subject: Re: [Qemu-devel] [PATCH 03/10] hw/rdma: Warn when too many consecutive poll CQ triggered on an empty CQ
Date: Wed, 6 Feb 2019 16:59:50 +0200
User-agent: Mutt/1.10.1 (2018-07-13)

On Wed, Feb 06, 2019 at 12:14:24PM +0200, Marcel Apfelbaum wrote:
> Hi Yuval,
> 
> On 1/31/19 3:08 PM, Yuval Shaia wrote:
> > To protect against CPU over utilization when guest performs unneeded
> > busy waiting loop on an empty CQ.
> > 
> > Signed-off-by: Yuval Shaia <address@hidden>
> > ---
> >   hw/rdma/rdma_backend.c      | 11 +++++++----
> >   hw/rdma/rdma_backend.h      |  2 +-
> >   hw/rdma/rdma_rm.c           |  1 +
> >   hw/rdma/rdma_rm_defs.h      |  6 +++++-
> >   hw/rdma/vmw/pvrdma_qp_ops.c | 24 +++++++++++++++++++++++-
> >   5 files changed, 37 insertions(+), 7 deletions(-)
> > 
> > diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
> > index 2f6372f8f0..b7d6afb5da 100644
> > --- a/hw/rdma/rdma_backend.c
> > +++ b/hw/rdma/rdma_backend.c
> > @@ -64,9 +64,9 @@ static inline void complete_work(enum ibv_wc_status 
> > status, uint32_t vendor_err,
> >       comp_handler(ctx, &wc);
> >   }
> > -static void rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq 
> > *ibcq)
> > +static int rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq 
> > *ibcq)
> >   {
> > -    int i, ne;
> > +    int i, ne, total_ne = 0;
> >       BackendCtx *bctx;
> >       struct ibv_wc wc[2];
> > @@ -76,6 +76,7 @@ static void rdma_poll_cq(RdmaDeviceResources 
> > *rdma_dev_res, struct ibv_cq *ibcq)
> >           trace_rdma_poll_cq(ne, ibcq);
> >           for (i = 0; i < ne; i++) {
> > +            total_ne++;
> 
> It seems 'i' and 'total_ne' hold the same value, do you need them both?

The scope is different: 'i' is limited to a single ibv_poll_cq call, while
'total_ne' keeps counting until the whole ibv_poll_cq loop is done.

> 
> >               bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, wc[i].wr_id);
> >               if (unlikely(!bctx)) {
> >                   rdma_error_report("No matching ctx for req %"PRId64,
> > @@ -93,6 +94,8 @@ static void rdma_poll_cq(RdmaDeviceResources 
> > *rdma_dev_res, struct ibv_cq *ibcq)
> >       if (ne < 0) {
> >           rdma_error_report("ibv_poll_cq fail, rc=%d, errno=%d", ne, errno);
> >       }
> > +
> > +    return total_ne;
> >   }
> >   static void *comp_handler_thread(void *arg)
> > @@ -267,9 +270,9 @@ int rdma_backend_query_port(RdmaBackendDev *backend_dev,
> >       return 0;
> >   }
> > -void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ 
> > *cq)
> > +int rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ 
> > *cq)
> >   {
> > -    rdma_poll_cq(rdma_dev_res, cq->ibcq);
> > +    return rdma_poll_cq(rdma_dev_res, cq->ibcq);
> >   }
> >   static GHashTable *ah_hash;
> > diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h
> > index 5114c90e67..36305cd148 100644
> > --- a/hw/rdma/rdma_backend.h
> > +++ b/hw/rdma/rdma_backend.h
> > @@ -85,7 +85,7 @@ void rdma_backend_destroy_mr(RdmaBackendMR *mr);
> >   int rdma_backend_create_cq(RdmaBackendDev *backend_dev, RdmaBackendCQ *cq,
> >                              int cqe);
> >   void rdma_backend_destroy_cq(RdmaBackendCQ *cq);
> > -void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ 
> > *cq);
> > +int rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ 
> > *cq);
> >   int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
> >                              RdmaBackendPD *pd, RdmaBackendCQ *scq,
> > diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
> > index 64c6ea1a4e..1ba77ac42c 100644
> > --- a/hw/rdma/rdma_rm.c
> > +++ b/hw/rdma/rdma_rm.c
> > @@ -261,6 +261,7 @@ int rdma_rm_alloc_cq(RdmaDeviceResources *dev_res, 
> > RdmaBackendDev *backend_dev,
> >       if (!cq) {
> >           return -ENOMEM;
> >       }
> > +    atomic_set(&cq->missing_cqe, 0);
> >       cq->opaque = opaque;
> >       cq->notify = CNT_CLEAR;
> > diff --git a/hw/rdma/rdma_rm_defs.h b/hw/rdma/rdma_rm_defs.h
> > index 0ba61d1838..08692e87d4 100644
> > --- a/hw/rdma/rdma_rm_defs.h
> > +++ b/hw/rdma/rdma_rm_defs.h
> > @@ -34,7 +34,9 @@
> >   #define MAX_QP_INIT_RD_ATOM   16
> >   #define MAX_AH                64
> > -#define MAX_RM_TBL_NAME 16
> > +#define MAX_RM_TBL_NAME             16
> > +#define MAX_CONSEQ_EMPTY_POLL_CQ    2048 /* considered as error above this 
> > */
> > +
> >   typedef struct RdmaRmResTbl {
> >       char name[MAX_RM_TBL_NAME];
> >       QemuMutex lock;
> > @@ -59,6 +61,8 @@ typedef struct RdmaRmCQ {
> >       RdmaBackendCQ backend_cq;
> >       void *opaque;
> >       CQNotificationType notify;
> > +    int missing_cqe;
> 
> Maybe cq_empty_hit_cnt ? We don't really have a missing cqe.

The idea here is to count the CQEs we expect to receive but that are still
missing, so the name is fine.

> 
> > +    int conseq_empty_poll;
> >   } RdmaRmCQ;
> >   /* MR (DMA region) */
> > diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c
> > index 16db726dac..5d650a4943 100644
> > --- a/hw/rdma/vmw/pvrdma_qp_ops.c
> > +++ b/hw/rdma/vmw/pvrdma_qp_ops.c
> > @@ -60,6 +60,8 @@ static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t 
> > cq_handle,
> >           return -EINVAL;
> >       }
> > +    atomic_dec(&cq->missing_cqe);
> > +
> 
> We should set it to 0 here? (If we are counting cq-empty hits)
> 
> >       ring = (PvrdmaRing *)cq->opaque;
> >       /* Step #1: Put CQE on CQ ring */
> > @@ -141,12 +143,15 @@ void pvrdma_qp_send(PVRDMADev *dev, uint32_t 
> > qp_handle)
> >       PvrdmaRing *ring;
> >       int sgid_idx;
> >       union ibv_gid *sgid;
> > +    RdmaRmCQ *cq;
> >       qp = rdma_rm_get_qp(&dev->rdma_dev_res, qp_handle);
> >       if (unlikely(!qp)) {
> >           return;
> >       }
> > +    cq = rdma_rm_get_cq(&dev->rdma_dev_res, qp->send_cq_handle);
> > +
> >       ring = (PvrdmaRing *)qp->opaque;
> >       wqe = (struct PvrdmaSqWqe *)pvrdma_ring_next_elem_read(ring);
> > @@ -186,6 +191,7 @@ void pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle)
> >               continue;
> >           }
> > +        atomic_inc(&cq->missing_cqe);
> >           rdma_backend_post_send(&dev->backend_dev, &qp->backend_qp, 
> > qp->qp_type,
> >                                  (struct ibv_sge *)&wqe->sge[0], 
> > wqe->hdr.num_sge,
> >                                  sgid_idx, sgid,
> > @@ -204,12 +210,15 @@ void pvrdma_qp_recv(PVRDMADev *dev, uint32_t 
> > qp_handle)
> >       RdmaRmQP *qp;
> >       PvrdmaRqWqe *wqe;
> >       PvrdmaRing *ring;
> > +    RdmaRmCQ *cq;
> >       qp = rdma_rm_get_qp(&dev->rdma_dev_res, qp_handle);
> >       if (unlikely(!qp)) {
> >           return;
> >       }
> > +    cq = rdma_rm_get_cq(&dev->rdma_dev_res, qp->recv_cq_handle);
> > +
> >       ring = &((PvrdmaRing *)qp->opaque)[1];
> >       wqe = (struct PvrdmaRqWqe *)pvrdma_ring_next_elem_read(ring);
> > @@ -231,6 +240,7 @@ void pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle)
> >               continue;
> >           }
> > +        atomic_inc(&cq->missing_cqe);
> >           rdma_backend_post_recv(&dev->backend_dev, &dev->rdma_dev_res,
> >                                  &qp->backend_qp, qp->qp_type,
> >                                  (struct ibv_sge *)&wqe->sge[0], 
> > wqe->hdr.num_sge,
> > @@ -245,11 +255,23 @@ void pvrdma_qp_recv(PVRDMADev *dev, uint32_t 
> > qp_handle)
> >   void pvrdma_cq_poll(RdmaDeviceResources *dev_res, uint32_t cq_handle)
> >   {
> >       RdmaRmCQ *cq;
> > +    int polled;
> >       cq = rdma_rm_get_cq(dev_res, cq_handle);
> >       if (!cq) {
> >           return;
> >       }
> > -    rdma_backend_poll_cq(dev_res, &cq->backend_cq);
> > +    polled = rdma_backend_poll_cq(dev_res, &cq->backend_cq);
> > +    if (!polled) {
> > +        if (cq->conseq_empty_poll == MAX_CONSEQ_EMPTY_POLL_CQ) {
> > +            rdma_warn_report("%d consequtive empty polls from CQ %d, 
> > missing cqe %d",
> > +                             cq->conseq_empty_poll, cq_handle,
> > +                             atomic_read(&cq->missing_cqe));
> > +            cq->conseq_empty_poll = 0;
> > +        }
> > +        cq->conseq_empty_poll++;
> > +    } else {
> > +        cq->conseq_empty_poll = 0;
> > +    }
> >   }
> 
> So we don't really protect against high CPU usage, we only warn.

Correct, it is just a warning.
However, I accept your suggestion of exposing this as another counter and
getting rid of the warning.

> Are the both counters interesting?

Both? The 'conseq' and 'missing' counters?
The idea here is that 'missing' counts the total number of missing CQEs,
while 'conseq' (short for consecutive) counts the number of times the guest
issues consecutive polls on an empty CQ (please note how it is zeroed when
poll_cq returns something).

> 
> Thanks,
> Marcel
> 
> 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]