qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v12 4/5] softmmu/dirtylimit: implement virtual CPU throttle


From: Hyman Huang
Subject: Re: [PATCH v12 4/5] softmmu/dirtylimit: implement virtual CPU throttle
Date: Tue, 8 Feb 2022 19:21:37 +0800
User-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Thunderbird/91.5.1



在 2022/2/8 16:59, Peter Xu 写道:
On Mon, Jan 24, 2022 at 10:10:39PM +0800, huangy81@chinatelecom.cn wrote:
From: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>

Setup a negative feedback system when vCPU thread
handling KVM_EXIT_DIRTY_RING_FULL exit by introducing
throttle_us_per_full field in struct CPUState. Sleep
throttle_us_per_full microseconds to throttle vCPU
if dirtylimit is enabled.

Start a thread to track current dirty page rates and
tune the throttle_us_per_full dynamically untill current
dirty page rate reach the quota.

Introduce the util function in the header for dirtylimit
implementation.

Signed-off-by: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
---
  accel/kvm/kvm-all.c         |  13 ++
  accel/stubs/kvm-stub.c      |   5 +
  include/hw/core/cpu.h       |   6 +
  include/sysemu/dirtylimit.h |  16 +++
  include/sysemu/kvm.h        |   2 +
  softmmu/dirtylimit.c        | 308 ++++++++++++++++++++++++++++++++++++++++++++
  softmmu/trace-events        |   8 ++
  7 files changed, 358 insertions(+)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 1a5f1d1..60f51fd 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -45,6 +45,7 @@
  #include "qemu/guest-random.h"
  #include "sysemu/hw_accel.h"
  #include "kvm-cpus.h"
+#include "sysemu/dirtylimit.h"
#include "hw/boards.h" @@ -476,6 +477,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
      cpu->kvm_state = s;
      cpu->vcpu_dirty = true;
      cpu->dirty_pages = 0;
+    cpu->throttle_us_per_full = 0;
mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
      if (mmap_size < 0) {
@@ -1469,6 +1471,11 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
           */
          sleep(1);
+ /* keep sleeping in order to not interfere the dirtylimit */
+        if (dirtylimit_in_service()) {
+            continue;
+        }
+
          trace_kvm_dirty_ring_reaper("wakeup");
          r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
@@ -2312,6 +2319,11 @@ bool kvm_dirty_ring_enabled(void)
      return kvm_state->kvm_dirty_ring_size ? true : false;
  }
+uint32_t kvm_dirty_ring_size(void)
+{
+    return kvm_state->kvm_dirty_ring_size;
+}

Please consider moving this into a small patch too along with the stub.
Ok

+
  static int kvm_init(MachineState *ms)
  {
      MachineClass *mc = MACHINE_GET_CLASS(ms);
@@ -2961,6 +2973,7 @@ int kvm_cpu_exec(CPUState *cpu)
              qemu_mutex_lock_iothread();
              kvm_dirty_ring_reap(kvm_state, cpu);
              qemu_mutex_unlock_iothread();
+            dirtylimit_vcpu_execute(cpu);
              ret = 0;
              break;
          case KVM_EXIT_SYSTEM_EVENT:
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index 5319573..1128cb2 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -152,4 +152,9 @@ bool kvm_dirty_ring_enabled(void)
  {
      return false;
  }
+
+uint32_t kvm_dirty_ring_size(void)
+{
+    return 0;
+}
  #endif
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 76ab3b8..dbeb31a 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -411,6 +411,12 @@ struct CPUState {
       */
      bool throttle_thread_scheduled;
+ /*
+     * Sleep throttle_us_per_full microseconds once dirty ring is full
+     * if dirty page rate limit is enabled.
+     */
+    int64_t throttle_us_per_full;
+
      bool ignore_memory_transaction_failures;
/* Used for user-only emulation of prctl(PR_SET_UNALIGN). */
diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
index da459f0..37e634b 100644
--- a/include/sysemu/dirtylimit.h
+++ b/include/sysemu/dirtylimit.h
@@ -19,4 +19,20 @@ void vcpu_dirty_rate_stat_start(void);
  void vcpu_dirty_rate_stat_stop(void);
  void vcpu_dirty_rate_stat_initialize(void);
  void vcpu_dirty_rate_stat_finalize(void);
+
+void dirtylimit_state_lock(void);
+void dirtylimit_state_unlock(void);
+void dirtylimit_state_initialize(void);
+void dirtylimit_state_finalize(void);
+void dirtylimit_thread_finalize(void);
+bool dirtylimit_in_service(void);
+bool dirtylimit_vcpu_index_valid(int cpu_index);
+void dirtylimit_start(void);
+void dirtylimit_stop(void);
+void dirtylimit_set_vcpu(int cpu_index,
+                         uint64_t quota,
+                         bool enable);
+void dirtylimit_set_all(uint64_t quota,
+                        bool enable);
+void dirtylimit_vcpu_execute(CPUState *cpu);
  #endif
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 6eb39a0..bc3f0b5 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -563,4 +563,6 @@ bool kvm_cpu_check_are_resettable(void);
  bool kvm_arch_cpu_check_are_resettable(void);
bool kvm_dirty_ring_enabled(void);
+
+uint32_t kvm_dirty_ring_size(void);
  #endif
diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index a10ac6f..cf20020 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -18,6 +18,32 @@
  #include "sysemu/dirtylimit.h"
  #include "exec/memory.h"
  #include "hw/boards.h"
+#include "sysemu/kvm.h"
+#include "trace.h"
+
+/*
+ * Dirtylimit stop working if dirty page rate error
+ * value less than DIRTYLIMIT_TOLERANCE_RANGE
+ */
+#define DIRTYLIMIT_TOLERANCE_RANGE  25  /* MB/s */
+/*
+ * Dirty page rate error greater than
+ * DIRTYLIMIT_LINEAR_ADJUSTMENT_WATERMARK should use
+ * linear adjustment police.
+ */
+#define DIRTYLIMIT_LINEAR_ADJUSTMENT_WATERMARK  400  /* MB/s */

Is this magic value really useful?

Asked since it smells like a workaround to avoid oscillation, at the meantime,
AFAICT it's never used, see [1] below.

Ok, my fault, i'll remove this if it doesn't help avoiding oscillation
Again, I really hope we use as less magic values as possible.  Piling up magic
code (without solid proof that it'll help) is scary..

+/*
+ * Plus or minus vcpu sleep time linearly if dirty
+ * page rate error value percentage over
+ * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT.
+ * Otherwise, plus or minus a fixed vcpu sleep time.
+ */
+#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT     50
+/*
+ * Max vcpu sleep time percentage during a cycle
+ * composed of dirty ring full and sleep time.
+ */
+#define DIRTYLIMIT_THROTTLE_PCT_MAX 99
struct {
      VcpuStat stat;
@@ -25,6 +51,31 @@ struct {
      QemuThread thread;
  } *vcpu_dirty_rate_stat;
+typedef struct VcpuDirtyLimitState {
+    int cpu_index;
+    bool enabled;
+    /*
+     * Quota dirty page rate, unit is MB/s
+     * zero if not enabled.
+     */
+    uint64_t quota;
+} VcpuDirtyLimitState;
+
+struct {
+    VcpuDirtyLimitState *states;
+    /* Max cpus number configured by user */
+    int max_cpus;
+    /* Number of vcpu under dirtylimit */
+    int limited_nvcpu;
+    QEMUBH *bh;
+} *dirtylimit_state;
+
+/* protect dirtylimit_state */
+static QemuMutex dirtylimit_mutex;
+
+/* dirtylimit thread quit if dirtylimit_quit is true */
+static bool dirtylimit_quit;
+
  static void vcpu_dirty_rate_stat_collect(void)
  {
      int64_t start_time;
@@ -58,6 +109,7 @@ static void *vcpu_dirty_rate_stat_thread(void *opaque)
while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
          vcpu_dirty_rate_stat_collect();
+        qemu_bh_schedule(dirtylimit_state->bh);

When I mentioned "a hook" previously, I meant something like a function
pointer.  Even simpler, IMHO you could call dirtylimit code directly, assuming
this thread only services dirty limit.

Or is this BH required for some reason?

      }
/* stop log sync */
@@ -118,3 +170,259 @@ void vcpu_dirty_rate_stat_finalize(void)
      free(vcpu_dirty_rate_stat);
      vcpu_dirty_rate_stat = NULL;
  }
+
+void dirtylimit_state_lock(void)
+{
+    qemu_mutex_lock(&dirtylimit_mutex);
+}
+
+void dirtylimit_state_unlock(void)
+{
+    qemu_mutex_unlock(&dirtylimit_mutex);
+}
+
+static void
+__attribute__((__constructor__)) dirtylimit_mutex_init(void)
+{
+    qemu_mutex_init(&dirtylimit_mutex);
+}
+
+static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index)
+{
+    return &dirtylimit_state->states[cpu_index];
+}
+
+void dirtylimit_state_initialize(void)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+    int max_cpus = ms->smp.max_cpus;
+    int i;
+
+    dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));
+
+    dirtylimit_state->states =
+            g_malloc0(sizeof(VcpuDirtyLimitState) * max_cpus);
+
+    for (i = 0; i < max_cpus; i++) {
+        dirtylimit_state->states[i].cpu_index = i;
+    }
+
+    dirtylimit_state->max_cpus = max_cpus;
+    trace_dirtylimit_state_initialize(max_cpus);
+}
+
+void dirtylimit_state_finalize(void)
+{
+    free(dirtylimit_state->states);
+    dirtylimit_state->states = NULL;
+
+    free(dirtylimit_state);
+    dirtylimit_state = NULL;
+
+    trace_dirtylimit_state_finalize();
+}
+
+bool dirtylimit_in_service(void)
+{
+    return !!dirtylimit_state;
+}
+
+bool dirtylimit_vcpu_index_valid(int cpu_index)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+
+    return !(cpu_index < 0 ||
+             cpu_index >= ms->smp.max_cpus);
+}
+
+static inline void dirtylimit_vcpu_set_quota(int cpu_index,
+                                             uint64_t quota,
+                                             bool on)
+{
+    if (on) {
+        dirtylimit_state->states[cpu_index].quota = quota;
+        if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
+            dirtylimit_state->limited_nvcpu++;
+        }
+    } else {
+        dirtylimit_state->states[cpu_index].quota = 0;
+        if (dirtylimit_state->states[cpu_index].enabled) {
+            dirtylimit_state->limited_nvcpu--;
+        }
+    }
+
+    dirtylimit_state->states[cpu_index].enabled = on;
+}
+
+static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
+{
+    static uint64_t max_dirtyrate;
+    uint32_t dirty_ring_size = kvm_dirty_ring_size();
+    uint64_t dirty_ring_size_meory_MB =
+        dirty_ring_size * TARGET_PAGE_SIZE >> 20;
+
+    if (max_dirtyrate < dirtyrate) {
+        max_dirtyrate = dirtyrate;
+    }
+
+    return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate;
+}
+
+static inline bool dirtylimit_done(uint64_t quota,
+                                   uint64_t current)
+{
+    uint64_t min, max;
+
+    min = MIN(quota, current);
+    max = MAX(quota, current);
+
+    return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false;
+}
+
+static inline bool
+dirtylimit_need_linear_adjustment(uint64_t quota,
+                                  uint64_t current)
+{
+    uint64_t min, max, pct, error;
+
+    min = MIN(quota, current);
+    max = MAX(quota, current);
+
+    error = (max - min);
+
+    if (error < DIRTYLIMIT_LINEAR_ADJUSTMENT_WATERMARK) {
+        false;

[1]

I think you wanted to use "return false"... Maybe this also means the macro
never helped at all hence it can be dropped.
Yes, :( dropping this block can be a good choice.

+    }
+
+    pct = error * 100 / max;
+
+    return pct > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
+}

Thanks,


--
Best regard

Hyman Huang(黄勇)



reply via email to

[Prev in Thread] Current Thread [Next in Thread]