qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [RFC PATCH 3/3] spapr: implement nested-hv support for the TCG virtu


From: Cédric Le Goater
Subject: Re: [RFC PATCH 3/3] spapr: implement nested-hv support for the TCG virtual hypervisor
Date: Mon, 14 Feb 2022 19:31:44 +0100
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Thunderbird/91.3.0

On 2/10/22 07:53, Nicholas Piggin wrote:
This implements the nested-hv hcall API for spapr under TCG.
It's still a bit rough around the edges, but the concept seems to work.

Some HV exceptions can be raised now in the TCG spapr machine when
running a nested guest. The main ones are the lev==1 syscall, the
hdecr, hdsi and hisi, and h_virt external interrupts. These are
dealt with in the interrupt delivery code by noticing MSR[HV] raised
and instead of switching the machine to HV mode, it exits the
H_ENTER_NESTED hcall with the interrupt vector as return value as
required by the hcall API.

Address translation is provided by the 2-level page table walker
that is implemented for the pnv machine. The partition scope page
table is pointed to the L1's partition scope, and a few tests have
to take into account that nested-hv translations are 2-level. This
could perhaps be tidied up a bit e.g., with a 'bool two_level = ...'
but it's surprisingly little code.

There is no TLB tagging between L1 and L2 translations at the moment
so the TLB is flushed on any L1<->L2 transition (hcall entry and exit).

XXX: stop doing atomic RC on page table walks (not for nested but in general)

not-yet-Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
  hw/ppc/ppc.c               |  20 +++
  hw/ppc/spapr.c             |  16 ++
  hw/ppc/spapr_caps.c        |   5 +-
  hw/ppc/spapr_hcall.c       | 316 +++++++++++++++++++++++++++++++++++++
  include/hw/ppc/ppc.h       |   3 +
  include/hw/ppc/spapr.h     |  75 ++++++++-
  target/ppc/cpu.h           |   6 +
  target/ppc/excp_helper.c   |  60 ++++---
  target/ppc/helper_regs.c   |   1 +
  target/ppc/mmu-book3s-v3.c |  20 ++-
  target/ppc/mmu-radix64.c   |  15 +-
  11 files changed, 499 insertions(+), 38 deletions(-)

diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index a7c262db93..135900a6f4 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -1083,6 +1083,26 @@ clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t 
freq)
      return &cpu_ppc_set_tb_clk;
  }
+void cpu_ppc_hdecr_init (CPUPPCState *env)
+{
+    PowerPCCPU *cpu = env_archcpu(env);
+
+    assert(env->tb_env->hdecr_timer == NULL);
+
+    env->tb_env->hdecr_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, 
&cpu_ppc_hdecr_cb,
+                                                 cpu);
+}
+
+void cpu_ppc_hdecr_exit (CPUPPCState *env)
+{
+    PowerPCCPU *cpu = env_archcpu(env);
+
+    timer_free(env->tb_env->hdecr_timer);
+    env->tb_env->hdecr_timer = NULL;
+
+    cpu_ppc_hdecr_lower(cpu);
+}


So these are called every time a L2 enters or exits ?

  /* Specific helpers for POWER & PowerPC 601 RTC */
  void cpu_ppc601_store_rtcu (CPUPPCState *env, uint32_t value)
  {
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 3d6ec309dd..f0c3f726f2 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1273,6 +1273,8 @@ static void emulate_spapr_hypercall(PPCVirtualHypervisor 
*vhyp,
      if (msr_pr) {
          hcall_dprintf("Hypercall made with MSR[PR]=1\n");
          env->gpr[3] = H_PRIVILEGE;
+    } else if (env->gpr[3] == KVMPPC_H_ENTER_NESTED) {
+        spapr_enter_nested(cpu);

Can not this be in the hcall table ?

      } else {
          env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
      }
@@ -4465,6 +4467,17 @@ PowerPCCPU *spapr_find_cpu(int vcpu_id)
      return NULL;
  }
+static bool spapr_cpu_in_nested(PowerPCCPU *cpu)
+{
+    return cpu->in_spapr_nested;
+}

This handler is not very much used.

+static target_ulong spapr_get_nested_ptcr(PowerPCCPU *cpu, target_ulong lpid)
+{
+        SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+        return spapr->nested_ptcr;
+}
+
  static void spapr_cpu_exec_enter(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
  {
      SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
@@ -4573,6 +4586,9 @@ static void spapr_machine_class_init(ObjectClass *oc, 
void *data)
      fwc->get_dev_path = spapr_get_fw_dev_path;
      nc->nmi_monitor_handler = spapr_nmi;
      smc->phb_placement = spapr_phb_placement;
+    vhc->cpu_in_nested = spapr_cpu_in_nested;
+    vhc->get_nested_ptcr = spapr_get_nested_ptcr;
+    vhc->exit_nested = spapr_exit_nested;
      vhc->hypercall = emulate_spapr_hypercall;
      vhc->hpt_mask = spapr_hpt_mask;
      vhc->map_hptes = spapr_map_hptes;
diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
index ed7c077a0d..a665245f6f 100644
--- a/hw/ppc/spapr_caps.c
+++ b/hw/ppc/spapr_caps.c
@@ -450,10 +450,7 @@ static void cap_nested_kvm_hv_apply(SpaprMachineState 
*spapr,
          return;
      }
- if (tcg_enabled()) {
-        error_setg(errp, "No Nested KVM-HV support in TCG");
-        error_append_hint(errp, "Try appending -machine cap-nested-hv=off\n");
-    } else if (kvm_enabled()) {
+    if (!tcg_enabled()) {
          if (!ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0,
                                spapr->max_compat_pvr)) {
              error_setg(errp, "Nested KVM-HV only supported on POWER9");
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 222c1b6bbd..8ffb13ada0 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -9,6 +9,7 @@
  #include "qemu/error-report.h"
  #include "exec/exec-all.h"
  #include "helper_regs.h"
+#include "hw/ppc/ppc.h"
  #include "hw/ppc/spapr.h"
  #include "hw/ppc/spapr_cpu_core.h"
  #include "mmu-hash64.h"
@@ -1497,6 +1498,317 @@ static void hypercall_register_softmmu(void)
  }
  #endif
+/* TCG only */
+#define PRTS_MASK      0x1f
+
+static target_ulong h_set_ptbl(PowerPCCPU *cpu,
+                               SpaprMachineState *spapr,
+                               target_ulong opcode,
+                               target_ulong *args)
+{
+    target_ulong ptcr = args[0];
+
+    if (!spapr_get_cap(spapr, SPAPR_CAP_NESTED_KVM_HV)) {
+        return H_FUNCTION;
+    }
+
+    if ((ptcr & PRTS_MASK) + 12 - 4 > 12) {
+        return H_PARAMETER;
+    }
+
+    spapr->nested_ptcr = ptcr; /* Save new partition table */
+
+    return H_SUCCESS;
+}
+
+static target_ulong h_tlb_invalidate(PowerPCCPU *cpu,
+                               SpaprMachineState *spapr,
+                               target_ulong opcode,
+                               target_ulong *args)
+{
+    CPUState *cs = CPU(cpu);
+
+    /*
+     * The spapr virtual hypervisor nested HV implementation retains no
+     * translation state except for TLB. This might be optimised to
+     * invalidate fewer entries, but at the moment it's not important
+     * because L1<->L2 transitions always flush the entire TLB for now.
+     */
+    tlb_flush(cs);
+
+    return H_SUCCESS;
+}
+
+static target_ulong h_copy_tofrom_guest(PowerPCCPU *cpu,
+                               SpaprMachineState *spapr,
+                               target_ulong opcode,
+                               target_ulong *args)
+{
+    /*
+     * This HCALL is not required, L1 KVM will take a slow path and walk the
+     * page tables manually to do the data copy.
+     */
+    return H_FUNCTION;
+}
+
+void spapr_enter_nested(PowerPCCPU *cpu)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
+    CPUState *cs = CPU(cpu);
+    CPUPPCState *env = &cpu->env;
+    target_ulong hv_ptr = env->gpr[4];
+    target_ulong regs_ptr = env->gpr[5];
+    target_ulong hdec, now = cpu_ppc_load_tbl(env);
+    struct kvmppc_hv_guest_state *hvstate;
+    struct kvmppc_hv_guest_state hv_state;
+    struct kvmppc_pt_regs *regs;
+    hwaddr len;
+    uint32_t cr;
+    int i;
+
+    if (cpu->in_spapr_nested) {
+        env->gpr[3] = H_FUNCTION;
+        return;
+    }
+    if (spapr->nested_ptcr == 0) {
+        env->gpr[3] = H_NOT_AVAILABLE;
+        return;
+    }
+
+    len = sizeof(*hvstate);
+ hvstate = cpu_physical_memory_map(hv_ptr, &len,

Are you writing to the state? address_space_map() is a better practice.


+    if (!hvstate || len != sizeof(*hvstate)) {
+        env->gpr[3] = H_PARAMETER;
+        return;
+    }
+
+    memcpy(&hv_state, hvstate, len);
+
+    cpu_physical_memory_unmap(hvstate, len, 0 /* read */, len /* access len 
*/);
+
+    if (hv_state.version != HV_GUEST_STATE_VERSION) {
+        env->gpr[3] = H_PARAMETER;
+        return;
+    }
+
+    cpu->nested_host_state = g_try_malloc(sizeof(CPUPPCState));
+    if (!cpu->nested_host_state) {
+        env->gpr[3] = H_NO_MEM;
+        return;
+    }
+
+    memcpy(cpu->nested_host_state, env, sizeof(CPUPPCState));
+
+    len = sizeof(*regs);
+    regs = cpu_physical_memory_map(regs_ptr, &len, true);
+    if (!regs || len != sizeof(*regs)) {
+        g_free(cpu->nested_host_state);
+        env->gpr[3] = H_P2;
+        return;
+    }
+
+    len = sizeof(env->gpr);
+    assert(len == sizeof(regs->gpr));
+    memcpy(env->gpr, regs->gpr, len);
+
+    env->lr = regs->link;
+    env->ctr = regs->ctr;
+    cpu_write_xer(env, regs->xer);
+
+    cr = regs->ccr;
+    for (i = 7; i >= 0; i--) {
+        env->crf[i] = cr & 15;
+        cr >>= 4;
+    }
+
+    env->msr = regs->msr;
+    env->nip = regs->nip;
+
+    cpu_physical_memory_unmap(regs, len, 0 /* read */, len /* access len */);
+
+    env->cfar = hv_state.cfar;
+
+    assert(env->spr[SPR_LPIDR] == 0);
+    env->spr[SPR_LPCR] = hv_state.lpcr & pcc->lpcr_mask; // XXX any other mask?
+    env->spr[SPR_LPIDR] = hv_state.lpid;
+    env->spr[SPR_PCR] = hv_state.pcr;
+//    env->spr[SPR_AMOR] = hv_state.amor;
+    env->spr[SPR_DPDES] = hv_state.dpdes;
+    env->spr[SPR_HFSCR] = hv_state.hfscr;
+    hdec = hv_state.hdec_expiry - now;
+    env->tb_env->tb_offset += hv_state.tb_offset; // XXX how to deal?
+    // dec already set
+//  DAWRetc, CIABR, [S]PURR, IC
+    env->spr[SPR_VTB] = hv_state.vtb;
+//    env->spr[SPR_HEIR] = hv_state.heir;
+    env->spr[SPR_SRR0] = hv_state.srr0;
+    env->spr[SPR_SRR1] = hv_state.srr1;
+    env->spr[SPR_SPRG0] = hv_state.sprg[0];
+    env->spr[SPR_SPRG1] = hv_state.sprg[1];
+    env->spr[SPR_SPRG2] = hv_state.sprg[2];
+    env->spr[SPR_SPRG3] = hv_state.sprg[3];
+    env->spr[SPR_BOOKS_PID] = hv_state.pidr;
+    env->spr[SPR_PPR] = hv_state.ppr;
+
+    cpu_ppc_hdecr_init(env);
+    cpu_ppc_store_hdecr(env, hdec);
+
+    /*
+     * The hv_state.vcpu_token is not needed. It is used by the KVM
+     * implementation to remember which L2 vCPU last ran on which physical
+     * CPU so as to invalidate process scope translations if it is moved
+     * between physical CPUs. For now TLBs are always flushed on L1<->L2
+     * transitions so this is not a problem.
+     *
+     * Could validate that the same vcpu_token does not attempt to run on
+     * different L1 vCPUs at the same time, but that would be a L1 KVM bug
+     * and it's not obviously worth a new data structure to do it.
+     */
+
+    cpu->in_spapr_nested = true;
+
+    hreg_compute_hflags(env);
+    tlb_flush(cs);
+}
+
+void spapr_exit_nested(PowerPCCPU *cpu, int excp)
+{
+    CPUState *cs = CPU(cpu);
+    CPUPPCState *env = &cpu->env;
+    target_ulong r3_return = env->excp_vectors[excp]; // hcall return value
+    target_ulong hv_ptr = cpu->nested_host_state->gpr[4];
+    target_ulong regs_ptr = cpu->nested_host_state->gpr[5];
+    struct kvmppc_hv_guest_state *hvstate;
+    struct kvmppc_pt_regs *regs;
+    hwaddr len;
+    int i;
+
+    assert(cpu->in_spapr_nested);
+    cpu->in_spapr_nested = false;
+
+    cpu_ppc_hdecr_exit(env);
+
+    len = sizeof(*hvstate);
+    hvstate = cpu_physical_memory_map(hv_ptr, &len, true);
+    if (!hvstate || len != sizeof(*hvstate)) {
+        r3_return = H_PARAMETER;
+        goto out_restore_l1;
+    }
+
+    //XXX check linux kvm nested CTRL reg bug?
+    //
+    env->tb_env->tb_offset -= hvstate->tb_offset;
+
+    hvstate->cfar = env->cfar;
+    hvstate->lpcr = env->spr[SPR_LPCR];
+    hvstate->pcr = env->spr[SPR_PCR];
+//    hvstate->amor = env->spr[SPR_AMOR];
+    hvstate->dpdes = env->spr[SPR_DPDES];
+    hvstate->hfscr = env->spr[SPR_HFSCR];
+//    hvstate-> = // dec already set
+//  DAWRetc, CIABR, [S]PURR, IC
+    hvstate->vtb = env->spr[SPR_VTB];
+
+    if (excp == POWERPC_EXCP_HDSI) {
+        hvstate->hdar = env->spr[SPR_HDAR];
+        hvstate->hdsisr = env->spr[SPR_HDSISR];
+        hvstate->asdr = env->spr[SPR_ASDR];
+    } else if (excp == POWERPC_EXCP_HISI) {
+        hvstate->asdr = env->spr[SPR_ASDR];
+    }
+
+//    hvstate->heir = env->spr[SPR_HEIR]; XXX HEIR?
+    hvstate->srr0 = env->spr[SPR_SRR0];
+    hvstate->srr1 = env->spr[SPR_SRR1];
+    hvstate->sprg[0] = env->spr[SPR_SPRG0];
+    hvstate->sprg[1] = env->spr[SPR_SPRG1];
+    hvstate->sprg[2] = env->spr[SPR_SPRG2];
+    hvstate->sprg[3] = env->spr[SPR_SPRG3];
+    hvstate->pidr = env->spr[SPR_BOOKS_PID];
+    hvstate->ppr = env->spr[SPR_PPR];
+
+    cpu_physical_memory_unmap(hvstate, len, 0 /* read */, len /* access len 
*/);
+
+    len = sizeof(*regs);
+    regs = cpu_physical_memory_map(regs_ptr, &len, true);
+    if (!regs || len != sizeof(*regs)) {
+        r3_return = H_P2;
+        goto out_restore_l1;
+    }
+
+    len = sizeof(env->gpr);
+    assert(len == sizeof(regs->gpr));
+    memcpy(regs->gpr, env->gpr, len);
+
+    regs->link = env->lr;
+    regs->ctr = env->ctr;
+    regs->xer = cpu_read_xer(env);
+
+    regs->ccr = 0;
+    for (i = 0; i < 8; i++) {
+        regs->ccr |= (env->crf[i] & 15) << (4 * (7 - i));
+    }
+
+    if (excp == POWERPC_EXCP_MCHECK ||
+        excp == POWERPC_EXCP_RESET ||
+        excp == POWERPC_EXCP_SYSCALL) {
+        regs->nip = env->spr[SPR_SRR0];
+        regs->msr = env->spr[SPR_SRR1];
+    } else {
+        regs->nip = env->spr[SPR_HSRR0];
+        regs->msr = env->spr[SPR_HSRR1];
+    }
+    // XXX must msr be masked?
+
+    cpu_physical_memory_unmap(regs, len, 0 /* read */, len /* access len */);
+
+out_restore_l1:
+    memcpy(env->gpr, cpu->nested_host_state->gpr, sizeof(env->gpr));
+    env->lr = cpu->nested_host_state->lr;
+    env->ctr = cpu->nested_host_state->ctr;
+    memcpy(env->crf, cpu->nested_host_state->crf, sizeof(env->crf));
+    env->cfar = cpu->nested_host_state->cfar;
+    env->xer = cpu->nested_host_state->xer;
+    env->so = cpu->nested_host_state->so;
+    env->ov = cpu->nested_host_state->ov;
+    env->ov32 = cpu->nested_host_state->ov32;
+    env->ca32 = cpu->nested_host_state->ca32;
+    env->msr = cpu->nested_host_state->msr;
+    env->nip = cpu->nested_host_state->nip;
+
+    assert(env->spr[SPR_LPIDR] != 0);
+    env->spr[SPR_LPCR] = cpu->nested_host_state->spr[SPR_LPCR];
+    env->spr[SPR_LPIDR] = cpu->nested_host_state->spr[SPR_LPIDR];
+    env->spr[SPR_PCR] = cpu->nested_host_state->spr[SPR_PCR];
+//    env->spr[SPR_AMOR] = cpu->nested_host_state->spr[SPR_AMOR];
+    env->spr[SPR_DPDES] = 0;
+    env->spr[SPR_HFSCR] = cpu->nested_host_state->spr[SPR_HFSCR];
+//  DAWRetc, CIABR, [S]PURR, IC
+    env->spr[SPR_VTB] = cpu->nested_host_state->spr[SPR_VTB];
+//    env->spr[SPR_HEIR] = hv_state.heir;
+    env->spr[SPR_SRR0] = cpu->nested_host_state->spr[SPR_SRR0];
+    env->spr[SPR_SRR1] = cpu->nested_host_state->spr[SPR_SRR1];
+    env->spr[SPR_SPRG0] = cpu->nested_host_state->spr[SPR_SPRG0];
+    env->spr[SPR_SPRG1] = cpu->nested_host_state->spr[SPR_SPRG1];
+    env->spr[SPR_SPRG2] = cpu->nested_host_state->spr[SPR_SPRG2];
+    env->spr[SPR_SPRG3] = cpu->nested_host_state->spr[SPR_SPRG3];
+    env->spr[SPR_BOOKS_PID] = cpu->nested_host_state->spr[SPR_BOOKS_PID];
+    env->spr[SPR_PPR] = cpu->nested_host_state->spr[SPR_PPR];
+
+    g_free(cpu->nested_host_state);
+    cpu->nested_host_state = NULL;
+
+    /*
+     * Return the interrupt vector address from H_ENTER_NESTED to the L1
+     * (or error code).
+     */
+    env->gpr[3] = r3_return;
+
+    hreg_compute_hflags(env);
+    tlb_flush(cs);
+}
+
  static void hypercall_register_types(void)
  {
      hypercall_register_softmmu();
@@ -1552,6 +1864,10 @@ static void hypercall_register_types(void)
      spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support);
spapr_register_hypercall(KVMPPC_H_UPDATE_DT, h_update_dt);
+
+    spapr_register_hypercall(KVMPPC_H_SET_PARTITION_TABLE, h_set_ptbl);
+    spapr_register_hypercall(KVMPPC_H_TLB_INVALIDATE, h_tlb_invalidate);
+    spapr_register_hypercall(KVMPPC_H_COPY_TOFROM_GUEST, h_copy_tofrom_guest);
  }
type_init(hypercall_register_types)
diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h
index 93e614cffd..fcf9e495a0 100644
--- a/include/hw/ppc/ppc.h
+++ b/include/hw/ppc/ppc.h
@@ -54,6 +54,9 @@ struct ppc_tb_t {
uint64_t cpu_ppc_get_tb(ppc_tb_t *tb_env, uint64_t vmclk, int64_t tb_offset);
  clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq);
+void cpu_ppc_hdecr_init (CPUPPCState *env);
+void cpu_ppc_hdecr_exit (CPUPPCState *env);
+
  /* Embedded PowerPC DCR management */
  typedef uint32_t (*dcr_read_cb)(void *opaque, int dcrn);
  typedef void (*dcr_write_cb)(void *opaque, int dcrn, uint32_t val);
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index ee7504b976..3a9fa4c024 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -197,6 +197,9 @@ struct SpaprMachineState {
      bool has_graphics;
      uint32_t vsmt;       /* Virtual SMT mode (KVM's "core stride") */
+ /* Nested HV support (TCG only) */
+    uint64_t nested_ptcr;
+
      Notifier epow_notifier;
      QTAILQ_HEAD(, SpaprEventLogEntry) pending_events;
      bool use_hotplug_event_source;
@@ -577,7 +580,14 @@ struct SpaprMachineState {
  #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
  /* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
  #define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
-#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
+
+/* Platform-specific hcalls used for nested HV KVM */
+#define KVMPPC_H_SET_PARTITION_TABLE   (KVMPPC_HCALL_BASE + 0x800)
+#define KVMPPC_H_ENTER_NESTED          (KVMPPC_HCALL_BASE + 0x804)
+#define KVMPPC_H_TLB_INVALIDATE        (KVMPPC_HCALL_BASE + 0x808)
+#define KVMPPC_H_COPY_TOFROM_GUEST     (KVMPPC_HCALL_BASE + 0x80C)
+
+#define KVMPPC_HCALL_MAX        KVMPPC_H_COPY_TOFROM_GUEST
/*
   * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
@@ -587,6 +597,65 @@ struct SpaprMachineState {
  #define SVM_H_TPM_COMM              0xEF10
  #define SVM_HCALL_MAX               SVM_H_TPM_COMM
+/*
+ * Register state for entering a nested guest with H_ENTER_NESTED.
+ * New member must be added at the end.
+ */
+struct kvmppc_hv_guest_state {
+    uint64_t version;        /* version of this structure layout, must be 
first */
+    uint32_t lpid;
+    uint32_t vcpu_token;
+    /* These registers are hypervisor privileged (at least for writing) */
+    uint64_t lpcr;
+    uint64_t pcr;
+    uint64_t amor;
+    uint64_t dpdes;
+    uint64_t hfscr;
+    int64_t tb_offset;
+    uint64_t dawr0;
+    uint64_t dawrx0;
+    uint64_t ciabr;
+    uint64_t hdec_expiry;
+    uint64_t purr;
+    uint64_t spurr;
+    uint64_t ic;
+    uint64_t vtb;
+    uint64_t hdar;
+    uint64_t hdsisr;
+    uint64_t heir;
+    uint64_t asdr;
+    /* These are OS privileged but need to be set late in guest entry */
+    uint64_t srr0;
+    uint64_t srr1;
+    uint64_t sprg[4];
+    uint64_t pidr;
+    uint64_t cfar;
+    uint64_t ppr;
+    /* Version 1 ends here */
+    uint64_t dawr1;
+    uint64_t dawrx1;
+    /* Version 2 ends here */
+};
+
+/* Latest version of hv_guest_state structure */
+#define HV_GUEST_STATE_VERSION  2
+
+/* 64-bit powerpc pt_regs struct used by nested HV */
+struct kvmppc_pt_regs {
+    uint64_t gpr[32];
+    uint64_t nip;
+    uint64_t msr;
+    uint64_t orig_gpr3;    /* Used for restarting system calls */
+    uint64_t ctr;
+    uint64_t link;
+    uint64_t xer;
+    uint64_t ccr;
+    uint64_t softe;        /* Soft enabled/disabled */
+    uint64_t trap;         /* Reason for being here */
+    uint64_t dar;          /* Fault registers */
+    uint64_t dsisr;        /* on 4xx/Book-E used for ESR */
+    uint64_t result;       /* Result of a system call */
+};
typedef struct SpaprDeviceTreeUpdateHeader {
      uint32_t version_id;
@@ -604,6 +673,10 @@ typedef target_ulong (*spapr_hcall_fn)(PowerPCCPU *cpu, 
SpaprMachineState *sm,
  void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn);
  target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
                               target_ulong *args);
+
+void spapr_enter_nested(PowerPCCPU *cpu);
+void spapr_exit_nested(PowerPCCPU *cpu, int excp);
+
  target_ulong softmmu_resize_hpt_prepare(PowerPCCPU *cpu, SpaprMachineState 
*spapr,
                                           target_ulong shift);
  target_ulong softmmu_resize_hpt_commit(PowerPCCPU *cpu, SpaprMachineState 
*spapr,
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index dcd83b503c..1806a8e776 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1306,6 +1306,9 @@ struct PowerPCCPU {
      bool pre_2_10_migration;
      bool pre_3_0_migration;
      int32_t mig_slb_nr;
+
+    bool in_spapr_nested;
+    CPUPPCState *nested_host_state;
  };
@@ -1316,6 +1319,9 @@ PowerPCCPUClass *ppc_cpu_get_family_class(PowerPCCPUClass *pcc);
  #ifndef CONFIG_USER_ONLY
  struct PPCVirtualHypervisorClass {
      InterfaceClass parent;
+    bool (*cpu_in_nested)(PowerPCCPU *cpu);
+    target_ulong (*get_nested_ptcr)(PowerPCCPU *cpu, target_ulong lpid);
+    void (*exit_nested)(PowerPCCPU *cpu, int excp);
      void (*hypercall)(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu);
      hwaddr (*hpt_mask)(PPCVirtualHypervisor *vhyp);
      const ppc_hash_pte64_t *(*map_hptes)(PPCVirtualHypervisor *vhyp,
diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index c107953dec..239c253dbc 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -361,8 +361,8 @@ static void ppc_excp_apply_ail(PowerPCCPU *cpu, int 
excp_model, int excp,
  #endif
  }
-static void powerpc_set_excp_state(PowerPCCPU *cpu,
-                                          target_ulong vector, target_ulong 
msr)
+static void powerpc_set_excp_state(PowerPCCPU *cpu, int excp,
+                                   target_ulong vector, target_ulong msr)
  {
      CPUState *cs = CPU(cpu);
      CPUPPCState *env = &cpu->env;
@@ -375,9 +375,17 @@ static void powerpc_set_excp_state(PowerPCCPU *cpu,
       * will prevent setting of the HV bit which some exceptions might need
       * to do.
       */
-    env->msr = msr & env->msr_mask;
-    hreg_compute_hflags(env);
-    env->nip = vector;
+    if (cpu->vhyp && cpu->in_spapr_nested && (msr & MSR_HVB)) {
+        PPCVirtualHypervisorClass *vhc =
+            PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
+        // Deliver interrupt to L1 by returning from the H_ENTER_NESTED call
+        vhc->exit_nested(cpu, excp);
+    } else {
+        env->nip = vector;
+        env->msr = msr & env->msr_mask;
+        hreg_compute_hflags(env);
+    }
+
      /* Reset exception state */
      cs->exception_index = POWERPC_EXCP_NONE;
      env->error_code = 0;
@@ -548,7 +556,7 @@ static void powerpc_excp_40x(PowerPCCPU *cpu, int excp)
      /* Save MSR */
      env->spr[srr1] = msr;
- powerpc_set_excp_state(cpu, vector, new_msr);
+    powerpc_set_excp_state(cpu, excp, vector, new_msr);
  }
static void powerpc_excp_74xx(PowerPCCPU *cpu, int excp)
@@ -742,7 +750,7 @@ static void powerpc_excp_74xx(PowerPCCPU *cpu, int excp)
      /* Save MSR */
      env->spr[SPR_SRR1] = msr;
- powerpc_set_excp_state(cpu, vector, new_msr);
+    powerpc_set_excp_state(cpu, excp, vector, new_msr);
  }
#ifdef TARGET_PPC64
@@ -916,7 +924,7 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
          env->nip += 4;
/* "PAPR mode" built-in hypercall emulation */
-        if ((lev == 1) && cpu->vhyp) {
+        if ((lev == 1) && cpu->vhyp && !cpu->in_spapr_nested) {

A helper for the (cpu->vhyp && !cpu->in_spapr_nested) check would help.

              PPCVirtualHypervisorClass *vhc =
                  PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
              vhc->hypercall(cpu->vhyp, cpu);
@@ -1004,18 +1012,6 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
          break;
      }
- /* Sanity check */
-    if (!(env->msr_mask & MSR_HVB)) {
-        if (new_msr & MSR_HVB) {
-            cpu_abort(cs, "Trying to deliver HV exception (MSR) %d with "
-                      "no HV support\n", excp);
-        }
-        if (srr0 == SPR_HSRR0) {
-            cpu_abort(cs, "Trying to deliver HV exception (HSRR) %d with "
-                      "no HV support\n", excp);
-        }
-    }
-
      /*
       * Sort out endianness of interrupt, this differs depending on the
       * CPU, the HV mode, etc...
@@ -1037,7 +1033,19 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
      /* This can update new_msr and vector if AIL applies */
      ppc_excp_apply_ail(cpu, excp_model, excp, msr, &new_msr, &vector);
- powerpc_set_excp_state(cpu, vector, new_msr);
+    powerpc_set_excp_state(cpu, excp, vector, new_msr);
+
+    /* Sanity check */
+    if (!(env->msr_mask & MSR_HVB)) {
+        if (env->msr & MSR_HVB) {
+            cpu_abort(cs, "Trying to deliver HV exception (MSR) %d with "
+                      "no HV support\n", excp);
+        }
+        if (0 && srr0 == SPR_HSRR0) {
+            cpu_abort(cs, "Trying to deliver HV exception (HSRR) %d with "
+                      "no HV support\n", excp);
+        }
+    }
  }
  #else
  static inline void powerpc_excp_books(PowerPCCPU *cpu, int excp)
@@ -1517,7 +1525,7 @@ static inline void powerpc_excp_legacy(PowerPCCPU *cpu, 
int excp)
      /* This can update new_msr and vector if AIL applies */
      ppc_excp_apply_ail(cpu, excp_model, excp, msr, &new_msr, &vector);
- powerpc_set_excp_state(cpu, vector, new_msr);
+    powerpc_set_excp_state(cpu, excp, vector, new_msr);
  }
static void powerpc_excp(PowerPCCPU *cpu, int excp)
@@ -1613,7 +1621,11 @@ static void ppc_hw_interrupt(CPUPPCState *env)
          /* HEIC blocks delivery to the hypervisor */
          if ((async_deliver && !(heic && msr_hv && !msr_pr)) ||
              (env->has_hv_mode && msr_hv == 0 && !lpes0)) {
-            powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
+            if (cpu->in_spapr_nested) {
+                powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
+            } else {
+                powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
+            }
              return;
          }
      }
@@ -1723,7 +1735,7 @@ void ppc_cpu_do_fwnmi_machine_check(CPUState *cs, 
target_ulong vector)
          msr |= (1ULL << MSR_LE);
      }
- powerpc_set_excp_state(cpu, vector, msr);
+    powerpc_set_excp_state(cpu, POWERPC_EXCP_MCHECK, vector, msr);
  }
bool ppc_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
diff --git a/target/ppc/helper_regs.c b/target/ppc/helper_regs.c
index 5b12cb03c9..7f92606522 100644
--- a/target/ppc/helper_regs.c
+++ b/target/ppc/helper_regs.c
@@ -163,6 +163,7 @@ static uint32_t hreg_compute_hflags_value(CPUPPCState *env)
          immu_idx |= msr & (1 << MSR_IS) ? 2 : 0;
          dmmu_idx |= msr & (1 << MSR_DS) ? 2 : 0;
      } else {
+        /* Could have nested IDX instead of HV to avoid tlb flush on nested 
enter/exit? */

yes.

          dmmu_idx |= msr & (1ull << MSR_HV) ? 4 : 0;
          immu_idx = dmmu_idx;
          immu_idx |= msr & (1 << MSR_IR) ? 0 : 2;
diff --git a/target/ppc/mmu-book3s-v3.c b/target/ppc/mmu-book3s-v3.c
index f4985bae78..0810be3668 100644
--- a/target/ppc/mmu-book3s-v3.c
+++ b/target/ppc/mmu-book3s-v3.c
@@ -25,8 +25,23 @@
bool ppc64_v3_get_pate(PowerPCCPU *cpu, target_ulong lpid, ppc_v3_pate_t *entry)
  {
-    uint64_t patb = cpu->env.spr[SPR_PTCR] & PTCR_PATB;
-    uint64_t pats = cpu->env.spr[SPR_PTCR] & PTCR_PATS;
+    uint64_t patb, pats;
+
+    if (cpu->vhyp) {
+        PPCVirtualHypervisorClass *vhc =
+            PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
+        target_ulong nested_ptcr;
+
+        assert(cpu->in_spapr_nested);
+
+        nested_ptcr = vhc->get_nested_ptcr(cpu, lpid);
+
+        patb = nested_ptcr & PTCR_PATB;
+        pats = nested_ptcr & PTCR_PATS;
+    } else {
+        patb = cpu->env.spr[SPR_PTCR] & PTCR_PATB;
+        pats = cpu->env.spr[SPR_PTCR] & PTCR_PATS;
+    }
/* Calculate number of entries */
      pats = 1ull << (pats + 12 - 4);
@@ -38,5 +53,6 @@ bool ppc64_v3_get_pate(PowerPCCPU *cpu, target_ulong lpid, 
ppc_v3_pate_t *entry)
      patb += 16 * lpid;
      entry->dw0 = ldq_phys(CPU(cpu)->as, patb);
      entry->dw1 = ldq_phys(CPU(cpu)->as, patb + 8);
+
      return true;
  }
diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
index 54fb3ce98d..6304a23d05 100644
--- a/target/ppc/mmu-radix64.c
+++ b/target/ppc/mmu-radix64.c
@@ -304,7 +304,7 @@ static bool validate_pate(PowerPCCPU *cpu, uint64_t lpid, 
ppc_v3_pate_t *pate)
      if (!(pate->dw0 & PATE0_HR)) {
          return false;
      }
-    if (lpid == 0 && !msr_hv) {
+    if (lpid == 0 && (!msr_hv && !(cpu->vhyp && cpu->in_spapr_nested))) {
          return false;
      }
      if ((pate->dw0 & PATE1_R_PRTS) < 5) {
@@ -336,6 +336,7 @@ static int ppc_radix64_partition_scoped_xlate(PowerPCCPU 
*cpu,
                    g_raddr);
*h_page_size = PRTBE_R_GET_RTS(pate.dw0);
+
      /* No valid pte or access denied due to protection */
      if (ppc_radix64_walk_tree(CPU(cpu)->as, g_raddr, pate.dw0 & PRTBE_R_RPDB,
                                pate.dw0 & PRTBE_R_RPDS, h_raddr, h_page_size,
@@ -389,7 +390,7 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu,
      }
      prtbe_addr = (pate.dw1 & PATE1_R_PRTB) + offset;
- if (cpu->vhyp) {
+    if (cpu->vhyp && !cpu->in_spapr_nested) {
          prtbe0 = ldq_phys(cs->as, prtbe_addr);
      } else {
          /*
@@ -415,7 +416,7 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu,
      *g_page_size = PRTBE_R_GET_RTS(prtbe0);
      base_addr = prtbe0 & PRTBE_R_RPDB;
      nls = prtbe0 & PRTBE_R_RPDS;
-    if (msr_hv || cpu->vhyp) {
+    if (msr_hv || (cpu->vhyp && !cpu->in_spapr_nested)) {
          /*
           * Can treat process table addresses as real addresses
           */
@@ -519,7 +520,7 @@ static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr 
eaddr,
      relocation = !mmuidx_real(mmu_idx);
/* HV or virtual hypervisor Real Mode Access */
-    if (!relocation && (mmuidx_hv(mmu_idx) || cpu->vhyp)) {
+    if (!relocation && (mmuidx_hv(mmu_idx) || (cpu->vhyp && 
!cpu->in_spapr_nested))) {
          /* In real mode top 4 effective addr bits (mostly) ignored */
          *raddr = eaddr & 0x0FFFFFFFFFFFFFFFULL;
@@ -553,7 +554,7 @@ static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr eaddr,
      }
/* Get Process Table */
-    if (cpu->vhyp) {
+    if (cpu->vhyp && !cpu->in_spapr_nested) {
          PPCVirtualHypervisorClass *vhc;
          vhc = PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
          vhc->get_pate(cpu->vhyp, &pate);
@@ -596,7 +597,7 @@ static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr 
eaddr,
          g_raddr = eaddr & R_EADDR_MASK;
      }
- if (cpu->vhyp) {
+    if (cpu->vhyp && !cpu->in_spapr_nested) {
          *raddr = g_raddr;
      } else {
          /*
@@ -604,7 +605,7 @@ static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr 
eaddr,
           * quadrants 1 or 2. Translates a guest real address to a host
           * real address.
           */
-        if (lpid || !mmuidx_hv(mmu_idx)) {
+        if (lpid || !mmuidx_hv(mmu_idx) || cpu->in_spapr_nested) {
              int ret;
ret = ppc_radix64_partition_scoped_xlate(cpu, access_type, eaddr,





reply via email to

[Prev in Thread] Current Thread [Next in Thread]