Re: [Qemu-devel] [PATCH] exec: limit system memory size


From: Michael S. Tsirkin
Subject: Re: [Qemu-devel] [PATCH] exec: limit system memory size
Date: Mon, 4 Nov 2013 13:14:58 +0200

On Mon, Nov 04, 2013 at 11:54:34AM +0100, Paolo Bonzini wrote:
> On 04/11/2013 11:07, Michael S. Tsirkin wrote:
> > On Mon, Nov 04, 2013 at 11:50:05AM +0200, Marcel Apfelbaum wrote:
> >> On Mon, 2013-11-04 at 08:06 +0200, Michael S. Tsirkin wrote:
> >>> The page table logic in exec.c assumes
> >>> that memory addresses are at most TARGET_PHYS_ADDR_SPACE_BITS.
> >>>
> >>> But PCI addresses are a full 64 bits, so if we try to render them ignoring
> >>> the extra bits, we get strange effects with sections overlapping each
> >>> other.
> >>>
> >>> To fix, simply limit the system memory size to
> >>>  1 << TARGET_PHYS_ADDR_SPACE_BITS;
> >>> PCI addresses will be rendered within that.
> >>>
> >>> Signed-off-by: Michael S. Tsirkin <address@hidden>
> >>> ---
> >>>  exec.c | 6 +++++-
> >>>  1 file changed, 5 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/exec.c b/exec.c
> >>> index 030118e..c7a8df5 100644
> >>> --- a/exec.c
> >>> +++ b/exec.c
> >>> @@ -1801,7 +1801,12 @@ void address_space_destroy_dispatch(AddressSpace *as)
> >>>  static void memory_map_init(void)
> >>>  {
> >>>      system_memory = g_malloc(sizeof(*system_memory));
> >>> -    memory_region_init(system_memory, NULL, "system", INT64_MAX);
> >>> +
> >>> +    assert(TARGET_PHYS_ADDR_SPACE_BITS <= 64);
> >>> +
> >>> +    memory_region_init(system_memory, NULL, "system",
> >>> +                       TARGET_PHYS_ADDR_SPACE_BITS == 64 ?
> >>> +                       UINT64_MAX : (0x1ULL << TARGET_PHYS_ADDR_SPACE_BITS));
> >>
> >> Michael, thanks again for the help.
> >>
> >> I am concerned that we cannot use the full UINT64_MAX
> >> address space.
> > 
> > Well, exec isn't ready for this; it expects at most
> > TARGET_PHYS_ADDR_SPACE_BITS.
> > Fortunately there's no way for the CPU to initiate I/O outside
> > this area.
> > 
> > So this is another place where device-to-device I/O
> > would be broken.
> 
> However, firmware that places BARs where the CPU cannot access them
> would be "interesting" to say the least.  This applies both to
> device-to-device I/O and to non-aligned <4K BARs.
> 
> This patch looks good; however, on top of it can you test 
> kvm-unit-tests with TARGET_PHYS_ADDR_SPACE_BITS=64 and see whether 
> there is a measurable slowdown (in the inl_from_qemu tests)?  If not, 
> we can just get rid of TARGET_PHYS_ADDR_SPACE_BITS in exec.c.

I'd rather we fixed the bug first - we need to fix it on stable too - any
cleanups can come on top.  Also, I'm not sure what this test will tell
us: inl reads I/O space, not memory, right?
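
To make the bug concrete, here is a minimal standalone sketch - illustrative
only, not QEMU code. PHYS_BITS and PAGE_BITS stand in for
TARGET_PHYS_ADDR_SPACE_BITS and TARGET_PAGE_BITS, and exec.c gets the same
effect from its page table depth rather than an explicit mask:

#include <stdint.h>
#include <stdio.h>

#define PHYS_BITS 40   /* stand-in for TARGET_PHYS_ADDR_SPACE_BITS */
#define PAGE_BITS 12   /* stand-in for TARGET_PAGE_BITS */

/* The leaf index decodes only the low PHYS_BITS of the address. */
static uint64_t leaf_index(uint64_t addr)
{
    return (addr & ((1ULL << PHYS_BITS) - 1)) >> PAGE_BITS;
}

int main(void)
{
    uint64_t low  = 0xfe000000ULL;       /* a BAR below 4G */
    uint64_t high = low | (1ULL << 62);  /* a 64-bit BAR, same low bits */

    /* Prints 1: both addresses land on the same leaf entry,
     * i.e. the two sections overlap. */
    printf("%d\n", leaf_index(low) == leaf_index(high));
    return 0;
}

With the patch, the system memory region never extends past
1 << TARGET_PHYS_ADDR_SPACE_BITS, so nothing gets rendered at the
aliasing addresses in the first place.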

> 
> Note that L2_BITS is shared between translate-all.c and exec.c only for 
> historical reasons (the translate-all.c code used to be in exec.c).  It 
> is probably a good idea to split them like this:

Want to test this and post properly?

> diff --git a/exec.c b/exec.c
> index 2e31ffc..3faea0e 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -88,7 +88,15 @@ struct PhysPageEntry {
>      uint16_t ptr : 15;
>  };
>  
> -typedef PhysPageEntry Node[L2_SIZE];
> +/* Size of the L2 (and L3, etc) page tables.  */
> +#define ADDR_SPACE_BITS 64
> +
> +#define P_L2_BITS 10
> +#define P_L2_SIZE (1 << P_L2_BITS)
> +
> +#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
> +
> +typedef PhysPageEntry Node[P_L2_SIZE];
>  
>  struct AddressSpaceDispatch {
>      /* This is a multi-level map on the physical address space.
> @@ -155,7 +163,7 @@ static uint16_t phys_map_node_alloc(void)
>      ret = next_map.nodes_nb++;
>      assert(ret != PHYS_MAP_NODE_NIL);
>      assert(ret != next_map.nodes_nb_alloc);
> -    for (i = 0; i < L2_SIZE; ++i) {
> +    for (i = 0; i < P_L2_SIZE; ++i) {
>          next_map.nodes[ret][i].is_leaf = 0;
>          next_map.nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>      }
> @@ -168,13 +176,13 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>  {
>      PhysPageEntry *p;
>      int i;
> -    hwaddr step = (hwaddr)1 << (level * L2_BITS);
> +    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
>  
>      if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
>          lp->ptr = phys_map_node_alloc();
>          p = next_map.nodes[lp->ptr];
>          if (level == 0) {
> -            for (i = 0; i < L2_SIZE; i++) {
> +            for (i = 0; i < P_L2_SIZE; i++) {
>                  p[i].is_leaf = 1;
>                  p[i].ptr = PHYS_SECTION_UNASSIGNED;
>              }
> @@ -182,9 +190,9 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>      } else {
>          p = next_map.nodes[lp->ptr];
>      }
> -    lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
> +    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
>  
> -    while (*nb && lp < &p[L2_SIZE]) {
> +    while (*nb && lp < &p[P_L2_SIZE]) {
>          if ((*index & (step - 1)) == 0 && *nb >= step) {
>              lp->is_leaf = true;
>              lp->ptr = leaf;
> @@ -218,7 +226,7 @@ static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr index,
>              return &sections[PHYS_SECTION_UNASSIGNED];
>          }
>          p = nodes[lp.ptr];
> -        lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
> +        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
>      }
>      return &sections[lp.ptr];
>  }
> diff --git a/translate-all.c b/translate-all.c
> index aeda54d..1c63d78 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -96,12 +96,16 @@ typedef struct PageDesc {
>  # define L1_MAP_ADDR_SPACE_BITS  TARGET_VIRT_ADDR_SPACE_BITS
>  #endif
>  
> +/* Size of the L2 (and L3, etc) page tables.  */
> +#define V_L2_BITS 10
> +#define V_L2_SIZE (1 << V_L2_BITS)
> +
>  /* The bits remaining after N lower levels of page tables.  */
>  #define V_L1_BITS_REM \
> -    ((L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % L2_BITS)
> +    ((L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % V_L2_BITS)
>  
>  #if V_L1_BITS_REM < 4
> -#define V_L1_BITS  (V_L1_BITS_REM + L2_BITS)
> +#define V_L1_BITS  (V_L1_BITS_REM + V_L2_BITS)
>  #else
>  #define V_L1_BITS  V_L1_BITS_REM
>  #endif
> @@ -395,18 +399,18 @@ static PageDesc *page_find_alloc(tb_page_addr_t index, int alloc)
>      lp = l1_map + ((index >> V_L1_SHIFT) & (V_L1_SIZE - 1));
>  
>      /* Level 2..N-1.  */
> -    for (i = V_L1_SHIFT / L2_BITS - 1; i > 0; i--) {
> +    for (i = V_L1_SHIFT / V_L2_BITS - 1; i > 0; i--) {
>          void **p = *lp;
>  
>          if (p == NULL) {
>              if (!alloc) {
>                  return NULL;
>              }
> -            ALLOC(p, sizeof(void *) * L2_SIZE);
> +            ALLOC(p, sizeof(void *) * V_L2_SIZE);
>              *lp = p;
>          }
>  
> -        lp = p + ((index >> (i * L2_BITS)) & (L2_SIZE - 1));
> +        lp = p + ((index >> (i * V_L2_BITS)) & (V_L2_SIZE - 1));
>      }
>  
>      pd = *lp;
> @@ -414,13 +418,13 @@ static PageDesc *page_find_alloc(tb_page_addr_t index, int alloc)
>          if (!alloc) {
>              return NULL;
>          }
> -        ALLOC(pd, sizeof(PageDesc) * L2_SIZE);
> +        ALLOC(pd, sizeof(PageDesc) * V_L2_SIZE);
>          *lp = pd;
>      }
>  
>  #undef ALLOC
>  
> -    return pd + (index & (L2_SIZE - 1));
> +    return pd + (index & (V_L2_SIZE - 1));
>  }
>  
>  static inline PageDesc *page_find(tb_page_addr_t index)
> @@ -655,14 +659,14 @@ static void page_flush_tb_1(int level, void **lp)
>      if (level == 0) {
>          PageDesc *pd = *lp;
>  
> -        for (i = 0; i < L2_SIZE; ++i) {
> +        for (i = 0; i < V_L2_SIZE; ++i) {
>              pd[i].first_tb = NULL;
>              invalidate_page_bitmap(pd + i);
>          }
>      } else {
>          void **pp = *lp;
>  
> -        for (i = 0; i < L2_SIZE; ++i) {
> +        for (i = 0; i < V_L2_SIZE; ++i) {
>              page_flush_tb_1(level - 1, pp + i);
>          }
>      }
> @@ -673,7 +677,7 @@ static void page_flush_tb(void)
>      int i;
>  
>      for (i = 0; i < V_L1_SIZE; i++) {
> -        page_flush_tb_1(V_L1_SHIFT / L2_BITS - 1, l1_map + i);
> +        page_flush_tb_1(V_L1_SHIFT / V_L2_BITS - 1, l1_map + i);
>      }
>  }
>  
> @@ -1600,7 +1604,7 @@ static int walk_memory_regions_1(struct walk_memory_regions_data *data,
>      if (level == 0) {
>          PageDesc *pd = *lp;
>  
> -        for (i = 0; i < L2_SIZE; ++i) {
> +        for (i = 0; i < V_L2_SIZE; ++i) {
>              int prot = pd[i].flags;
>  
>              pa = base | (i << TARGET_PAGE_BITS);
> @@ -1614,9 +1618,9 @@ static int walk_memory_regions_1(struct walk_memory_regions_data *data,
>      } else {
>          void **pp = *lp;
>  
> -        for (i = 0; i < L2_SIZE; ++i) {
> +        for (i = 0; i < V_L2_SIZE; ++i) {
>              pa = base | ((abi_ulong)i <<
> -                (TARGET_PAGE_BITS + L2_BITS * level));
> +                (TARGET_PAGE_BITS + V_L2_BITS * level));
>              rc = walk_memory_regions_1(data, pa, level - 1, pp + i);
>              if (rc != 0) {
>                  return rc;
> @@ -1639,7 +1643,7 @@ int walk_memory_regions(void *priv, walk_memory_regions_fn fn)
>  
>      for (i = 0; i < V_L1_SIZE; i++) {
>          int rc = walk_memory_regions_1(&data, (abi_ulong)i << V_L1_SHIFT,
> -                                       V_L1_SHIFT / L2_BITS - 1, l1_map + i);
> +                                       V_L1_SHIFT / V_L2_BITS - 1, l1_map + i);
>  
>          if (rc != 0) {
>              return rc;
> diff --git a/translate-all.h b/translate-all.h
> index 5c38819..f7e5932 100644
> --- a/translate-all.h
> +++ b/translate-all.h
> @@ -19,13 +19,6 @@
>  #ifndef TRANSLATE_ALL_H
>  #define TRANSLATE_ALL_H
>  
> -/* Size of the L2 (and L3, etc) page tables.  */
> -#define L2_BITS 10
> -#define L2_SIZE (1 << L2_BITS)
> -
> -#define P_L2_LEVELS \
> -    (((TARGET_PHYS_ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / L2_BITS) + 1)
> -
>  /* translate-all.c */
>  void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len);
>  void cpu_unlink_tb(CPUState *cpu);
> 
> Paolo
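
For reference, plugging numbers into the P_L2_LEVELS formula above, and
assuming 4K pages (TARGET_PAGE_BITS == 12): a 64-bit address space needs
((64 - 12 - 1) / 10) + 1 = 6 levels of 10-bit tables, versus
((40 - 12 - 1) / 10) + 1 = 3 levels for a 40-bit one - that extra lookup
depth is the potential slowdown Paolo wants measured. A compile-time
sanity check of the arithmetic (C11 _Static_assert):

#define PAGE_BITS 12   /* assumed TARGET_PAGE_BITS */
#define P_L2_BITS 10   /* as in the patch above */
#define LEVELS(bits) ((((bits) - PAGE_BITS - 1) / P_L2_BITS) + 1)
_Static_assert(LEVELS(64) == 6, "64-bit space: 6 levels");
_Static_assert(LEVELS(40) == 3, "40-bit space: 3 levels");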


