qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [RFC PATCH 1/2] mem: make phys_section and phys_map_nodes prepared for RCU


From: Paolo Bonzini
Subject: Re: [Qemu-devel] [RFC PATCH 1/2] mem: make phys_section and phys_map_nodes prepared for RCU
Date: Tue, 14 May 2013 11:27:17 +0200
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20130311 Thunderbird/17.0.4

Il 14/05/2013 05:38, liu ping fan ha scritto:
>>> +struct PhysPageTable {
>>> +    int ref;
>>> +    PhysSection *phys_sections;
>>> +    unsigned phys_sections_nb;
>>> +    unsigned phys_sections_nb_alloc;
>>> +    uint16_t phys_section_unassigned;
>>> +    uint16_t phys_section_notdirty;
>>> +    uint16_t phys_section_rom;
>>> +    uint16_t phys_section_watch;
>>
>> These four should be constants.  Please make them #defines and add
>> assertions that they have the expected values (in a separate patch).
>>
> Things like the following?
> #define  phys_section_unassigned 0
> t = dummy_section(&io_mem_unassigned);
> assert(t ==phys_section_unassigned );

Yes.

>>> +
>>> +    Node *phys_map_nodes;
>>> +    unsigned phys_map_nodes_nb;
>>> +    unsigned phys_map_nodes_nb_alloc;
>>> +};
>>> +static PhysPageTable *cur_pgtbl;
>>> +static PhysPageTable *next_pgtbl;
>>
>> You shouldn't need cur_pgtbl.  Instead, each AddressSpaceDispatch should
>> have a pointer to its own cur_pgtbl.  In the commit hook you can then
>> take a lock, unref the old table, assign cur_pgtbl = next_pgtbl and ref
>> the new one.
>>
> Here is a trick. All AddressSpaceDispatch instances will update their
> radix trees at the same time, so the reclaimer will come after all of
> them commit, to drop the old phys_map_node[] which holds the radix trees
> all at once. Is that OK? Or do you still prefer that each
> AddressSpaceDispatch have its own space for its radix tree?

I'm not sure... I prefer to have more stuff in AddressSpaceDispatch to
make things as streamlined as possible on the read side.

>> The lock must also be taken in address_space_translate, together with a
>> relatively expensive ref/unref pair (two atomic operations).  We
>> probably need real RCU so that we can skip the ref/unref.
>>
> Currently, without RCU, do we rely on as->lock? So there is no need to
> ref/unref cur_pgtbl?  With RCU, I think we need ref/unref on mr, since
> mr->ops->read/write can block, and should be excluded from the RCU
> section.

Yes, we need ref/unref on the region, but in many cases that will be a
no-op (most important, for access to system RAM).  And if we can do the
dispatch in the RCU section, similar to what I proposed in reviewing
your patch 2, we have no ref/unref in the common case of accessing RAM.

>>>  #define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1)
>>>
>>> @@ -111,13 +122,13 @@ static MemoryRegion io_mem_watch;
>>>
>>>  static void phys_map_node_reserve(unsigned nodes)
>>>  {
>>> -    if (phys_map_nodes_nb + nodes > phys_map_nodes_nb_alloc) {
>>> +    if (next_pgtbl->phys_map_nodes_nb + nodes > 
>>> next_pgtbl->phys_map_nodes_nb_alloc) {
>>>          typedef PhysPageEntry Node[L2_SIZE];
>>> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc * 2, 16);
>>> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc,
>>> -                                      phys_map_nodes_nb + nodes);
>>> -        phys_map_nodes = g_renew(Node, phys_map_nodes,
>>> -                                 phys_map_nodes_nb_alloc);
>>> +        next_pgtbl->phys_map_nodes_nb_alloc = 
>>> MAX(next_pgtbl->phys_map_nodes_nb_alloc * 2, 16);
>>> +        next_pgtbl->phys_map_nodes_nb_alloc = 
>>> MAX(next_pgtbl->phys_map_nodes_nb_alloc,
>>> +                                      next_pgtbl->phys_map_nodes_nb + 
>>> nodes);
>>> +        next_pgtbl->phys_map_nodes = g_renew(Node, 
>>> next_pgtbl->phys_map_nodes,
>>> +                                 next_pgtbl->phys_map_nodes_nb_alloc);
>>>      }
>>>  }
>>>
>>> @@ -126,22 +137,16 @@ static uint16_t phys_map_node_alloc(void)
>>>      unsigned i;
>>>      uint16_t ret;
>>>
>>> -    ret = phys_map_nodes_nb++;
>>> +    ret = next_pgtbl->phys_map_nodes_nb++;
>>>      assert(ret != PHYS_MAP_NODE_NIL);
>>> -    assert(ret != phys_map_nodes_nb_alloc);
>>> +    assert(ret != next_pgtbl->phys_map_nodes_nb_alloc);
>>>      for (i = 0; i < L2_SIZE; ++i) {
>>> -        phys_map_nodes[ret][i].is_leaf = 0;
>>> -        phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>>> +        next_pgtbl->phys_map_nodes[ret][i].is_leaf = 0;
>>> +        next_pgtbl->phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>>>      }
>>>      return ret;
>>>  }
>>>
>>> -static void phys_map_nodes_reset(void)
>>> -{
>>> -    phys_map_nodes_nb = 0;
>>> -}
>>> -
>>> -
>>>  static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>>>                                  hwaddr *nb, uint16_t leaf,
>>>                                  int level)
>>> @@ -152,15 +157,15 @@ static void phys_page_set_level(PhysPageEntry *lp, 
>>> hwaddr *index,
>>>
>>>      if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
>>>          lp->ptr = phys_map_node_alloc();
>>> -        p = phys_map_nodes[lp->ptr];
>>> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>>>          if (level == 0) {
>>>              for (i = 0; i < L2_SIZE; i++) {
>>>                  p[i].is_leaf = 1;
>>> -                p[i].ptr = phys_section_unassigned;
>>> +                p[i].ptr = next_pgtbl->phys_section_unassigned;
>>>              }
>>>          }
>>>      } else {
>>> -        p = phys_map_nodes[lp->ptr];
>>> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>>>      }
>>>      lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
>>>
>>> @@ -192,11 +197,13 @@ static PhysSection 
>>> *phys_section_find(AddressSpaceDispatch *d,
>>>  {
>>>      PhysPageEntry lp = d->phys_map;
>>>      PhysPageEntry *p;
>>> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>>> +    Node *phys_map_nodes = cur_pgtbl->phys_map_nodes;
>>>      int i;
>>>
>>>      for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) {
>>>          if (lp.ptr == PHYS_MAP_NODE_NIL) {
>>> -            return &phys_sections[phys_section_unassigned];
>>> +            return &phys_sections[cur_pgtbl->phys_section_unassigned];
>>>          }
>>>          p = phys_map_nodes[lp.ptr];
>>>          lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
>>> @@ -209,6 +216,7 @@ static MemoryRegionSection 
>>> *address_space_lookup_region(AddressSpace *as,
>>>  {
>>>      PhysSection *psection;
>>>      uint16_t idx;
>>> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>>>
>>>      psection = phys_section_find(as->dispatch, addr >> TARGET_PAGE_BITS);
>>>      if (psection->sub_section) {
>>> @@ -246,7 +254,7 @@ MemoryRegionSection 
>>> *address_space_translate(AddressSpace *as, hwaddr addr,
>>>                  | (addr & iotlb.addr_mask));
>>>          len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
>>>          if (!iotlb.perm[is_write]) {
>>> -            section = &phys_sections[phys_section_unassigned].section;
>>> +            section = 
>>> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_unassigned].section;
>>>              break;
>>>          }
>>>
>>> @@ -690,12 +698,12 @@ hwaddr memory_region_section_get_iotlb(CPUArchState 
>>> *env,
>>>          iotlb = (memory_region_get_ram_addr(section->mr) & 
>>> TARGET_PAGE_MASK)
>>>              + xlat;
>>>          if (!section->readonly) {
>>> -            iotlb |= phys_section_notdirty;
>>> +            iotlb |= cur_pgtbl->phys_section_notdirty;
>>>          } else {
>>> -            iotlb |= phys_section_rom;
>>> +            iotlb |= cur_pgtbl->phys_section_rom;
>>>          }
>>>      } else {
>>> -        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, 
>>> section));
>>> +        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, 
>>> section), cur_pgtbl);
>>>          iotlb += xlat;
>>>      }
>>>
>>> @@ -705,7 +713,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState 
>>> *env,
>>>          if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) {
>>>              /* Avoid trapping reads of pages with a write breakpoint. */
>>>              if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
>>> -                iotlb = phys_section_watch + paddr;
>>> +                iotlb = cur_pgtbl->phys_section_watch + paddr;
>>>                  *address |= TLB_MMIO;
>>>                  break;
>>>              }
>>> @@ -721,59 +729,40 @@ static int subsection_register(PhysSection *psection, 
>>> uint32_t start,
>>>                                 uint32_t end, uint16_t section);
>>>  static void subsections_init(PhysSection *psection);
>>>
>>> -static void destroy_page_desc(uint16_t section_index)
>>> +/* Call after all listener has been commit.
>>> +  * we do not walk over tree, just simply drop.
>>> +  */
>>> +static void destroy_pagetable(PhysPageTable *pgtbl)
>>>  {
>>> -    g_free(phys_sections[section_index].sub_section);
>>> -}
>>> -
>>> -static void destroy_l2_mapping(PhysPageEntry *lp, unsigned level)
>>> -{
>>> -    unsigned i;
>>> -    PhysPageEntry *p;
>>> +    int i;
>>>
>>> -    if (lp->ptr == PHYS_MAP_NODE_NIL) {
>>> -        return;
>>> -    }
>>> +    g_free(pgtbl->phys_map_nodes);
>>>
>>> -    p = phys_map_nodes[lp->ptr];
>>> -    for (i = 0; i < L2_SIZE; ++i) {
>>> -        if (!p[i].is_leaf) {
>>> -            destroy_l2_mapping(&p[i], level - 1);
>>> +    for (i = 0; i < pgtbl->phys_sections_nb_alloc; i++) {
>>> +        if (pgtbl->phys_sections[i].sub_section) {
>>> +            g_free(pgtbl->phys_sections[i].sub_section);
>>>          } else {
>>> -            destroy_page_desc(p[i].ptr);
>>> +            memory_region_unref(pgtbl->phys_sections[i].section.mr);
>>>          }
>>>      }
>>> -    lp->is_leaf = 0;
>>> -    lp->ptr = PHYS_MAP_NODE_NIL;
>>> -}
>>> +    g_free(pgtbl->phys_sections);
>>>
>>> -static void destroy_all_mappings(AddressSpaceDispatch *d)
>>> -{
>>> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
>>> -    phys_map_nodes_reset();
>>> +    g_free(pgtbl);
>>>  }
>>
>> This should be a separate patch.
>>
> Sorry, this is part of the reclaimer, which performs destruction of the
> previous pgtbl. Why separate it?

You are changing the algorithm by which the old pagetable is destroyed.
This applies just as well to the current code.

BTW, your patch includes Jan's subpage patch which was broken.  I'll
drop it from the branch next time I push.

Paolo

>>> -static uint16_t phys_section_add(MemoryRegionSection *section)
>>> +static uint16_t phys_section_add(MemoryRegionSection *section, 
>>> PhysPageTable *pgtbl)
>>
>> phys_section_add will always add to next_pgtbl.
>>
> Ok, will drop @pgtbl.
>>>  {
>>> -    assert(phys_sections_nb < TARGET_PAGE_SIZE);
>>> +    assert(pgtbl->phys_sections_nb < TARGET_PAGE_SIZE);
>>>
>>> -    if (phys_sections_nb == phys_sections_nb_alloc) {
>>> -        phys_sections_nb_alloc = MAX(phys_sections_nb_alloc * 2, 16);
>>> -        phys_sections = g_renew(PhysSection, phys_sections,
>>> -                                phys_sections_nb_alloc);
>>> +    if (pgtbl->phys_sections_nb == pgtbl->phys_sections_nb_alloc) {
>>> +        pgtbl->phys_sections_nb_alloc = MAX(pgtbl->phys_sections_nb_alloc 
>>> * 2, 16);
>>> +        pgtbl->phys_sections = g_renew(PhysSection, pgtbl->phys_sections,
>>> +                                pgtbl->phys_sections_nb_alloc);
>>>      }
>>> -    phys_sections[phys_sections_nb].section = *section;
>>> -    phys_sections[phys_sections_nb].sub_section = NULL;
>>> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].section = *section;
>>> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].sub_section = NULL;
>>>      memory_region_ref(section->mr);
>>> -    return phys_sections_nb++;
>>> -}
>>> -
>>> -static void phys_sections_clear(void)
>>> -{
>>> -    while (phys_sections_nb > 0) {
>>> -        PhysSection *phys_section = &phys_sections[--phys_sections_nb];
>>> -        memory_region_unref(phys_section->section.mr);
>>> -    }
>>> +    return pgtbl->phys_sections_nb++;
>>>  }
>>>
>>>  static void register_subsection(AddressSpaceDispatch *d,
>>> @@ -793,18 +782,18 @@ static void register_subsection(AddressSpaceDispatch 
>>> *d,
>>>             psection->section.mr == &io_mem_unassigned);
>>>
>>>      if (!psection->sub_section) {
>>> -        new_section = phys_section_add(&subsection);
>>> -        psection = &phys_sections[new_section];
>>> +        new_section = phys_section_add(&subsection, next_pgtbl);
>>> +        psection = &next_pgtbl->phys_sections[new_section];
>>>          subsections_init(psection);
>>>          phys_page_set(d, base >> TARGET_PAGE_BITS, 1, new_section);
>>>      } else {
>>> -        new_section = PHYS_SECTION_ID(psection);
>>> +        new_section = PHYS_SECTION_ID(psection, next_pgtbl);
>>>      }
>>>
>>> -    new_subsection = phys_section_add(section);
>>> +    new_subsection = phys_section_add(section, next_pgtbl);
>>>
>>>      /* phys_section_add invalidates psection, reload it  */
>>> -    psection = &phys_sections[new_section];
>>> +    psection = &next_pgtbl->phys_sections[new_section];
>>>      start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
>>>      end = start + section->size - 1;
>>>      subsection_register(psection, start, end, new_subsection);
>>> @@ -816,7 +805,7 @@ static void register_multipage(AddressSpaceDispatch *d, 
>>> MemoryRegionSection *sec
>>>      hwaddr start_addr = section->offset_within_address_space;
>>>      ram_addr_t size = section->size;
>>>      hwaddr addr;
>>> -    uint16_t section_index = phys_section_add(section);
>>> +    uint16_t section_index = phys_section_add(section, next_pgtbl);
>>>
>>>      assert(size);
>>>
>>> @@ -1653,7 +1642,7 @@ static void subsections_init(PhysSection *psection)
>>>  {
>>>      psection->sub_section = g_malloc0(sizeof(uint16_t) * TARGET_PAGE_SIZE);
>>>      subsection_register(psection, 0, TARGET_PAGE_SIZE-1,
>>> -                        phys_section_unassigned);
>>> +                        next_pgtbl->phys_section_unassigned);
>>>  }
>>>
>>>  static uint16_t dummy_section(MemoryRegion *mr)
>>> @@ -1665,12 +1654,12 @@ static uint16_t dummy_section(MemoryRegion *mr)
>>>          .size = UINT64_MAX,
>>>      };
>>>
>>> -    return phys_section_add(&section);
>>> +    return phys_section_add(&section, next_pgtbl);
>>>  }
>>>
>>>  MemoryRegion *iotlb_to_region(hwaddr index)
>>>  {
>>> -    return phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
>>> +    return cur_pgtbl->phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
>>>  }
>>>
>>>  static void io_mem_init(void)
>>> @@ -1685,21 +1674,40 @@ static void io_mem_init(void)
>>>                            "watch", UINT64_MAX);
>>>  }
>>>
>>> +void global_begin(void)
>>> +{
>>> +    next_pgtbl = g_new0(PhysPageTable, 1);
>>> +    next_pgtbl->ref = 1;
>>> +    next_pgtbl->phys_section_unassigned = 
>>> dummy_section(&io_mem_unassigned);
>>> +    next_pgtbl->phys_section_notdirty = dummy_section(&io_mem_notdirty);
>>> +    next_pgtbl->phys_section_rom = dummy_section(&io_mem_rom);
>>> +    next_pgtbl->phys_section_watch = dummy_section(&io_mem_watch);
>>> +}
>>> +
>>> +/* other listeners finished */
>>> +void global_commit(void)
>>> +{
>>> +    PhysPageTable *t = cur_pgtbl;
>>> +
>>> +    cur_pgtbl = next_pgtbl;
>>> +    /* Fix me,  currently, we rely on each address space listener agaist 
>>> its
>>> +      * reader. So when we come here, no readers will touch old 
>>> phys_map_node.
>>> +      * After rcu, should changed to call_rcu()
>>> +      */
>>> +    if (__sync_sub_and_fetch(&t->ref, 1) == 0) {
>>> +        destroy_pagetable(t);
>>> +    }
>>
>> global_commit should not be needed, and global_begin should be simply
>> the new core_begin.  It should unref next_pgtbl and reallocate a new one.
>>
> Good idea, thanks.
> 
> Regards,
> Pingfan
>>
>>> +}
>>> +
>>>  static void mem_begin(MemoryListener *listener)
>>>  {
>>>      AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, 
>>> listener);
>>>
>>> -    destroy_all_mappings(d);
>>>      d->phys_map.ptr = PHYS_MAP_NODE_NIL;
>>>  }
>>>
>>>  static void core_begin(MemoryListener *listener)
>>>  {
>>> -    phys_sections_clear();
>>> -    phys_section_unassigned = dummy_section(&io_mem_unassigned);
>>> -    phys_section_notdirty = dummy_section(&io_mem_notdirty);
>>> -    phys_section_rom = dummy_section(&io_mem_rom);
>>> -    phys_section_watch = dummy_section(&io_mem_watch);
>>>  }
>>>
>>>  static void tcg_commit(MemoryListener *listener)
>>> @@ -1779,7 +1787,6 @@ void address_space_destroy_dispatch(AddressSpace *as)
>>>      AddressSpaceDispatch *d = as->dispatch;
>>>
>>>      memory_listener_unregister(&d->listener);
>>> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
>>>      g_free(d);
>>>      as->dispatch = NULL;
>>>  }
>>> @@ -2386,7 +2393,7 @@ void stl_phys_notdirty(hwaddr addr, uint32_t val)
>>>
>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>          if (memory_region_is_ram(section->mr)) {
>>> -            section = &phys_sections[phys_section_rom].section;
>>> +            section = 
>>> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>          }
>>>          io_mem_write(section->mr, addr, val, 4);
>>>      } else {
>>> @@ -2422,7 +2429,7 @@ void stq_phys_notdirty(hwaddr addr, uint64_t val)
>>>
>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>          if (memory_region_is_ram(section->mr)) {
>>> -            section = &phys_sections[phys_section_rom].section;
>>> +            section = 
>>> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>          }
>>>  #ifdef TARGET_WORDS_BIGENDIAN
>>>          io_mem_write(section->mr, addr, val >> 32, 4);
>>> @@ -2455,7 +2462,7 @@ static inline void stl_phys_internal(hwaddr addr, 
>>> uint32_t val,
>>>
>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>          if (memory_region_is_ram(section->mr)) {
>>> -            section = &phys_sections[phys_section_rom].section;
>>> +            section = 
>>> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>          }
>>>  #if defined(TARGET_WORDS_BIGENDIAN)
>>>          if (endian == DEVICE_LITTLE_ENDIAN) {
>>> @@ -2526,7 +2533,7 @@ static inline void stw_phys_internal(hwaddr addr, 
>>> uint32_t val,
>>>
>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>          if (memory_region_is_ram(section->mr)) {
>>> -            section = &phys_sections[phys_section_rom].section;
>>> +            section = 
>>> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>          }
>>>  #if defined(TARGET_WORDS_BIGENDIAN)
>>>          if (endian == DEVICE_LITTLE_ENDIAN) {
>>> diff --git a/include/exec/memory.h b/include/exec/memory.h
>>> index b97ace7..cc654fa 100644
>>> --- a/include/exec/memory.h
>>> +++ b/include/exec/memory.h
>>> @@ -992,6 +992,8 @@ void *address_space_map(AddressSpace *as, hwaddr addr,
>>>  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
>>>                           int is_write, hwaddr access_len);
>>>
>>> +void global_begin(void);
>>> +void global_commit(void);
>>>
>>>  #endif
>>>
>>> diff --git a/memory.c b/memory.c
>>> index 1a86607..da06dfd 100644
>>> --- a/memory.c
>>> +++ b/memory.c
>>> @@ -805,6 +805,7 @@ void memory_region_transaction_commit(void)
>>>      --memory_region_transaction_depth;
>>>      if (!memory_region_transaction_depth && memory_region_update_pending) {
>>>          memory_region_update_pending = false;
>>> +        global_begin();
>>>          MEMORY_LISTENER_CALL_GLOBAL(begin, Forward);
>>>
>>>          QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
>>> @@ -812,6 +813,7 @@ void memory_region_transaction_commit(void)
>>>          }
>>>
>>>          MEMORY_LISTENER_CALL_GLOBAL(commit, Forward);
>>> +        global_commit();
>>>      }
>>>  }
>>>
>>>
>>
> 
> 




reply via email to

[Prev in Thread] Current Thread [Next in Thread]