monotone-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Monotone-devel] Speed of Annotate


From: Thomas Moschny
Subject: Re: [Monotone-devel] Speed of Annotate
Date: Wed, 21 Feb 2007 10:35:36 +0100
User-agent: KMail/1.9.6

On Tue 2007-02-06, Thomas Moschny wrote:
> So, there was the idea of looking directly at the roster deltas instead.
> The head version of n.v.m.experiment.annotate.roster_deltas does exactly
> that. It introduces a way to extract node-specific changes from a roster
> delta, and uses these methods in the annotate code. It also avoids fetching
> the same information twice, as the old code did.

Attached is the current diff of that branch against mainline. If no one 
objects, I am going to merge it.

- Thomas
# 
# 
# add_dir "tests/annotate_accidental_clean_merge"
# 
# add_file "tests/annotate_accidental_clean_merge/__driver__.lua"
#  content [63b05cf0e5ab7d9455d89b319d505071d9672063]
# 
# patch "annotate.cc"
#  from [5a0e7200fad7865dc1ce9b30aaa531d0f6f7b542]
#    to [27e0f14d529224e51282f455b80a804585aa697b]
# 
# patch "database.cc"
#  from [cac4ab9d4b50bd45c5888e84619bc6f342570367]
#    to [51154627ac739699fe09fe49e7953757d7804478]
# 
# patch "database.hh"
#  from [ea6f156a45cccbf101e7f6befe00c77b8496def5]
#    to [d098635da9410b7c7ecd5a05ead436d51a54a1a3]
# 
# patch "roster_delta.cc"
#  from [66a0a455754209bea533db586769e37b9bd18ca3]
#    to [4d06fa6a95df2ff254ce9572a57bc0b0d1bb7e52]
# 
# patch "roster_delta.hh"
#  from [25e389e2c42e9754480cb7f5e43157623eeaf1fa]
#    to [85506c111153554e2184a41cae920e4198890bfd]
# 
============================================================
--- tests/annotate_accidental_clean_merge/__driver__.lua        
63b05cf0e5ab7d9455d89b319d505071d9672063
+++ tests/annotate_accidental_clean_merge/__driver__.lua        
63b05cf0e5ab7d9455d89b319d505071d9672063
@@ -0,0 +1,64 @@
+-- -*-lua-*-
+-- 
+-- We are testing annotate on this graph:
+-- 
+--       a
+--      /|\
+--     / | \
+--    b* c* d*
+--    |  |  | 
+--    e* e* e*
+--    \   \/
+--     \   e
+--      \ /
+--       e
+--       |
+--       f*
+-- 
+-- The current annotate code will arbitrarily select one of the marked
+-- e-revisions for lines coming from e.
+-- 
+
+mtn_setup()
+
+addfile("foo", "first\nfoo\nthird\n")
+commit()
+rev_a = base_revision()
+
+econtents = "first\nsecond\nthird\n"
+
+writefile("foo", "first\nb\nthird\n")
+commit()
+writefile("foo", econtents)
+commit()
+rev_e1 = base_revision()
+
+revert_to(rev_a)
+writefile("foo", "first\nc\nthird\n")
+commit()
+writefile("foo", econtents)
+commit()
+rev_e2 = base_revision()
+
+revert_to(rev_a)
+writefile("foo", "first\nd\nthird\n")
+commit()
+writefile("foo", econtents)
+commit()
+rev_e3 = base_revision()
+
+check(mtn("merge"), 0, false, false)
+check(mtn("update"), 0, false, false)
+
+writefile("foo", econtents .. "fourth\n")
+commit()
+rev_f = base_revision()
+
+check(mtn("annotate", "foo", "--brief"), 0, true, true)
+
+check(qgrep(rev_a .. ": first", "stdout"))
+check(qgrep(rev_e1 .. ": second", "stdout")
+      or qgrep(rev_e2 .. ": second", "stdout")
+      or qgrep(rev_e3 .. ": second", "stdout"))
+check(qgrep(rev_a .. ": third", "stdout"))
+check(qgrep(rev_f .. ": fourth", "stdout"))
============================================================
--- annotate.cc 5a0e7200fad7865dc1ce9b30aaa531d0f6f7b542
+++ annotate.cc 27e0f14d529224e51282f455b80a804585aa697b
@@ -160,12 +160,18 @@ struct annotate_node_work
   annotate_node_work(shared_ptr<annotate_context> annotations_,
                      shared_ptr<annotate_lineage_mapping> lineage_,
                      revision_id revision_, node_id fid_,
-                     rev_height height_)
+                     rev_height height_,
+                     set<revision_id> interesting_ancestors_,
+                     file_id content_,
+                     bool marked_)
     : annotations(annotations_),
       lineage(lineage_),
       revision(revision_),
       fid(fid_),
-      height(height_)
+      height(height_),
+      interesting_ancestors(interesting_ancestors_),
+      content(content_),
+      marked(marked_)
   {}
 
   shared_ptr<annotate_context> annotations;
@@ -173,10 +179,18 @@ struct annotate_node_work
   revision_id revision;
   node_id fid;
   rev_height height;
+  set<revision_id> interesting_ancestors;
+  file_id content;
+  bool marked;
 };
 
 struct by_rev {};
 
+// instead of using a priority queue and a set to keep track of the already
+// seen revisions, we use a multi index container. it stores work units
+// indexed both by their revision and by their revision's height, with the
+// latter being used by default. usage of that data structure frees us from
+// the burden of keeping two data structures in sync.
 typedef multi_index_container<
   annotate_node_work,
   indexed_by<
@@ -662,74 +676,74 @@ annotate_lineage_mapping::set_copied_all
     }
 }
 
-static void
-do_annotate_node
-(annotate_node_work const & work_unit,
- app_state & app,
- work_units & work_units)
+// fetches the list of file_content markings for the given revision_id and
+// node_id
+static void get_file_content_marks(app_state & app,
+                                   revision_id const & rev,
+                                   node_id const & fid,
+                                   set<revision_id> & content_marks)
 {
-  L(FL("do_annotate_node for node %s") % work_unit.revision);
+  marking_t markings;
+  app.db.get_markings(rev, fid, markings);
 
-  roster_t roster;
-  marking_t marks;
+  I(!markings.file_content.empty());
 
-  {
-    marking_map markmap;
-    app.db.get_roster(work_unit.revision, roster, markmap);
+  content_marks.clear();
+  content_marks.insert(markings.file_content.begin(),
+                       markings.file_content.end());
+}
 
-    map<node_id, marking_t>::const_iterator mmi =
-      markmap.find(work_unit.fid);
-    I(mmi != markmap.end());
-    marks = mmi->second;
-  }
+static void
+do_annotate_node(annotate_node_work const & work_unit,
+                 app_state & app,
+                 work_units & work_units)
+{
+  L(FL("do_annotate_node for node %s") % work_unit.revision);
 
-  I(marks.file_content.size() != 0);
-
-  set<revision_id> parents; // fixme: rename 
-
-  if (marks.file_content.size() == 1
-      && *(marks.file_content.begin()) == work_unit.revision)
-    {
-      // this node is marked, need to inspect the parents
-      app.db.get_revision_parents(work_unit.revision, parents);
-    }
-  else
-    {
-      // jump directly to the marked ancestors
-      parents = marks.file_content;
-    }
-
   size_t added_in_parent_count = 0;
 
-  for (set<revision_id>::const_iterator i = parents.begin();
-       i != parents.end(); i++)
+  for (set<revision_id>::const_iterator i = 
work_unit.interesting_ancestors.begin();
+       i != work_unit.interesting_ancestors.end(); i++)
     {
+      // here, 'parent' means either a real parent or one of the marked
+      // ancestors, depending on whether work_unit.marked is true.
       revision_id parent_revision = *i;
 
-      roster_t parent_roster;
       L(FL("do_annotate_node processing edge from parent %s to child %s")
         % parent_revision % work_unit.revision);
 
       I(!(work_unit.revision == parent_revision));
-      app.db.get_roster(parent_revision, parent_roster);
 
-      if (!parent_roster.has_node(work_unit.fid))
+      file_id file_in_parent;
+      
+      work_units::index<by_rev>::type::iterator lmn =
+        work_units.get<by_rev>().find(parent_revision);
+
+      // find out the content hash of the file in parent.
+      if (lmn != work_units.get<by_rev>().end())
+        // we already got the content hash.
+        file_in_parent = lmn->content;
+      else
         {
+          if (work_unit.marked)
+            app.db.get_file_content(parent_revision, work_unit.fid, 
file_in_parent);
+          else
+            // we are not marked, so parent is marked.
+            file_in_parent = work_unit.content;
+        }
+
+      // stop if file is not present in the parent.
+      if (null_id(file_in_parent))
+        {
           L(FL("file added in %s, continuing") % work_unit.revision);
           added_in_parent_count++;
           continue;
         }
-
-      // The node was live in the parent, so this represents a delta.
-      file_t file_in_child =
-        downcast_to_file_t(roster.get_node(work_unit.fid));
-
-      file_t file_in_parent =
-        downcast_to_file_t(parent_roster.get_node(work_unit.fid));
-
+      
+      // the node was live in the parent, so this represents a delta.
       shared_ptr<annotate_lineage_mapping> parent_lineage;
 
-      if (file_in_parent->content == file_in_child->content)
+      if (file_in_parent == work_unit.content)
         {
           L(FL("parent file identical, "
                "set copied all mapped and copy lineage\n"));
@@ -739,9 +753,9 @@ do_annotate_node
       else
         {
           file_data data;
-          app.db.get_file_version(file_in_parent->content, data);
+          app.db.get_file_version(file_in_parent, data);
           L(FL("building parent lineage for parent file %s")
-            % file_in_parent->content);
+            % file_in_parent);
           parent_lineage
             = work_unit.lineage->build_parent_lineage(work_unit.annotations,
                                                       parent_revision,
@@ -750,26 +764,41 @@ do_annotate_node
 
       // If this parent has not yet been queued for processing, create the
       // work unit for it.
-      work_units::index<by_rev>::type::iterator lmn =
-        work_units.get<by_rev>().find(parent_revision);
+      if (lmn == work_units.get<by_rev>().end())
+        {
+          set<revision_id> parents_interesting_ancestors;
+          bool parent_marked;
 
-      if (lmn == work_units.get<by_rev>().end()) // not yet seen
-        {
-          // Once we move on to processing the parent that this file was
-          // renamed from, we'll need the old name
+          if (work_unit.marked)
+            {
+              // we are marked, thus we don't know a priori whether parent
+              // is marked or not.
+              get_file_content_marks(app, parent_revision, work_unit.fid, 
parents_interesting_ancestors);
+              parent_marked = (parents_interesting_ancestors.size() == 1
+                               && *(parents_interesting_ancestors.begin()) == 
parent_revision);
+            }
+          else
+            parent_marked = true;
+          
+          // if it's marked, we need to look at its parents instead.
+          if (parent_marked)
+            app.db.get_revision_parents(parent_revision, 
parents_interesting_ancestors);
+          
           rev_height parent_height;
           app.db.get_rev_height(parent_revision, parent_height);
           annotate_node_work newunit(work_unit.annotations,
                                      parent_lineage,
                                      parent_revision,
                                      work_unit.fid,
-                                     parent_height);
-
+                                     parent_height,
+                                     parents_interesting_ancestors,
+                                     file_in_parent,
+                                     parent_marked);
           work_units.insert(newunit);
         }
       else
         {
-          // Already a pending node, so we just have to merge the lineage.
+          // already a pending node, so we just have to merge the lineage.
           L(FL("merging lineage from node %s to parent %s")
             % work_unit.revision % parent_revision);
           
@@ -777,7 +806,7 @@ do_annotate_node
         }
     }
 
-  if (added_in_parent_count == parents.size())
+  if (added_in_parent_count == work_unit.interesting_ancestors.size())
     {
       work_unit.lineage->credit_mapped_lines(work_unit.annotations);
     }
@@ -799,9 +828,18 @@ do_annotate (app_state &app, file_t file
 
   work_units work_units;
   {
+    // prepare the first work_unit
     rev_height height;
     app.db.get_rev_height(rid, height);
-    annotate_node_work workunit(acp, lineage, rid, file_node->self, height);
+    set<revision_id> rids_interesting_ancestors;
+    get_file_content_marks(app, rid, file_node->self, 
rids_interesting_ancestors);
+    bool rid_marked = (rids_interesting_ancestors.size() == 1
+                       && *(rids_interesting_ancestors.begin()) == rid);
+    if (rid_marked)
+      app.db.get_revision_parents(rid, rids_interesting_ancestors);
+    
+    annotate_node_work workunit(acp, lineage, rid, file_node->self, height,
+                                rids_interesting_ancestors, 
file_node->content, rid_marked);
     work_units.insert(workunit);
   }
   
============================================================
--- database.cc cac4ab9d4b50bd45c5888e84619bc6f342570367
+++ database.cc 51154627ac739699fe09fe49e7953757d7804478
@@ -1394,7 +1394,148 @@ struct roster_reconstruction_graph : pub
   }
 };
 
+struct database::extractor
+{
+  virtual bool look_at_delta(roster_delta const & del) = 0;
+  virtual void look_at_roster(roster_t const & roster, marking_map const & mm) 
= 0;
+  virtual ~extractor() {};
+};
+
+struct database::markings_extractor : public database::extractor
+{
+private:
+  node_id const & nid;
+  marking_t & markings;
+
+public:
+  markings_extractor(node_id const & _nid, marking_t & _markings) :
+    nid(_nid), markings(_markings) {} ;
+  
+  bool look_at_delta(roster_delta const & del)
+  {
+    return try_get_markings_from_roster_delta(del, nid, markings);
+  }
+  
+  void look_at_roster(roster_t const & roster, marking_map const & mm)
+  {
+    map<node_id, marking_t>::const_iterator mmi =
+      mm.find(nid);
+    I(mmi != mm.end());
+    markings = mmi->second;
+  }
+};
+
+struct database::file_content_extractor : database::extractor
+{
+private:
+  node_id const & nid;
+  file_id & content;
+
+public:
+  file_content_extractor(node_id const & _nid, file_id & _content) :
+    nid(_nid), content(_content) {} ;
+
+  bool look_at_delta(roster_delta const & del)
+  {
+    return try_get_content_from_roster_delta(del, nid, content);
+  }
+
+  void look_at_roster(roster_t const & roster, marking_map const & mm)
+  {
+    if (roster.has_node(nid))
+      content = downcast_to_file_t(roster.get_node(nid))->content;
+    else
+      content = file_id();
+  }
+};
+
 void
+database::extract_from_deltas(revision_id const & id, extractor & x)
+{
+  reconstruction_path selected_path;
+  {
+    roster_reconstruction_graph graph(*this);
+    {
+      // we look at the nearest delta(s) first, without constructing the
+      // whole path, as that would be a rather expensive operation.
+      //
+      // the reason why this strategy is worth the effort is that in most
+      // cases we are looking at the parent of a (content-)marked node, thus
+      // the information we are looking for is right there in the delta leading to
+      // this node.
+      // 
+      // recording the deltas visited here in a set so as to avoid inspecting
+      // them later seems to be of little value, as it imposes a cost here,
+      // but can seldom be exploited.
+      set<string> deltas;
+      graph.get_next(id.inner()(), deltas);
+      for (set<string>::const_iterator i = deltas.begin();
+           i != deltas.end(); ++i)
+        {
+          roster_delta del;
+          get_roster_delta(id.inner()(), *i, del);
+          bool found = x.look_at_delta(del);
+          if (found)
+            return;
+        }
+    }
+    get_reconstruction_path(id.inner()(), graph, selected_path);
+  }
+
+  int path_length(selected_path.size());
+  int i(0);
+  string target_rev;
+
+  for (reconstruction_path::const_iterator p = selected_path.begin();
+       p != selected_path.end(); ++p)
+    {
+      if (i > 0)
+        {
+          roster_delta del;
+          get_roster_delta(target_rev, *p, del);
+          bool found = x.look_at_delta(del);
+          if (found)
+            return;
+        }
+      if (i == path_length-1)
+        {
+          // last iteration, we have reached a roster base
+          roster_t roster;
+          marking_map mm;
+          get_roster_base(*p, roster, mm);
+          x.look_at_roster(roster, mm);
+          return;
+        }
+      target_rev = *p;
+      ++i;
+    }
+}
+
+void
+database::get_markings(revision_id const & id,
+                       node_id const & nid,
+                       marking_t & markings)
+{
+  markings_extractor x(nid, markings);
+  extract_from_deltas(id, x);
+}
+
+void
+database::get_file_content(revision_id const & id,
+                           node_id const & nid,
+                           file_id & content)
+{
+  // the imaginary root revision doesn't have any file.
+  if (null_id(id))
+    {
+      content = file_id();
+      return;
+    }
+  file_content_extractor x(nid, content);
+  extract_from_deltas(id, x);
+}
+
+void
 database::get_roster_version(revision_id const & id,
                              cached_roster & cr)
 {
============================================================
--- database.hh ea6f156a45cccbf101e7f6befe00c77b8496def5
+++ database.hh d098635da9410b7c7ecd5a05ead436d51a54a1a3
@@ -345,6 +345,20 @@ public:
   bool roster_version_exists(revision_id const & ident);
   void get_roster_ids(std::set<revision_id> & ids);
 
+  // using roster deltas
+  void get_markings(revision_id const & id,
+                    node_id const & nid,
+                    marking_t & markings);
+
+  void get_file_content(revision_id const & id,
+                        node_id const & nid,
+                        file_id & content);
+private:
+  struct extractor;
+  struct file_content_extractor;
+  struct markings_extractor;
+  void extract_from_deltas(revision_id const & id, extractor & x);
+
   //
   // --== Keys ==--
   //
============================================================
--- roster_delta.cc     66a0a455754209bea533db586769e37b9bd18ca3
+++ roster_delta.cc     4d06fa6a95df2ff254ce9572a57bc0b0d1bb7e52
@@ -475,6 +475,16 @@ delta_rosters(roster_t const & from, mar
   del = roster_delta(printer.buf);
 }
 
+static
+void read_roster_delta(roster_delta const & del,
+                       roster_delta_t & d)
+{
+  basic_io::input_source src(del.inner()(), "roster_delta");
+  basic_io::tokenizer tok(src);
+  basic_io::parser pars(tok);
+  parse_roster_delta_t(pars, d);
+}
+
 void
 apply_roster_delta(roster_delta const & del,
                    roster_t & roster, marking_map & markings)
@@ -483,15 +493,77 @@ apply_roster_delta(roster_delta const & 
   MM(roster);
   MM(markings);
 
-  basic_io::input_source src(del.inner()(), "roster_delta");
-  basic_io::tokenizer tok(src);
-  basic_io::parser pars(tok);
   roster_delta_t d;
-  parse_roster_delta_t(pars, d);
+  read_roster_delta(del, d);
   d.apply(roster, markings);
 }
 
+// Extract the marking for one node from the roster delta, or return false
+// if it is not contained in that delta
+bool
+try_get_markings_from_roster_delta(roster_delta const & del,
+                                   node_id const & nid,
+                                   marking_t & markings)
+{
+  roster_delta_t d;
+  read_roster_delta(del, d);
 
+  std::map<node_id, marking_t>::iterator i = d.markings_changed.find(nid);
+  if (i != d.markings_changed.end())
+    {
+      markings = i->second;
+      return true;
+    }
+  else
+    {
+      return false;
+    }
+}
+
+// Extract the content hash for one node from the roster delta -- if it is
+// available.  If we know what the file_id was, we return true and set
+// 'content' appropriately.  If we can prove that the file did not exist in
+// this revision, we return true and set 'content' to a null id.  If we
+// cannot determine anything about the file contents, then we return false
+// -- in this case content is left undefined.
+bool
+try_get_content_from_roster_delta(roster_delta const & del,
+                              node_id const & nid,
+                              file_id & content)
+{
+  roster_delta_t d;
+  read_roster_delta(del, d);
+  
+  roster_delta_t::deltas_applied_t::const_iterator i = 
d.deltas_applied.find(nid);
+  if (i != d.deltas_applied.end())
+    {
+      content = i->second;
+      return true;
+    }
+
+  // special case 1: the node was deleted, so we know for sure it's not
+  // there anymore in this roster
+  if (d.nodes_deleted.find(nid) != d.nodes_deleted.end())
+    {
+      content = file_id();
+      return true;
+    }
+
+  // special case 2: the node was added, so we need to get the current
+  // content hash from the add stanza
+  for (roster_delta_t::files_added_t::const_iterator j = d.files_added.begin();
+       j != d.files_added.end(); ++j)
+    {
+      if (j->second.first == nid)
+        {
+          content = j->second.second;
+          return true;
+        }
+    }
+   
+  return false;
+}
+
 #ifdef BUILD_UNIT_TESTS
 
 static void
============================================================
--- roster_delta.hh     25e389e2c42e9754480cb7f5e43157623eeaf1fa
+++ roster_delta.hh     85506c111153554e2184a41cae920e4198890bfd
@@ -25,7 +25,17 @@ apply_roster_delta(roster_delta const & 
 apply_roster_delta(roster_delta const & del,
                    roster_t & roster, marking_map & markings);
                    
+bool
+try_get_markings_from_roster_delta(roster_delta const & del,
+                                   node_id const & nid,
+                                   marking_t & markings);
 
+// See the comment on this function's body for a description of its api.
+bool
+try_get_content_from_roster_delta(roster_delta const & del,
+                                  node_id const & nid,
+                                  file_id & content);
+  
 #ifdef BUILD_UNIT_TESTS
 
 // instead of having elaborate tests here, we just export a function, and then
 annotate.cc                                          |  160 +++++++++++--------
 database.cc                                          |  141 ++++++++++++++++
 database.hh                                          |   14 +
 roster_delta.cc                                      |   80 +++++++++
 roster_delta.hh                                      |   10 +
 tests/annotate_accidental_clean_merge/__driver__.lua |   64 +++++++
 6 files changed, 404 insertions(+), 65 deletions(-)

Attachment: pgp4Qxlxuoegk.pgp
Description: PGP signature


reply via email to

[Prev in Thread] Current Thread [Next in Thread]