rdiff-backup-commits

[Rdiff-backup-commits] Changes to rdiff-backup/rdiff_backup/metadata.py


From: Ben Escoto
Subject: [Rdiff-backup-commits] Changes to rdiff-backup/rdiff_backup/metadata.py [r1-0]
Date: Fri, 25 Nov 2005 19:17:46 -0500

Index: rdiff-backup/rdiff_backup/metadata.py
diff -u /dev/null rdiff-backup/rdiff_backup/metadata.py:1.16.2.1
--- /dev/null   Sat Nov 26 00:17:46 2005
+++ rdiff-backup/rdiff_backup/metadata.py       Sat Nov 26 00:17:46 2005
@@ -0,0 +1,419 @@
+# Copyright 2002 Ben Escoto
+#
+# This file is part of rdiff-backup.
+#
+# rdiff-backup is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# rdiff-backup is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with rdiff-backup; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+# USA
+
+"""Store and retrieve metadata in destination directory
+
+The plan is to store metadata information for all files in the
+destination directory in a special metadata file.  There are two
+reasons for this:
+
+1)  The filesystem of the mirror directory may not be able to handle
+    types of metadata that the source filesystem can.  For instance,
+    rdiff-backup may not have root access on the destination side, so
+    cannot set uid/gid.  Or the source side may have ACLs and the
+    destination side doesn't.
+
+    Hopefully every file system can store binary data.  Storing
+    metadata separately allows us to back up anything (ok, maybe
+    strange filenames are still a problem).
+
+2)  Metadata can be read more quickly from a file than by traversing
+    the mirror directory over and over again.  In many cases most of
+    rdiff-backup's time is spent comparing metadata (like file size
+    and modtime), trying to find differences.  Reading this
+    data sequentially from a file is significantly less taxing than
+    listing directories and statting files all over the mirror
+    directory.
+
+The metadata is stored in a text file, which is a bunch of records
+concatenated together.  Each record has the format:
+
+File <filename>
+  <field_name1> <value>
+  <field_name2> <value>
+  ...
+
+The fields are separated by newlines.  See the code below for the
+field names and values.
+
+"""
+
+from __future__ import generators
+import re, gzip, os, binascii
+import log, Globals, rpath, Time, robust, increment, static
+
+class ParsingError(Exception):
+       """This is raised when bad or unparsable data is received"""
+       pass
+
+def carbonfile2string(cfile):
+       """Convert CarbonFile data to a string suitable for storing."""
+       retvalparts = []
+       retvalparts.append('creator:%s' % binascii.hexlify(cfile['creator']))
+       retvalparts.append('type:%s' % binascii.hexlify(cfile['type']))
+       retvalparts.append('location:%d,%d' % cfile['location'])
+       retvalparts.append('flags:%d' % cfile['flags'])
+       return '|'.join(retvalparts)
+
+def string2carbonfile(data):
+       """Re-constitute CarbonFile data from a string stored by 
+       carbonfile2string."""
+       retval = {}
+       for component in data.split('|'):
+               key, value = component.split(':')
+               if key == 'creator':
+                       retval['creator'] = binascii.unhexlify(value)
+               elif key == 'type':
+                       retval['type'] = binascii.unhexlify(value)
+               elif key == 'location':
+                       a, b = value.split(',')
+                       retval['location'] = (int(a), int(b))
+               elif key == 'flags':
+                       retval['flags'] = int(value)
+       return retval
+
+def RORP2Record(rorpath):
+       """From RORPath, return text record of file's metadata"""
+       str_list = ["File %s\n" % quote_path(rorpath.get_indexpath())]
+
+       # Store file type, e.g. "dev", "reg", or "sym", and type-specific data
+       type = rorpath.gettype()
+       if type is None: type = "None"
+       str_list.append("  Type %s\n" % type)
+       if type == "reg":
+               str_list.append("  Size %s\n" % rorpath.getsize())
+
+               # If there is a resource fork, save it.
+               if rorpath.has_resource_fork():
+                       if not rorpath.get_resource_fork(): rf = "None"
+                       else: rf = binascii.hexlify(rorpath.get_resource_fork())
+                       str_list.append("  ResourceFork %s\n" % (rf,))
+
+               # If there is Carbon data, save it.
+               if rorpath.has_carbonfile():
+                       if not rorpath.get_carbonfile(): cfile = "None"
+                       else: cfile = carbonfile2string(rorpath.get_carbonfile())
+                       str_list.append("  CarbonFile %s\n" % (cfile,))
+
+               # If file is hardlinked, add that information
+               if Globals.preserve_hardlinks:
+                       numlinks = rorpath.getnumlinks()
+                       if numlinks > 1:
+                               str_list.append("  NumHardLinks %s\n" % numlinks)
+                               str_list.append("  Inode %s\n" % rorpath.getinode())
+                               str_list.append("  DeviceLoc %s\n" % rorpath.getdevloc())
+
+       elif type == "None": return "".join(str_list)
+       elif type == "dir" or type == "sock" or type == "fifo": pass
+       elif type == "sym":
+               str_list.append("  SymData %s\n" % 
quote_path(rorpath.readlink()))
+       elif type == "dev":
+               major, minor = rorpath.getdevnums()
+               if rorpath.isblkdev(): devchar = "b"
+               else:
+                       assert rorpath.ischardev()
+                       devchar = "c"
+               str_list.append("  DeviceNum %s %s %s\n" % (devchar, major, 
minor))
+
+       # Store time information
+       if type != 'sym' and type != 'dev':
+               str_list.append("  ModTime %s\n" % rorpath.getmtime())
+
+       # Add user, group, and permission information
+       uid, gid = rorpath.getuidgid()
+       str_list.append("  Uid %s\n" % uid)
+       str_list.append("  Uname %s\n" % (rorpath.getuname() or ":"))
+       str_list.append("  Gid %s\n" % gid)
+       str_list.append("  Gname %s\n" % (rorpath.getgname() or ":"))
+       str_list.append("  Permissions %s\n" % rorpath.getperms())
+       return "".join(str_list)
+
+line_parsing_regexp = re.compile("^ *([A-Za-z0-9]+) (.+)$", re.M)
+def Record2RORP(record_string):
+       """Given record_string, return RORPath
+
+       For speed reasons, write the RORPath data dictionary directly
+       instead of calling rorpath functions.  Profiling has shown this to
+       be a time critical function.
+
+       """
+       data_dict = {}
+       for field, data in line_parsing_regexp.findall(record_string):
+               if field == "File": index = quoted_filename_to_index(data)
+               elif field == "Type":
+                       if data == "None": data_dict['type'] = None
+                       else: data_dict['type'] = data
+               elif field == "Size": data_dict['size'] = long(data)
+               elif field == "ResourceFork":
+                       if data == "None": data_dict['resourcefork'] = ""
+                       else: data_dict['resourcefork'] = binascii.unhexlify(data)
+               elif field == "CarbonFile":
+                       if data == "None": data_dict['carbonfile'] = None
+                       else: data_dict['carbonfile'] = string2carbonfile(data)
+               elif field == "NumHardLinks": data_dict['nlink'] = int(data)
+               elif field == "Inode": data_dict['inode'] = long(data)
+               elif field == "DeviceLoc": data_dict['devloc'] = long(data)
+               elif field == "SymData": data_dict['linkname'] = 
unquote_path(data)
+               elif field == "DeviceNum":
+                       devchar, major_str, minor_str = data.split(" ")
+                       data_dict['devnums'] = (devchar, int(major_str), int(minor_str))
+               elif field == "ModTime": data_dict['mtime'] = long(data)
+               elif field == "Uid": data_dict['uid'] = int(data)
+               elif field == "Gid": data_dict['gid'] = int(data)
+               elif field == "Uname":
+                       if data == ":" or data == 'None': data_dict['uname'] = 
None
+                       else: data_dict['uname'] = data
+               elif field == "Gname":
+                       if data == ':' or data == 'None': data_dict['gname'] = None
+                       else: data_dict['gname'] = data
+               elif field == "Permissions": data_dict['perms'] = int(data)
+               else: log.Log("Unknown field in line '%s %s'" % (field, data), 2)
+       return rpath.RORPath(index, data_dict)
+
+chars_to_quote = re.compile("\\n|\\\\")
+def quote_path(path_string):
+       """Return quoted verson of path_string
+
+       Because newlines are used to separate fields in a record, they are
+       replaced with \n.  Backslashes become \\ and everything else is
+       left the way it is.
+
+       """
+       def replacement_func(match_obj):
+               """This is called on the match obj of any char that needs 
quoting"""
+               char = match_obj.group(0)
+               if char == "\n": return "\\n"
+               elif char == "\\": return "\\\\"
+               assert 0, "Bad char %s needs quoting" % char
+       return chars_to_quote.sub(replacement_func, path_string)
+
+def unquote_path(quoted_string):
+       """Reverse what was done by quote_path"""
+       def replacement_func(match_obj):
+               """Unquote match obj of two character sequence"""
+               two_chars = match_obj.group(0)
+               if two_chars == "\\n": return "\n"
+               elif two_chars == "\\\\": return "\\"
+               log.Log("Warning, unknown quoted sequence %s found" % 
two_chars, 2)
+               return two_chars
+       return re.sub("\\\\n|\\\\\\\\", replacement_func, quoted_string)
+
+def quoted_filename_to_index(quoted_filename):
+       """Return tuple index given quoted filename"""
+       if quoted_filename == '.': return ()
+       else: return tuple(unquote_path(quoted_filename).split('/'))
+
+class FlatExtractor:
+       """Controls iterating objects from flat file"""
+
+       # Set this in subclass.  record_boundary_regexp should match
+       # beginning of next record.  The first group should start at the
+       # beginning of the record.  The second group should contain the
+       # (possibly quoted) filename.
+       record_boundary_regexp = None
+
+       # Set in subclass to function that converts text record to object
+       record_to_object = None
+
+       def __init__(self, fileobj):
+               self.fileobj = fileobj # holds file object we are reading from
+               self.buf = "" # holds the next part of the file
+               self.at_end = 0 # True if we are at the end of the file
+               self.blocksize = 32 * 1024
+
+       def get_next_pos(self):
+               """Return position of next record in buffer, or end pos if 
none"""
+               while 1:
+                       m = self.record_boundary_regexp.search(self.buf, 1)
+                       if m: return m.start(1)
+                       else: # add next block to the buffer, loop again
+                               newbuf = self.fileobj.read(self.blocksize)
+                               if not newbuf:
+                                       self.at_end = 1
+                                       return len(self.buf)
+                               else: self.buf += newbuf
+
+       def iterate(self):
+               """Return iterator that yields all objects with records"""
+               while 1:
+                       next_pos = self.get_next_pos()
+                       try: yield self.record_to_object(self.buf[:next_pos])
+                       except ParsingError, e:
+                               if self.at_end: break # Ignore whitespace/bad records at end
+                               log.Log("Error parsing flat file: %s" % (e,), 2)
+                       if self.at_end: break
+                       self.buf = self.buf[next_pos:]
+               assert not self.close()
+
+       def skip_to_index(self, index):
+               """Scan through the file, set buffer to beginning of index 
record
+
+               Here we make sure that the buffer always ends in a newline, so
+               we will not be splitting lines in half.
+
+               """
+               assert not self.buf or self.buf.endswith("\n")
+               while 1:
+                       self.buf = self.fileobj.read(self.blocksize)
+                       self.buf += self.fileobj.readline()
+                       if not self.buf:
+                               self.at_end = 1
+                               return
+                       while 1:
+                               m = self.record_boundary_regexp.search(self.buf)
+                               if not m: break
+                               cur_index = self.filename_to_index(m.group(2))
+                               if cur_index >= index:
+                                       self.buf = self.buf[m.start(1):]
+                                       return
+                               else: self.buf = self.buf[m.end(1):]
+
+       def iterate_starting_with(self, index):
+               """Iterate objects whose index starts with given index"""
+               self.skip_to_index(index)
+               if self.at_end: return
+               while 1:
+                       next_pos = self.get_next_pos()
+                       try: obj = self.record_to_object(self.buf[:next_pos])
+                       except ParsingError, e:
+                               log.Log("Error parsing metadata file: %s" % 
(e,), 2)
+                       else:
+                               if obj.index[:len(index)] != index: break
+                               yield obj
+                       if self.at_end: break
+                       self.buf = self.buf[next_pos:]
+               assert not self.close()
+
+       def filename_to_index(self, filename):
+               """Translate filename, possibly quoted, into an index tuple
+
+               The filename is the second group matched by
+               record_boundary_regexp.
+
+               """
+               assert 0 # subclass
+
+       def close(self):
+               """Return value of closing associated file"""
+               return self.fileobj.close()
+
+class RorpExtractor(FlatExtractor):
+       """Iterate rorps from metadata file"""
+       record_boundary_regexp = re.compile("(?:\\n|^)(File (.*?))\\n")
+       record_to_object = staticmethod(Record2RORP)
+       filename_to_index = staticmethod(quoted_filename_to_index)
+
+
+class FlatFile:
+       """Manage a flat (probably text) file containing info on various files
+
+       This is used for metadata information, and possibly EAs and ACLs.
+       The main read interface is as an iterator.  The storage format is
+       a flat, probably compressed file, so random access is not
+       recommended.
+
+       """
+       _prefix = None # Set this to real prefix when subclassing
+       _rp, _fileobj = None, None
+       # Buffering may be useful because gzip writes are slow
+       _buffering_on = 1
+       _record_buffer, _max_buffer_size = None, 100
+       _extractor = FlatExtractor # Set to class that iterates objects
+
+       def open_file(cls, rp = None, compress = 1):
+               """Open file for writing.  Use cls._rp if rp not given."""
+               assert not cls._fileobj, "Flatfile already open"
+               cls._record_buffer = []
+               if rp: cls._rp = rp
+               else:
+                       if compress: typestr = 'snapshot.gz'
+                       else: typestr = 'snapshot'
+                       cls._rp = Globals.rbdir.append(
+                               "%s.%s.%s" % (cls._prefix, Time.curtimestr, 
typestr))
+               cls._fileobj = cls._rp.open("wb", compress = compress)
+
+       def write_object(cls, object):
+               """Convert one object to record and write to file"""
+               record = cls._object_to_record(object)
+               if cls._buffering_on:
+                       cls._record_buffer.append(record)
+                       if len(cls._record_buffer) >= cls._max_buffer_size:
+                               cls._fileobj.write("".join(cls._record_buffer))
+                               cls._record_buffer = []
+               else: cls._fileobj.write(record)
+
+       def close_file(cls):
+               """Close file, for when any writing is done"""
+               assert cls._fileobj, "File already closed"
+               if cls._buffering_on and cls._record_buffer: 
+                       cls._fileobj.write("".join(cls._record_buffer))
+                       cls._record_buffer = []
+               try: fileno = cls._fileobj.fileno() # will not work if GzipFile
+               except AttributeError: fileno = cls._fileobj.fileobj.fileno()
+               os.fsync(fileno)
+               result = cls._fileobj.close()
+               cls._fileobj = None
+               cls._rp.setdata()
+               return result
+
+       def get_objects(cls, restrict_index = None, compressed = None):
+               """Return iterator of objects records from file rp"""
+               assert cls._rp, "Must have rp set before get_objects can be 
used"
+               if compressed is None:
+                       if cls._rp.isincfile():
+                               compressed = cls._rp.inc_compressed
+                               assert (cls._rp.inc_type == 'data' or
+                                               cls._rp.inc_type == 'snapshot'), cls._rp.inc_type
+                       else: compressed = cls._rp.get_indexpath().endswith('.gz')
+
+               fileobj = cls._rp.open('rb', compress = compressed)
+               if not restrict_index: return cls._extractor(fileobj).iterate()
+               else:
+                       extractor = cls._extractor(fileobj)
+                       return extractor.iterate_starting_with(restrict_index)
+
+       def get_objects_at_time(cls, rbdir, time, restrict_index = None,
+                                                       rblist = None):
+               """Scan through rbdir, finding data at given time, iterate
+
+               If rblist is given, use that instead of listing rbdir.  Time
+               here is exact, we don't take the next one older or anything.
+               Returns None if no file matching prefix is found.
+
+               """
+               if rblist is None:
+                       rblist = map(lambda x: rbdir.append(x), robust.listrp(rbdir))
+
+               for rp in rblist:
+                       if (rp.isincfile() and
+                               (rp.getinctype() == "data" or rp.getinctype() == "snapshot")
+                               and rp.getincbase_str() == cls._prefix):
+                               if rp.getinctime() == time:
+                                       cls._rp = rp
+                                       return cls.get_objects(restrict_index)
+               return None
+
+static.MakeClass(FlatFile)
+
+class MetadataFile(FlatFile):
+       """Store/retrieve metadata from mirror_metadata as rorps"""
+       _prefix = "mirror_metadata"
+       _extractor = RorpExtractor
+       _object_to_record = staticmethod(RORP2Record)
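
For reference, here is roughly what one record written by RORP2Record
looks like for an ordinary regular file.  The values below are
illustrative, not taken from a real run; note that Permissions is the
mode in decimal (420 == octal 0644):

    File some/dir/file.txt
      Type reg
      Size 14
      ModTime 1132964266
      Uid 1000
      Uname ben
      Gid 1000
      Gname ben
      Permissions 420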
+
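The per-line parsing done by Record2RORP can be checked standalone;
the regexp below is the same as line_parsing_regexp in the patch,
while the record text is invented:

    import re

    line_parsing_regexp = re.compile("^ *([A-Za-z0-9]+) (.+)$", re.M)

    record = "File foo\n  Type reg\n  Size 3\n  ModTime 1132964266\n"
    fields = line_parsing_regexp.findall(record)
    assert fields[0] == ("File", "foo")
    assert ("Size", "3") in fields and ("ModTime", "1132964266") in fields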

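The path quoting is also self-contained enough to sanity-check on its
own.  Below is a minimal standalone sketch (stdlib only, written for a
current Python); the regular expressions mirror the ones in the patch:

    import re

    chars_to_quote = re.compile(r"\n|\\")

    def quote_path(path_string):
        # Newlines separate record fields and backslashes introduce
        # escapes, so both must be quoted inside filenames.
        def replacement_func(match_obj):
            if match_obj.group(0) == "\n": return "\\n"
            else: return "\\\\"
        return chars_to_quote.sub(replacement_func, path_string)

    def unquote_path(quoted_string):
        # Invert quote_path by mapping the two-char sequences back.
        def replacement_func(match_obj):
            if match_obj.group(0) == "\\n": return "\n"
            else: return "\\"
        return re.sub(r"\\n|\\\\", replacement_func, quoted_string)

    # A filename containing a newline and a backslash round-trips.
    nasty = "odd\nname\\here"
    assert unquote_path(quote_path(nasty)) == nasty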

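Similarly, the record-boundary regexp from RorpExtractor can be
exercised against an in-memory stream.  The records below are invented
and the splitting is simplified; only the regexp comes from the patch.
Group 1 starts at the beginning of each record and group 2 holds the
(possibly quoted) filename, which is what FlatExtractor relies on:

    import re

    record_boundary_regexp = re.compile(r"(?:\n|^)(File (.*?))\n")

    data = ("File foo\n  Type reg\n  Size 3\n"
            "File bar\n  Type sym\n  SymData baz\n")

    # Split the stream at the start of each "File ..." line.
    starts = [m.start(1) for m in record_boundary_regexp.finditer(data)]
    records = [data[i:j] for i, j in zip(starts, starts[1:] + [len(data)])]

    assert [r.split("\n", 1)[0] for r in records] == ["File foo", "File bar"]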

