Compare commits

...

2 Commits

Author SHA1 Message Date
4a30b82e39 backup: replace simple max size with rule-based system
Now individual files or patterns can have their own maximum sizes.
2021-10-18 17:43:33 -04:00
ac12b42cad backup: rename force-include to unexclude
Force-include is a misnomer because it won't include files
that weren't considered at all (like files in an excluded subdir).
Instead, call it "unexclude" to make it slightly clearer that this
will just override the exclusions.
2021-10-18 16:25:23 -04:00
2 changed files with 77 additions and 68 deletions

124
backup.py
View File

@@ -29,38 +29,25 @@ def pstr(path: bytes) -> str:
def format_size(n: int) -> str: def format_size(n: int) -> str:
return humanfriendly.format_size(n, keep_width=True, binary=True) return humanfriendly.format_size(n, keep_width=True, binary=True)
# Type corresponding to patterns that are generated by
# wcmatch.translate: two lists of compiled REs (a,b). A path matches
# if it matches at least one regex in "a" and none in "b".
MatchPatterns = typing.Tuple[typing.List[re.Pattern], typing.List[re.Pattern]]
class Config: class Config:
roots: typing.List[bytes] roots: typing.List[bytes]
max_file_size: typing.Optional[int]
one_file_system: bool one_file_system: bool
exclude_caches: bool exclude_caches: bool
exclude: typing.List[bytes] exclude: MatchPatterns
force_include: typing.List[bytes] unexclude: MatchPatterns
max_size_rules: typing.List[typing.Tuple[int, MatchPatterns]]
notify_email: typing.Optional[str] notify_email: typing.Optional[str]
def __init__(self, configfile: str): def __init__(self, configfile: str):
# Read config
with open(configfile, 'r') as f:
config = yaml.safe_load(f)
self.one_file_system = config.get('one-file-system', False)
self.exclude_caches = config.get('exclude-caches', False)
if 'max-file-size' in config: # Helper to process lists of patterns into regexes
self.max_file_size = humanfriendly.parse_size( def process_match_list(config_entry):
config['max-file-size']) raw = config_entry.encode().split(b'\n')
else:
self.max_file_size = None
raw = config.get('roots', '').encode().split(b'\n')
self.roots = []
for x in raw:
if not len(x):
continue
self.roots.append(x)
self.roots.sort(key=len)
def process_match_list(config_name):
raw = config.get(config_name, '').encode().split(b'\n')
pats = [] pats = []
# Prepend '**/' to any relative patterns # Prepend '**/' to any relative patterns
for x in raw: for x in raw:
@@ -70,39 +57,49 @@ class Config:
pats.append(x) pats.append(x)
else: else:
pats.append(b'**/' + x) pats.append(b'**/' + x)
return pats
self.exclude = process_match_list('exclude') # Compile patterns.
self.force_include = process_match_list('force-include') (a, b) = wcmatch.glob.translate(
pats, flags=(wcmatch.glob.GLOBSTAR |
wcmatch.glob.DOTGLOB |
wcmatch.glob.NODOTDIR |
wcmatch.glob.EXTGLOB |
wcmatch.glob.BRACE))
return ([ re.compile(x) for x in a ],
[ re.compile(x) for x in b ])
# Read config
with open(configfile, 'r') as f:
config = yaml.safe_load(f)
self.one_file_system = config.get('one-file-system', False)
self.exclude_caches = config.get('exclude-caches', False)
raw = config.get('roots', '').encode().split(b'\n')
self.roots = []
for x in raw:
if not len(x):
continue
self.roots.append(x)
self.roots.sort(key=len)
self.exclude = process_match_list(config.get('exclude', ''))
self.unexclude = process_match_list(config.get('unexclude', ''))
self.max_size_rules = []
rules = { humanfriendly.parse_size(k): v
for k, v in config.get('max-size-rules', {}).items() }
for size in reversed(sorted(rules)):
self.max_size_rules.append(
(size, process_match_list(rules[size])))
self.notify_email = config.get('notify-email', None) self.notify_email = config.get('notify-email', None)
# Compile patterns def match_re(self, r: MatchPatterns, path: bytes):
flags = (wcmatch.glob.GLOBSTAR |
wcmatch.glob.DOTGLOB |
wcmatch.glob.NODOTDIR |
wcmatch.glob.EXTGLOB |
wcmatch.glob.BRACE)
# Path matches if it matches at least one regex in "a" and no
# regex in "b"
(a, b) = wcmatch.glob.translate(self.exclude, flags=flags)
self.exclude_re = ([ re.compile(x) for x in a ],
[ re.compile(x) for x in b ])
(a, b) = wcmatch.glob.translate(self.force_include, flags=flags)
self.force_include_re = ([ re.compile(x) for x in a ],
[ re.compile(x) for x in b ])
def match_re(self,
re: typing.Tuple[typing.List[typing.Pattern],
typing.List[typing.Pattern]],
path: bytes):
# Path matches if it matches at least one regex in # Path matches if it matches at least one regex in
# re[0] and no regex in re[1]. # r[0] and no regex in r[1].
for a in re[0]: for a in r[0]:
if a.match(path): if a.match(path):
for b in re[1]: for b in r[1]:
if b.match(path): if b.match(path):
return False return False
return True return True
@@ -166,7 +163,7 @@ class Backup:
# See if there's a reason to exclude it # See if there's a reason to exclude it
exclude_reason = None exclude_reason = None
if self.config.match_re(self.config.exclude_re, decorated_path): if self.config.match_re(self.config.exclude, decorated_path):
# Config file says to exclude # Config file says to exclude
exclude_reason = ('I', f"skipping, excluded by config file") exclude_reason = ('I', f"skipping, excluded by config file")
@@ -178,17 +175,24 @@
exclude_reason = ('I', "skipping, on different filesystem") exclude_reason = ('I', "skipping, on different filesystem")
elif (is_reg elif (is_reg
and self.config.max_file_size and len(self.config.max_size_rules)
and size > self.config.max_file_size): and size > self.config.max_size_rules[-1][0]):
# Too big # Check file sizes against our list.
a = format_size(size) # Only need to check if the size is bigger than the smallest
b = format_size(self.config.max_file_size) # entry on the list; then, we need to check it against all rules
exclude_reason = ('W', f"file size {a} exceeds limit {b}") # to see which one applies.
for (max_size, patterns) in self.config.max_size_rules:
if self.config.match_re(patterns, decorated_path):
if size > max_size:
a = format_size(size)
b = format_size(max_size)
exclude_reason = (
'W', f"file size {a} exceeds limit {b}")
break
# If we have a reason to exclude it, stop now unless it's # If we have a reason to exclude it, stop now unless it's
# force-included # force-included
force = self.config.match_re(self.config.force_include_re, force = self.config.match_re(self.config.unexclude, decorated_path)
decorated_path)
if exclude_reason and not force: if exclude_reason and not force:
self.log(exclude_reason[0], self.log(exclude_reason[0],
f"{exclude_reason[1]}: {pstr(path)}") f"{exclude_reason[1]}: {pstr(path)}")

View File

@@ -10,12 +10,6 @@ roots: |
one-file-system: true one-file-system: true
exclude-caches: true exclude-caches: true
# Files larger than this are excluded. If a large file isn't
# explicitly mentioned in "excludes" below, it also generates a
# warning. Note that this counts used blocks, so files with large
# holes will still be considered small (since they'll compress easily)
max-file-size: 500MiB
# Files/dirs to exclude from backup. # Files/dirs to exclude from backup.
# Relative paths are treated as if starting with **/ # Relative paths are treated as if starting with **/
# Paths ending in / will only match directories. # Paths ending in / will only match directories.
@@ -27,10 +21,21 @@ exclude: |
Steam/ubuntu*/ Steam/ubuntu*/
.cache/ .cache/
# Rules to exclude files based on file size.
# This is a dict of sizes, each with a list of rules.
# For a given path, the largest size with a matching rule applies.
# Matching follows the same behavior as the "exclude" list.
# Size is calculated as used blocks (think "du", not "du --apparent-size").
max-size-rules:
500 MiB: |
*
# 1.0 GiB: |
# *.mp4
# Files that are always included, even if they would have been # Files that are always included, even if they would have been
# excluded due to file size or the "exclude" list. # excluded due to file size or the "exclude" list.
# Matching rules are the same as above. # Matching follows the same behavior as the "exclude" list.
force-include: | unexclude: |
.git/objects/pack/*.pack .git/objects/pack/*.pack
# Email address for notification at end of backup # Email address for notification at end of backup