Compare commits

...

2 Commits

Author SHA1 Message Date
4a30b82e39 backup: replace simple max size with rule-based system
Now individual files or patterns can have their own maximum sizes.
2021-10-18 17:43:33 -04:00
ac12b42cad backup: rename force-include to unexclude
Force-include is a misnomer because it won't include files
that weren't considered at all (like files in an excluded subdir).
Instead, call it "unexclude" to make it slightly clearer that this
will just override the exclusions.
2021-10-18 16:25:23 -04:00
2 changed files with 77 additions and 68 deletions

124
backup.py
View File

@ -29,38 +29,25 @@ def pstr(path: bytes) -> str:
def format_size(n: int) -> str:
return humanfriendly.format_size(n, keep_width=True, binary=True)
# Type corresponding to patterns that are generated by
# wcmatch.glob.translate: two lists of compiled REs (a,b). A path matches
# if it matches at least one regex in "a" and none in "b".
MatchPatterns = typing.Tuple[typing.List[re.Pattern], typing.List[re.Pattern]]
class Config:
roots: typing.List[bytes]
max_file_size: typing.Optional[int]
one_file_system: bool
exclude_caches: bool
exclude: typing.List[bytes]
force_include: typing.List[bytes]
exclude: MatchPatterns
unexclude: MatchPatterns
max_size_rules: typing.List[typing.Tuple[int, MatchPatterns]]
notify_email: typing.Optional[str]
def __init__(self, configfile: str):
# Read config
with open(configfile, 'r') as f:
config = yaml.safe_load(f)
self.one_file_system = config.get('one-file-system', False)
self.exclude_caches = config.get('exclude-caches', False)
if 'max-file-size' in config:
self.max_file_size = humanfriendly.parse_size(
config['max-file-size'])
else:
self.max_file_size = None
raw = config.get('roots', '').encode().split(b'\n')
self.roots = []
for x in raw:
if not len(x):
continue
self.roots.append(x)
self.roots.sort(key=len)
def process_match_list(config_name):
raw = config.get(config_name, '').encode().split(b'\n')
# Helper to process lists of patterns into regexes
def process_match_list(config_entry):
raw = config_entry.encode().split(b'\n')
pats = []
# Prepend '**/' to any relative patterns
for x in raw:
@ -70,39 +57,49 @@ class Config:
pats.append(x)
else:
pats.append(b'**/' + x)
return pats
self.exclude = process_match_list('exclude')
self.force_include = process_match_list('force-include')
# Compile patterns.
(a, b) = wcmatch.glob.translate(
pats, flags=(wcmatch.glob.GLOBSTAR |
wcmatch.glob.DOTGLOB |
wcmatch.glob.NODOTDIR |
wcmatch.glob.EXTGLOB |
wcmatch.glob.BRACE))
return ([ re.compile(x) for x in a ],
[ re.compile(x) for x in b ])
# Read config
with open(configfile, 'r') as f:
config = yaml.safe_load(f)
self.one_file_system = config.get('one-file-system', False)
self.exclude_caches = config.get('exclude-caches', False)
raw = config.get('roots', '').encode().split(b'\n')
self.roots = []
for x in raw:
if not len(x):
continue
self.roots.append(x)
self.roots.sort(key=len)
self.exclude = process_match_list(config.get('exclude', ''))
self.unexclude = process_match_list(config.get('unexclude', ''))
self.max_size_rules = []
rules = { humanfriendly.parse_size(k): v
for k, v in config.get('max-size-rules', {}).items() }
for size in reversed(sorted(rules)):
self.max_size_rules.append(
(size, process_match_list(rules[size])))
self.notify_email = config.get('notify-email', None)
# Compile patterns
flags = (wcmatch.glob.GLOBSTAR |
wcmatch.glob.DOTGLOB |
wcmatch.glob.NODOTDIR |
wcmatch.glob.EXTGLOB |
wcmatch.glob.BRACE)
# Path matches if it matches at least one regex in "a" and no
# regex in "b"
(a, b) = wcmatch.glob.translate(self.exclude, flags=flags)
self.exclude_re = ([ re.compile(x) for x in a ],
[ re.compile(x) for x in b ])
(a, b) = wcmatch.glob.translate(self.force_include, flags=flags)
self.force_include_re = ([ re.compile(x) for x in a ],
[ re.compile(x) for x in b ])
def match_re(self,
re: typing.Tuple[typing.List[typing.Pattern],
typing.List[typing.Pattern]],
path: bytes):
def match_re(self, r: MatchPatterns, path: bytes):
# Path matches if it matches at least one regex in
# re[0] and no regex in re[1].
for a in re[0]:
# r[0] and no regex in r[1].
for a in r[0]:
if a.match(path):
for b in re[1]:
for b in r[1]:
if b.match(path):
return False
return True
@ -166,7 +163,7 @@ class Backup:
# See if there's a reason to exclude it
exclude_reason = None
if self.config.match_re(self.config.exclude_re, decorated_path):
if self.config.match_re(self.config.exclude, decorated_path):
# Config file says to exclude
exclude_reason = ('I', f"skipping, excluded by config file")
@ -178,17 +175,24 @@ class Backup:
exclude_reason = ('I', "skipping, on different filesystem")
elif (is_reg
and self.config.max_file_size
and size > self.config.max_file_size):
# Too big
a = format_size(size)
b = format_size(self.config.max_file_size)
exclude_reason = ('W', f"file size {a} exceeds limit {b}")
and len(self.config.max_size_rules)
and size > self.config.max_size_rules[-1][0]):
# Check file sizes against our list.
# Only need to check if the size is bigger than the smallest
# entry on the list; then, we need to check it against all rules
# to see which one applies.
for (max_size, patterns) in self.config.max_size_rules:
if self.config.match_re(patterns, decorated_path):
if size > max_size:
a = format_size(size)
b = format_size(max_size)
exclude_reason = (
'W', f"file size {a} exceeds limit {b}")
break
# If we have a reason to exclude it, stop now unless it's
# unexcluded
force = self.config.match_re(self.config.force_include_re,
decorated_path)
force = self.config.match_re(self.config.unexclude, decorated_path)
if exclude_reason and not force:
self.log(exclude_reason[0],
f"{exclude_reason[1]}: {pstr(path)}")

View File

@ -10,12 +10,6 @@ roots: |
one-file-system: true
exclude-caches: true
# Files larger than this are excluded. If a large file isn't
# explicitly mentioned in "excludes" below, it also generates a
# warning. Note that this counts used blocks, so files with large
# holes will still be considered small (since they'll compress easily)
max-file-size: 500MiB
# Files/dirs to exclude from backup.
# Relative paths are treated as if starting with **/
# Paths ending in / will only match directories.
@ -27,10 +21,21 @@ exclude: |
Steam/ubuntu*/
.cache/
# Rules to exclude files based on file size.
# This is a dict of sizes, each with a list of rules.
# For a given path, the largest size with a matching rule applies.
# Matching follows the same behavior as the "exclude" list.
# Size is calculated as used blocks (think "du", not "du --apparent-size").
max-size-rules:
500 MiB: |
*
# 1.0 GiB: |
# *.mp4
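# Files that are never excluded, even if they would otherwise have been
# excluded due to file size or the "exclude" list. This only overrides
# exclusions: files that were never considered at all (like files in an
# excluded subdir) are still not included.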
# Matching rules are the same as above.
force-include: |
# Matching follows the same behavior as the "exclude" list.
unexclude: |
.git/objects/pack/*.pack
# Email address for notification at end of backup