diff --git a/Makefile b/Makefile
index 6f3d2ff..26ac7f9 100644
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@ all:
 	@echo
 
 .PHONY: ctrl
-ctrl: test-setup
+ctrl: test-backup
 
 .venv:
 	mkdir .venv
@@ -19,7 +19,7 @@ ctrl: test-setup
 .PHONY: test-backup
 test-backup: .venv
 	.venv/bin/mypy backup.py
-	./backup.py --max-size 1GiB --one-file-system /tmp | grep -a 'bigf'
+	./backup.py -n >/dev/null
 
 .PHONY: test-setup
 test-setup:
diff --git a/Pipfile b/Pipfile
index 2edfce6..5d861ec 100644
--- a/Pipfile
+++ b/Pipfile
@@ -5,9 +5,12 @@ name = "pypi"
 
 [packages]
 humanfriendly = "*"
+wcmatch = "*"
+pyyaml = "*"
 
 [dev-packages]
 mypy = "*"
+types-pyyaml = "*"
 
 [requires]
 python_version = "3"
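For context on the new dependencies: pyyaml parses the config file introduced later in this patch, wcmatch compiles its glob patterns (bracex appears in the lock file only as wcmatch's own dependency), and types-pyyaml provides the mypy stubs that the Makefile's type check needs. A short illustration of the pyyaml side — a sketch, not part of the patch, assuming the sample config.yaml from this commit sits in the current directory:

    import yaml

    # yaml.safe_load() returns plain dicts/strings/bools;
    # Config.__init__ in backup.py does the rest of the validation
    with open("config.yaml") as f:
        config = yaml.safe_load(f)

    print(config["root"])                        # "/tmp"
    print(config.get("one-file-system", False))  # True
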
"sha256:fdc842473cd33f45ff6bce46aea678a54e3d21f1b61a7750ce3c498eedfe25d6", + "sha256:fe69978f3f768926cfa37b867e3843918e012cf83f680806599ddce33c2c68b0" + ], + "index": "pypi", + "version": "==5.4.1" + }, + "wcmatch": { + "hashes": [ + "sha256:4d54ddb506c90b5a5bba3a96a1cfb0bb07127909e19046a71d689ddfb18c3617", + "sha256:9146b1ab9354e0797ef6ef69bc89cb32cb9f46d1b9eeef69c559aeec8f3bffb6" + ], + "index": "pypi", + "version": "==8.2" } }, "develop": { @@ -69,6 +119,14 @@ ], "version": "==0.10.2" }, + "types-pyyaml": { + "hashes": [ + "sha256:1d9e431e9f1f78a65ea957c558535a3b15ad67ea4912bce48a6c1b613dcf81ad", + "sha256:f1d1357168988e45fa20c65aecb3911462246a84809015dd889ebf8b1db74124" + ], + "index": "pypi", + "version": "==5.4.10" + }, "typing-extensions": { "hashes": [ "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e", diff --git a/README.md b/README.md index d19b608..43cae98 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Run on client: sudo git clone https://git.jim.sh/jim/borg-setup.git /opt/borg sudo /opt/borg/initial-setup.sh -Customize `/opt/borg/backup.yaml` as desired. +Customize `/opt/borg/config.yaml` as desired. @@ -98,6 +98,6 @@ Design /etc/systemd/system/borg-backup.timer -> /opt/borg/borg-backup.timer - Backup script `/opt/borg/backup.py` uses configuration in - `/opt/borg/backup.yaml` to generate our own list of files, excluding + `/opt/borg/config.yaml` to generate our own list of files, excluding anything that's too large by default. This requires borg 1.2.0b1 or newer, which is why the setup scripts download a specific version. diff --git a/backup.py b/backup.py index c1ae36f..56b97d9 100755 --- a/backup.py +++ b/backup.py @@ -1,62 +1,106 @@ #!.venv/bin/python +# Scan filesystem to generate a list of files to back up, based on a +# configuration file. Pass this list to borg to actually create the +# backup. Execute a notification script on the remote server to +# report the backup status. + import os +import re import sys import stat -from typing import Optional, Tuple -import humanfriendly # type: ignore -import wcmatch.glob # type: ignore -import re -import dataclasses -import enum - -class MatchResult(enum.Enum): - INCLUDE_IF_SIZE_OK = 0 - INCLUDE_ALWAYS = 1 - EXCLUDE_ALWAYS = 2 - -@dataclasses.dataclass -class PatternRule: - re_inc: list[re.Pattern] - re_exc: list[re.Pattern] - - def match(self, path: str) -> Tuple[bool, bool]: - if "big" in path: - print(self, file=sys.stderr) - - for inc in self.re_inc: - if inc.match(path): - break - else: - return - - for exc in self.re_exc: - if exc.match(path): - return False - return True - -class Lister: - def __init__(self, one_file_system: bool, max_size: bool): - self.one_file_system = one_file_system - self.max_size = max_size - if max_size is None: - max_size = float('inf') - self.stdout = os.fdopen(sys.stdout.fileno(), "wb", closefd=False) +import pathlib - # Remember files we've skipped because they were too big, so that - # we can warn again at the end. 
diff --git a/backup.py b/backup.py
index c1ae36f..56b97d9 100755
--- a/backup.py
+++ b/backup.py
@@ -1,62 +1,106 @@
 #!.venv/bin/python
 
+# Scan filesystem to generate a list of files to back up, based on a
+# configuration file.  Pass this list to borg to actually create the
+# backup.  Execute a notification script on the remote server to
+# report the backup status.
+
 import os
+import re
 import sys
 import stat
-from typing import Optional, Tuple
-import humanfriendly # type: ignore
-import wcmatch.glob # type: ignore
-import re
-import dataclasses
-import enum
-
-class MatchResult(enum.Enum):
-    INCLUDE_IF_SIZE_OK = 0
-    INCLUDE_ALWAYS = 1
-    EXCLUDE_ALWAYS = 2
-
-@dataclasses.dataclass
-class PatternRule:
-    re_inc: list[re.Pattern]
-    re_exc: list[re.Pattern]
-
-    def match(self, path: str) -> Tuple[bool, bool]:
-        if "big" in path:
-            print(self, file=sys.stderr)
-
-        for inc in self.re_inc:
-            if inc.match(path):
-                break
-        else:
-            return
-
-        for exc in self.re_exc:
-            if exc.match(path):
-                return False
-        return True
-
-class Lister:
-    def __init__(self, one_file_system: bool, max_size: bool):
-        self.one_file_system = one_file_system
-        self.max_size = max_size
-        if max_size is None:
-            max_size = float('inf')
-        self.stdout = os.fdopen(sys.stdout.fileno(), "wb", closefd=False)
+import pathlib
 
-        # Remember files we've skipped because they were too big, so that
-        # we can warn again at the end.
-        self.skipped_size: set[bytes] = set()
+import typing
 
-        # Remember errors
-        self.skipped_error: set[bytes] = set()
+import yaml
+import wcmatch.glob # type: ignore
+import humanfriendly # type: ignore
 
-    def __del__(self):
-        self.stdout.close()
+class Config:
+    root: bytes
+    max_file_size: typing.Optional[int]
+    one_file_system: bool
+    exclude: list[bytes]
+    force_include: list[bytes]
+    notify_email: typing.Optional[str]
+
+    def __init__(self, configfile: str):
+        # Read config
+        with open(configfile, 'r') as f:
+            config = yaml.safe_load(f)
+        self.root = config['root'].encode()
+        self.one_file_system = config.get('one-file-system', False)
+
+        if 'max-file-size' in config:
+            self.max_file_size = humanfriendly.parse_size(
+                config['max-file-size'])
+        else:
+            self.max_file_size = None
+
+        utf = config.get('exclude', '').encode()
+        self.exclude = list(filter(len, utf.split(b'\n')))
+
+        utf = config.get('force-include', '').encode()
+        self.force_include = list(filter(len, utf.split(b'\n')))
+
+        self.notify_email = config.get('notify-email', None)
+
+        # Compile patterns
+        flags = (wcmatch.glob.GLOBSTAR |
+                 wcmatch.glob.DOTGLOB |
+                 wcmatch.glob.NODOTDIR |
+                 wcmatch.glob.EXTGLOB |
+                 wcmatch.glob.BRACE)
+
+        # Path matches if it matches at least one regex in "a" and no
+        # regex in "b"
+        (a, b) = wcmatch.glob.translate(self.exclude, flags=flags)
+        self.exclude_re = ([ re.compile(x) for x in a ],
+                           [ re.compile(x) for x in b ])
+
+        (a, b) = wcmatch.glob.translate(self.force_include, flags=flags)
+        self.force_include_re = ([ re.compile(x) for x in a ],
+                                 [ re.compile(x) for x in b ])
+
+    def match_compiled(self, regexes: tuple[list[typing.Pattern],
+                                            list[typing.Pattern]],
+                       path: bytes) -> bool:
+        # Path matches if it matches at least one regex in
+        # regexes[0] and no regex in regexes[1]
+        for a in regexes[0]:
+            if a.match(path):
+                for b in regexes[1]:
+                    if b.match(path):
+                        return False
+                return True
+        return False
+
+    def __str__(self):
+        d: dict = { 'root': self.root.decode(errors='backslashreplace') }
+        if self.max_file_size:
+            d['max-file-size'] = self.max_file_size
+        if self.exclude:
+            utf = b'\n'.join(self.exclude)
+            d['exclude'] = utf.decode(errors='backslashreplace')
+        if self.force_include:
+            utf = b'\n'.join(self.force_include)
+            d['force-include'] = utf.decode(errors='backslashreplace')
+        if self.notify_email:
+            d['notify-email'] = self.notify_email
+        return yaml.dump(d, default_flow_style=False)
+
+class Backup:
+    def __init__(self, config: Config, dry_run: bool, out: typing.BinaryIO):
+        self.config = config
+        self.outfile = out
+        self.dry_run = dry_run
+
+        # All logged messages, with severity
+        self.logs: list[tuple[str, str]] = []
 
     def out(self, path: bytes):
-        # Use '\0\n' as a separator, so that we can both separate it
-        # cleanly in Borg, and also view it on stdout.
-        self.stdout.write(path + b'\0\n')
+        self.outfile.write(path + (b'\n' if self.dry_run else b'\0'))
 
     def log(self, letter: str, msg: str):
         colors = { 'E': 31, 'W': 33, 'I': 36 };
@@ -65,93 +109,78 @@ class Lister:
         else:
             c = 0
         sys.stderr.write(f"\033[1;{c}m{letter}:\033[22m {msg}\033[0m\n")
+        self.logs.append((letter, msg))
+
+    def run(self):
+        self.scan(self.config.root)
 
-    def scan(self, path: bytes,
-             parent_st: os.stat_result=None,
-             rules: list[PatternRule]=[]):
+    def scan(self, path: bytes,
+             parent_st: typing.Optional[os.stat_result]=None):
         """If the given path should be backed up, print it.  If it's
-        a directory and its contents should be included, recurse."""
+        a directory and its contents should be included, recurse."""
 
-        # Copy the path in string form, for logging and pathspec
-        # parsing.  Otherwise, we use bytes directly.
+        # Copy the path in string form, for logging.  Otherwise, we use
+        # bytes directly.
         pathstr = path.decode(errors='backslashreplace')
 
         try:
-            # See if we match any rules
-            for r in rules:
-                if r.match(pathstr):
-                    self.log('I', f"ignore {pathstr}")
-                    return
-
-            # Stat the path
-            st = os.lstat(path)
-            is_dir = stat.S_ISDIR(st.st_mode)
-
-            if is_dir:
-                # Skip if it crosses a mount point
-                if self.one_file_system:
-                    if parent_st is not None and st.st_dev != parent_st.st_dev:
-                        self.log('I', f"skipping {pathstr}: "
-                                 "on different filesystem")
-                        return
-
-                # Add contents of any .nobackup file to our
-                # parser rules
-                child_rules = rules
-
-                try:
-                    def prepend_base(regex):
-                        if regex[0] != '^':
-                            raise Exception(f'bad regex: {regex}')
-                        return '^' + os.path.join(pathstr, '') + regex[1:]
-                    with open(os.path.join(path, b".nobackup")) as f:
-                        rule = PatternRule([], [])
-                        for line in f:
-                            if line[0] == '#':
-                                continue
-                            (inc, exc) = wcmatch.glob.translate(
-                                [ line.rstrip('\r\n') ],
-                                flags=(wcmatch.glob.NEGATE |
-                                       wcmatch.glob.GLOBSTAR |
-                                       wcmatch.glob.DOTGLOB |
-                                       wcmatch.glob.EXTGLOB |
-                                       wcmatch.glob.BRACE))
-                            for x in inc:
-                                rule.re_inc.append(re.compile(prepend_base(x)))
-                            for x in exc:
-                                rule.re_exc.append(re.compile(prepend_base(x)))
-                    child_rules.append(rule)
-                except FileNotFoundError:
-                    pass
-
-                # Recurse and process each entry
+            # See if this path should be excluded or force-included
+
+            # Only stat the file when we need it
+            cached_st = None
+            def st():
+                nonlocal cached_st
+                if not cached_st:
+                    cached_st = os.lstat(path)
+                return cached_st
+
+            # See if there's a reason to exclude it
+            exclude_reason = None
+
+            if self.config.match_compiled(self.config.exclude_re, path):
+                # Config file says to exclude
+                exclude_reason = ('I', f"skipping, excluded by config file")
+
+            elif (stat.S_ISDIR(st().st_mode)
+                  and self.config.one_file_system
+                  and parent_st is not None
+                  and st().st_dev != parent_st.st_dev):
+                # Crosses a mount point
+                exclude_reason = ('I', "skipping, on different filesystem")
+
+            elif (stat.S_ISREG(st().st_mode)
+                  and self.config.max_file_size
+                  and st().st_size > self.config.max_file_size):
+                # Too big
+                def format_size(n):
+                    return humanfriendly.format_size(
+                        n, keep_width=True, binary=True)
+                a = format_size(st().st_size)
+                b = format_size(self.config.max_file_size)
+                exclude_reason = ('W', f"file size {a} exceeds limit {b}")
+
+            # If we have a reason to exclude it, stop now unless it's
+            # force-included
+            if (exclude_reason
+                and not self.config.match_compiled(
+                    self.config.force_include_re, path)):
+
+                self.log(exclude_reason[0], f"{exclude_reason[1]}: {pathstr}")
+                return
+
+            # Print name of this path
+            self.out(path)
+
+            # If it's a directory, recurse
+            if stat.S_ISDIR(st().st_mode):
                 with os.scandir(path) as it:
                     for entry in it:
-                        self.scan(entry.path, st, child_rules)
-
-            else:
-                # For regular files, ensure they're not too big
-                if stat.S_ISREG(st.st_mode) and st.st_size > self.max_size:
-                    def format_size(n):
-                        return humanfriendly.format_size(
-                            n, keep_width=True, binary=True)
-                    a = format_size(st.st_size)
-                    b = format_size(self.max_size)
-                    self.log('W', f"skipping {pathstr}: "
-                             + f"file size {a} exceeds limit {b}")
-                    self.skipped_size.add(path)
-                    return
-
-            # Every other filename gets printed; devices, symlinks, etc
-            # will get handled by Borg
-            self.out(path)
+                        self.scan(path=entry.path, parent_st=st())
 
         except PermissionError as e:
             self.log('E', f"can't read {pathstr}")
-            self.skipped_error.add(path)
             return
 
-def main(argv):
+def main(argv: list[str]):
     import argparse
 
     def humansize(string):
@@ -159,21 +188,19 @@ def main(argv):
 
     parser = argparse.ArgumentParser(
         prog=argv[0],
-        description="Build up a directory and file list for backups")
+        description="Back up the local system using borg",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
-    parser.add_argument('-s', '--max-size', type=humansize,
-                        help="Ignore files bigger than this, by default")
-    parser.add_argument('-x', '--one-file-system', action='store_true',
-                        help="Don't cross mount points when recursing")
-    parser.add_argument('dirs', metavar='DIR', nargs='+',
-                        help="Root directories to scan recursively")
+    default_config = str(pathlib.Path(__file__).parent / "config.yaml")
+    parser.add_argument('-c', '--config',
+                        help="Config file", default=default_config)
+    parser.add_argument('-n', '--dry-run', action="store_true",
+                        help="Just print filenames, don't run borg")
 
     args = parser.parse_args()
-
-    lister = Lister(one_file_system=args.one_file_system,
-                    max_size=args.max_size)
-    for p in args.dirs:
-        lister.scan(os.fsencode(p))
+    config = Config(args.config)
+    backup = Backup(config, args.dry_run, sys.stdout.buffer)
+    backup.run()
 
 if __name__ == "__main__":
     import sys
diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000..4b15783
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,23 @@
+root: "/tmp"
+one-file-system: true
+
+# Files larger than this are excluded.  If a large file isn't
+# explicitly mentioned in "exclude" below, it also generates a
+# warning.
+max-file-size: 500MiB
+
+# Files/dirs to exclude from backup.
+# Paths should be absolute, or start with **/
+exclude: |
+  **/Steam/steamapps
+  **/Steam/ubuntu*
+  /tmp/bigfile
+  /tmp/out.ps
+
+# Files that are always included, even if they would have been
+# excluded due to file size or the "exclude" list.
+# Paths should be absolute, or start with **/
+force-include: |
+
+# Email address for notification at end of backup
+notify-email: jim@jim.sh
diff --git a/initial-setup.sh b/initial-setup.sh
index 64e0622..c6c6270 100755
--- a/initial-setup.sh
+++ b/initial-setup.sh
@@ -192,6 +192,7 @@ EOF
 run_ssh_command "if cmp -s $backup $keys; then rm $backup ; fi"
 
 run_ssh_command "cat >> .ssh/authorized_keys" <
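To make the new config semantics concrete, here is a small illustrative script (mine, not part of the patch) that applies the same wcmatch translation Config.__init__ uses to one entry of the sample exclude list, plus the max-file-size parsing; the test paths are invented:

    import re
    import humanfriendly
    import wcmatch.glob

    flags = (wcmatch.glob.GLOBSTAR | wcmatch.glob.DOTGLOB |
             wcmatch.glob.NODOTDIR | wcmatch.glob.EXTGLOB |
             wcmatch.glob.BRACE)

    # translate() returns (inclusion, exclusion) regex lists; a path
    # matches when it hits at least one inclusion and no exclusion.
    (inc, exc) = wcmatch.glob.translate([b'**/Steam/steamapps'], flags=flags)
    inc_re = [re.compile(x) for x in inc]
    exc_re = [re.compile(x) for x in exc]

    def matched(path: bytes) -> bool:
        return (any(r.match(path) for r in inc_re)
                and not any(r.match(path) for r in exc_re))

    print(matched(b'/home/jim/.steam/Steam/steamapps'))  # expect True
    print(matched(b'/home/jim/documents'))               # expect False

    # "500MiB" in config.yaml is parsed as a binary (1024-based) size
    print(humanfriendly.parse_size("500MiB"))            # expect 524288000
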
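The header comment added to backup.py promises that a notification script on the remote server reports the backup status, and config.yaml carries a notify-email address for it, but that code isn't shown in this patch. Purely as a hypothetical sketch of how the collected Backup.logs could feed such a report — the mail(1) invocation and the helper name are assumptions, not part of this commit:

    import subprocess

    def notify(email: str, logs: list[tuple[str, str]]) -> None:
        # One "letter: message" line per log entry, as recorded by
        # Backup.log()
        body = "\n".join(f"{letter}: {msg}" for (letter, msg) in logs)
        subprocess.run(["mail", "-s", "backup report", email],
                       input=body.encode())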