Browse Source

Implement filesystem scanning with configurable filters

master
Jim Paris 5 days ago
parent
commit
0039ca1ee0
7 changed files with 252 additions and 140 deletions
  1. +2
    -2
      Makefile
  2. +3
    -0
      Pipfile
  3. +59
    -1
      Pipfile.lock
  4. +2
    -2
      README.md
  5. +162
    -135
      backup.py
  6. +23
    -0
      config.yaml
  7. +1
    -0
      initial-setup.sh

+ 2
- 2
Makefile View File

@@ -10,7 +10,7 @@ all:
@echo

.PHONY: ctrl
ctrl: test-setup
ctrl: test-backup

.venv:
mkdir .venv
@@ -19,7 +19,7 @@ ctrl: test-setup
.PHONY: test-backup
test-backup: .venv
.venv/bin/mypy backup.py
./backup.py --max-size 1GiB --one-file-system /tmp | grep -a 'bigf'
./backup.py -n >/dev/null

.PHONY: test-setup
test-setup:


+ 3
- 0
Pipfile View File

@@ -5,9 +5,12 @@ name = "pypi"

[packages]
humanfriendly = "*"
wcmatch = "*"
pyyaml = "*"

[dev-packages]
mypy = "*"
types-pyyaml = "*"

[requires]
python_version = "3"

+ 59
- 1
Pipfile.lock View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "4f504c785e3ed5b203a82a5f40516507f80a01b8d1d0ad5a905f139cafc41a51"
"sha256": "902260ee06bc3bac3fe1ea87c09d4fc28e5aceef95635b3c72b43b6905050278"
},
"pipfile-spec": 6,
"requires": {
@@ -16,6 +16,13 @@
]
},
"default": {
"bracex": {
"hashes": [
"sha256:01f715cd0ed7a622ec8b32322e715813f7574de531f09b70f6f3b2c10f682425",
"sha256:64e2a6d14de9c8e022cf40539ac8468ba7c4b99550a2b05fc87fd20e392e568f"
],
"version": "==2.1.1"
},
"humanfriendly": {
"hashes": [
"sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477",
@@ -23,6 +30,49 @@
],
"index": "pypi",
"version": "==10.0"
},
"pyyaml": {
"hashes": [
"sha256:08682f6b72c722394747bddaf0aa62277e02557c0fd1c42cb853016a38f8dedf",
"sha256:0f5f5786c0e09baddcd8b4b45f20a7b5d61a7e7e99846e3c799b05c7c53fa696",
"sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393",
"sha256:294db365efa064d00b8d1ef65d8ea2c3426ac366c0c4368d930bf1c5fb497f77",
"sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922",
"sha256:3bd0e463264cf257d1ffd2e40223b197271046d09dadf73a0fe82b9c1fc385a5",
"sha256:4465124ef1b18d9ace298060f4eccc64b0850899ac4ac53294547536533800c8",
"sha256:49d4cdd9065b9b6e206d0595fee27a96b5dd22618e7520c33204a4a3239d5b10",
"sha256:4e0583d24c881e14342eaf4ec5fbc97f934b999a6828693a99157fde912540cc",
"sha256:5accb17103e43963b80e6f837831f38d314a0495500067cb25afab2e8d7a4018",
"sha256:607774cbba28732bfa802b54baa7484215f530991055bb562efbed5b2f20a45e",
"sha256:6c78645d400265a062508ae399b60b8c167bf003db364ecb26dcab2bda048253",
"sha256:72a01f726a9c7851ca9bfad6fd09ca4e090a023c00945ea05ba1638c09dc3347",
"sha256:74c1485f7707cf707a7aef42ef6322b8f97921bd89be2ab6317fd782c2d53183",
"sha256:895f61ef02e8fed38159bb70f7e100e00f471eae2bc838cd0f4ebb21e28f8541",
"sha256:8c1be557ee92a20f184922c7b6424e8ab6691788e6d86137c5d93c1a6ec1b8fb",
"sha256:bb4191dfc9306777bc594117aee052446b3fa88737cd13b7188d0e7aa8162185",
"sha256:bfb51918d4ff3d77c1c856a9699f8492c612cde32fd3bcd344af9be34999bfdc",
"sha256:c20cfa2d49991c8b4147af39859b167664f2ad4561704ee74c1de03318e898db",
"sha256:cb333c16912324fd5f769fff6bc5de372e9e7a202247b48870bc251ed40239aa",
"sha256:d2d9808ea7b4af864f35ea216be506ecec180628aced0704e34aca0b040ffe46",
"sha256:d483ad4e639292c90170eb6f7783ad19490e7a8defb3e46f97dfe4bacae89122",
"sha256:dd5de0646207f053eb0d6c74ae45ba98c3395a571a2891858e87df7c9b9bd51b",
"sha256:e1d4970ea66be07ae37a3c2e48b5ec63f7ba6804bdddfdbd3cfd954d25a82e63",
"sha256:e4fac90784481d221a8e4b1162afa7c47ed953be40d31ab4629ae917510051df",
"sha256:fa5ae20527d8e831e8230cbffd9f8fe952815b2b7dae6ffec25318803a7528fc",
"sha256:fd7f6999a8070df521b6384004ef42833b9bd62cfee11a09bda1079b4b704247",
"sha256:fdc842473cd33f45ff6bce46aea678a54e3d21f1b61a7750ce3c498eedfe25d6",
"sha256:fe69978f3f768926cfa37b867e3843918e012cf83f680806599ddce33c2c68b0"
],
"index": "pypi",
"version": "==5.4.1"
},
"wcmatch": {
"hashes": [
"sha256:4d54ddb506c90b5a5bba3a96a1cfb0bb07127909e19046a71d689ddfb18c3617",
"sha256:9146b1ab9354e0797ef6ef69bc89cb32cb9f46d1b9eeef69c559aeec8f3bffb6"
],
"index": "pypi",
"version": "==8.2"
}
},
"develop": {
@@ -69,6 +119,14 @@
],
"version": "==0.10.2"
},
"types-pyyaml": {
"hashes": [
"sha256:1d9e431e9f1f78a65ea957c558535a3b15ad67ea4912bce48a6c1b613dcf81ad",
"sha256:f1d1357168988e45fa20c65aecb3911462246a84809015dd889ebf8b1db74124"
],
"index": "pypi",
"version": "==5.4.10"
},
"typing-extensions": {
"hashes": [
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",


+ 2
- 2
README.md View File

@@ -6,7 +6,7 @@ Run on client:
sudo git clone https://git.jim.sh/jim/borg-setup.git /opt/borg
sudo /opt/borg/initial-setup.sh

Customize `/opt/borg/backup.yaml` as desired.
Customize `/opt/borg/config.yaml` as desired.



@@ -98,6 +98,6 @@ Design
/etc/systemd/system/borg-backup.timer -> /opt/borg/borg-backup.timer

- Backup script `/opt/borg/backup.py` uses configuration in
`/opt/borg/backup.yaml` to generate our own list of files, excluding
`/opt/borg/config.yaml` to generate our own list of files, excluding
anything that's too large by default. This requires borg 1.2.0b1
or newer, which is why the setup scripts download a specific version.

+ 162
- 135
backup.py View File

@@ -1,62 +1,106 @@
#!.venv/bin/python

# Scan filesystem to generate a list of files to back up, based on a
# configuration file. Pass this list to borg to actually create the
# backup. Execute a notification script on the remote server to
# report the backup status.

import os
import re
import sys
import stat
from typing import Optional, Tuple
import humanfriendly # type: ignore
import wcmatch.glob # type: ignore
import re
import dataclasses
import enum

class MatchResult(enum.Enum):
INCLUDE_IF_SIZE_OK = 0
INCLUDE_ALWAYS = 1
EXCLUDE_ALWAYS = 2

@dataclasses.dataclass
class PatternRule:
re_inc: list[re.Pattern]
re_exc: list[re.Pattern]

def match(self, path: str) -> Tuple[bool, bool]:
if "big" in path:
print(self, file=sys.stderr)

for inc in self.re_inc:
if inc.match(path):
break
else:
return

for exc in self.re_exc:
if exc.match(path):
return False
return True

class Lister:
def __init__(self, one_file_system: bool, max_size: bool):
self.one_file_system = one_file_system
self.max_size = max_size
if max_size is None:
max_size = float('inf')
self.stdout = os.fdopen(sys.stdout.fileno(), "wb", closefd=False)
import pathlib

# Remember files we've skipped because they were too big, so that
# we can warn again at the end.
self.skipped_size: set[bytes] = set()
import typing

# Remember errors
self.skipped_error: set[bytes] = set()
import yaml
import wcmatch.glob # type: ignore
import humanfriendly # type: ignore

def __del__(self):
self.stdout.close()
class Config:
    """Backup settings loaded from a YAML configuration file.

    Paths and glob patterns are kept as bytes so they can be matched
    directly against the bytes paths produced while scanning.
    """
    # Directory at which scanning starts (encoded from the YAML string)
    root: bytes
    # Size limit in bytes, or None when 'max-file-size' is absent
    max_file_size: typing.Optional[int]
    one_file_system: bool
    # Glob patterns, one per non-empty line of the YAML block
    exclude: list[bytes]
    force_include: list[bytes]
    notify_email: typing.Optional[str]

    def __init__(self, configfile: str):
        """Read `configfile` and compile its glob patterns.

        Raises whatever open() / yaml.safe_load() raise for an
        unreadable or malformed file, and KeyError if 'root' is missing.
        """
        # Read config
        with open(configfile, 'r') as f:
            config = yaml.safe_load(f)
        self.root = config['root'].encode()
        self.one_file_system = config.get('one-file-system', False)

        if 'max-file-size' in config:
            # str() lets a bare integer in the YAML (parsed as int) be
            # handled the same way as a string such as "500MiB".
            self.max_file_size = humanfriendly.parse_size(
                str(config['max-file-size']))
        else:
            self.max_file_size = None

        # A key that is present but empty loads as None, so fall back
        # to '' before encoding.
        self.exclude = self._pattern_lines(config.get('exclude'))
        self.force_include = self._pattern_lines(config.get('force-include'))

        self.notify_email = config.get('notify-email', None)

        # Compile patterns
        flags = (wcmatch.glob.GLOBSTAR |
                 wcmatch.glob.DOTGLOB |
                 wcmatch.glob.NODOTDIR |
                 wcmatch.glob.EXTGLOB |
                 wcmatch.glob.BRACE)

        # Path matches if it matches at least one regex in "a" and no
        # regex in "b"
        (a, b) = wcmatch.glob.translate(self.exclude, flags=flags)
        self.exclude_re = ([ re.compile(x) for x in a ],
                           [ re.compile(x) for x in b ])

        (a, b) = wcmatch.glob.translate(self.force_include, flags=flags)
        self.force_include_re = ([ re.compile(x) for x in a ],
                                 [ re.compile(x) for x in b ])

    @staticmethod
    def _pattern_lines(text: typing.Optional[str]) -> list[bytes]:
        """Split a multi-line YAML string into non-empty bytes lines."""
        encoded = (text or '').encode()
        return [ line for line in encoded.split(b'\n') if line ]

    def match_compiled(self, re: tuple[list[typing.Pattern],
                                       list[typing.Pattern]],
                       path: bytes) -> bool:
        """Return True if `path` matches at least one regex in re[0]
        and no regex in re[1]."""
        (includes, excludes) = re
        if not any(inc.match(path) for inc in includes):
            return False
        return not any(exc.match(path) for exc in excludes)

    def __str__(self):
        """Return the main settings as a YAML document.

        Bytes values are decoded with 'backslashreplace' so that
        undecodable bytes stay visible instead of raising.
        """
        d = { 'root': self.root.decode(errors='backslashreplace') }
        if self.max_file_size:
            d['max-file-size'] = self.max_file_size
        if self.exclude:
            utf = b'\n'.join(self.exclude)
            d['exclude'] = utf.decode(errors='backslashreplace')
        if self.force_include:
            utf = b'\n'.join(self.force_include)
            d['force-include'] = utf.decode(errors='backslashreplace')
        if self.notify_email:
            d['notify-email'] = self.notify_email
        return yaml.dump(d, default_flow_style=False)

class Backup:
def __init__(self, config: Config, dry_run: bool, out: typing.BinaryIO):
self.config = config
self.outfile = out
self.dry_run = dry_run

# All logged messages, with severity
self.logs: list[tuple[str, str]] = []

def out(self, path: bytes):
# Separate paths with '\0' so that Borg can split them cleanly; in
# dry-run mode use '\n' instead so the list is readable on stdout.
self.stdout.write(path + b'\0\n')
self.outfile.write(path + (b'\n' if self.dry_run else b'\0'))

def log(self, letter: str, msg: str):
colors = { 'E': 31, 'W': 33, 'I': 36 };
@@ -65,93 +109,78 @@ class Lister:
else:
c = 0
sys.stderr.write(f"\033[1;{c}m{letter}:\033[22m {msg}\033[0m\n")
self.logs.append((letter, msg))

def run(self):
self.scan(self.config.root)

def scan(self, path: bytes,
parent_st: os.stat_result=None,
rules: list[PatternRule]=[]):
def scan(self, path: bytes, parent_st: os.stat_result=None):
"""If the given path should be backed up, print it. If it's
a directory and its contents should be included, recurse."""
a directory and its contents should be included, recurse."""

# Copy the path in string form, for logging and pathspec
# parsing. Otherwise, we use bytes directly.
# Copy the path in string form, for logging. Otherwise, we use
# bytes directly.
pathstr = path.decode(errors='backslashreplace')

try:
# See if we match any rules
for r in rules:
if r.match(pathstr):
self.log('I', f"ignore {pathstr}")
return

# Stat the path
st = os.lstat(path)
is_dir = stat.S_ISDIR(st.st_mode)

if is_dir:
# Skip if it crosses a mount point
if self.one_file_system:
if parent_st is not None and st.st_dev != parent_st.st_dev:
self.log('I', f"skipping {pathstr}: "
"on different filesystem")
return

# Add contents of any .nobackup file to our
# parser rules
child_rules = rules

try:
def prepend_base(regex):
if regex[0] != '^':
raise Exception(f'bad regex: {regex}')
return '^' + os.path.join(pathstr, '') + regex[1:]
with open(os.path.join(path, b".nobackup")) as f:
rule = PatternRule([], [])
for line in f:
if line[0] == '#':
continue
(inc, exc) = wcmatch.glob.translate(
[ line.rstrip('\r\n') ],
flags=(wcmatch.glob.NEGATE |
wcmatch.glob.GLOBSTAR |
wcmatch.glob.DOTGLOB |
wcmatch.glob.EXTGLOB |
wcmatch.glob.BRACE))
for x in inc:
rule.re_inc.append(re.compile(prepend_base(x)))
for x in exc:
rule.re_exc.append(re.compile(prepend_base(x)))
child_rules.append(rule)
except FileNotFoundError:
pass

# Recurse and process each entry
# See if this path should be excluded or force-included

# Only stat the file when we need it
cached_st = None
def st():
nonlocal cached_st
if not cached_st:
cached_st = os.lstat(path)
return cached_st

# See if there's a reason to exclude it
exclude_reason = None

if self.config.match_compiled(self.config.exclude_re, path):
# Config file says to exclude
exclude_reason = ('I', f"skipping, excluded by config file")

elif (stat.S_ISDIR(st().st_mode)
and self.config.one_file_system
and parent_st is not None
and st().st_dev != parent_st.st_dev):
# Crosses a mount point
exclude_reason = ('I', "skipping, on different filesystem")

elif (stat.S_ISREG(st().st_mode)
and self.config.max_file_size
and st().st_size > self.config.max_file_size):
# Too big
def format_size(n):
return humanfriendly.format_size(
n, keep_width=True, binary=True)
a = format_size(st().st_size)
b = format_size(self.config.max_file_size)
exclude_reason = ('W', f"file size {a} exceeds limit {b}")

# If we have a reason to exclude it, stop now unless it's
# force-included
if (exclude_reason
and not self.config.match_compiled(
self.config.force_include_re, path)):

self.log(exclude_reason[0], f"{exclude_reason[1]}: {pathstr}")
return

# Print name of this path
self.out(path)

# If it's a directory, recurse
if stat.S_ISDIR(st().st_mode):
with os.scandir(path) as it:
for entry in it:
self.scan(entry.path, st, child_rules)

else:
# For regular files, ensure they're not too big
if stat.S_ISREG(st.st_mode) and st.st_size > self.max_size:
def format_size(n):
return humanfriendly.format_size(
n, keep_width=True, binary=True)
a = format_size(st.st_size)
b = format_size(self.max_size)
self.log('W', f"skipping {pathstr}: "
+ f"file size {a} exceeds limit {b}")
self.skipped_size.add(path)
return

# Every other filename gets printed; devices, symlinks, etc
# will get handled by Borg
self.out(path)
self.scan(path=entry.path, parent_st=st())

except PermissionError as e:
self.log('E', f"can't read {pathstr}")
self.skipped_error.add(path)
return

def main(argv):
def main(argv: list[str]):
import argparse

def humansize(string):
@@ -159,21 +188,19 @@ def main(argv):

parser = argparse.ArgumentParser(
prog=argv[0],
description="Build up a directory and file list for backups")
description="Back up the local system using borg",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument('-s', '--max-size', type=humansize,
help="Ignore files bigger than this, by default")
parser.add_argument('-x', '--one-file-system', action='store_true',
help="Don't cross mount points when recursing")
parser.add_argument('dirs', metavar='DIR', nargs='+',
help="Root directories to scan recursively")
default_config = str(pathlib.Path(__file__).parent / "config.yaml")
parser.add_argument('-c', '--config',
help="Config file", default=default_config)
parser.add_argument('-n', '--dry-run', action="store_true",
help="Just print filenames, don't run borg")

args = parser.parse_args()

lister = Lister(one_file_system=args.one_file_system,
max_size=args.max_size)
for p in args.dirs:
lister.scan(os.fsencode(p))
config = Config(args.config)
backup = Backup(config, args.dry_run, sys.stdout.buffer)
backup.run()

if __name__ == "__main__":
import sys


+ 23
- 0
config.yaml View File

@@ -0,0 +1,23 @@
root: "/tmp"
one-file-system: true

# Files larger than this are excluded.  If a large file isn't
# explicitly listed in "exclude" below, skipping it also generates a
# warning.
max-file-size: 500MiB

# Files/dirs to exclude from backup.
# Paths should be absolute, or start with **/
exclude: |
**/Steam/steamapps
**/Steam/ubuntu*
/tmp/bigfile
/tmp/out.ps

# Files that are always included, even if they would have been
# excluded due to file size or the "exclude" list.
# Paths should be absolute, or start with **/
force-include: |

# Email address for notification at end of backup
notify-email: jim@jim.sh

+ 1
- 0
initial-setup.sh View File

@@ -192,6 +192,7 @@ EOF
run_ssh_command "if cmp -s $backup $keys; then rm $backup ; fi"
run_ssh_command "cat >> .ssh/authorized_keys" <<EOF
command="$cmd --append-only",restrict $(cat "$SSH/id_ecdsa_appendonly.pub")
command="borg/notify.sh",restrict $(cat "$SSH/id_ecdsa_appendonly.pub")
command="$cmd",restrict $(cat "$SSH/id_ecdsa.pub")
EOF



Loading…
Cancel
Save