
Implement filesystem scanning with configurable filters

master
Jim Paris committed 2 years ago
commit 0039ca1ee0
7 changed files with 252 additions and 140 deletions:

  1. Makefile (+2, -2)
  2. Pipfile (+3, -0)
  3. Pipfile.lock (+59, -1)
  4. README.md (+2, -2)
  5. backup.py (+162, -135)
  6. config.yaml (+23, -0)
  7. initial-setup.sh (+1, -0)

Makefile (+2, -2)

@@ -10,7 +10,7 @@ all:
 	@echo
 
 .PHONY: ctrl
-ctrl: test-setup
+ctrl: test-backup
 
 .venv:
 	mkdir .venv
@@ -19,7 +19,7 @@ ctrl: test-setup
 .PHONY: test-backup
 test-backup: .venv
 	.venv/bin/mypy backup.py
-	./backup.py --max-size 1GiB --one-file-system /tmp | grep -a 'bigf'
+	./backup.py -n >/dev/null
 
 .PHONY: test-setup
 test-setup:


Pipfile (+3, -0)

@@ -5,9 +5,12 @@ name = "pypi"


 [packages]
 humanfriendly = "*"
+wcmatch = "*"
+pyyaml = "*"
 
 [dev-packages]
 mypy = "*"
+types-pyyaml = "*"
 
 [requires]
 python_version = "3"

Pipfile.lock (+59, -1)

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "4f504c785e3ed5b203a82a5f40516507f80a01b8d1d0ad5a905f139cafc41a51"
+            "sha256": "902260ee06bc3bac3fe1ea87c09d4fc28e5aceef95635b3c72b43b6905050278"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -16,6 +16,13 @@
         ]
     },
     "default": {
+        "bracex": {
+            "hashes": [
+                "sha256:01f715cd0ed7a622ec8b32322e715813f7574de531f09b70f6f3b2c10f682425",
+                "sha256:64e2a6d14de9c8e022cf40539ac8468ba7c4b99550a2b05fc87fd20e392e568f"
+            ],
+            "version": "==2.1.1"
+        },
         "humanfriendly": {
             "hashes": [
                 "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477",
@@ -23,6 +30,49 @@
],
"index": "pypi",
"version": "==10.0"
},
"pyyaml": {
"hashes": [
"sha256:08682f6b72c722394747bddaf0aa62277e02557c0fd1c42cb853016a38f8dedf",
"sha256:0f5f5786c0e09baddcd8b4b45f20a7b5d61a7e7e99846e3c799b05c7c53fa696",
"sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393",
"sha256:294db365efa064d00b8d1ef65d8ea2c3426ac366c0c4368d930bf1c5fb497f77",
"sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922",
"sha256:3bd0e463264cf257d1ffd2e40223b197271046d09dadf73a0fe82b9c1fc385a5",
"sha256:4465124ef1b18d9ace298060f4eccc64b0850899ac4ac53294547536533800c8",
"sha256:49d4cdd9065b9b6e206d0595fee27a96b5dd22618e7520c33204a4a3239d5b10",
"sha256:4e0583d24c881e14342eaf4ec5fbc97f934b999a6828693a99157fde912540cc",
"sha256:5accb17103e43963b80e6f837831f38d314a0495500067cb25afab2e8d7a4018",
"sha256:607774cbba28732bfa802b54baa7484215f530991055bb562efbed5b2f20a45e",
"sha256:6c78645d400265a062508ae399b60b8c167bf003db364ecb26dcab2bda048253",
"sha256:72a01f726a9c7851ca9bfad6fd09ca4e090a023c00945ea05ba1638c09dc3347",
"sha256:74c1485f7707cf707a7aef42ef6322b8f97921bd89be2ab6317fd782c2d53183",
"sha256:895f61ef02e8fed38159bb70f7e100e00f471eae2bc838cd0f4ebb21e28f8541",
"sha256:8c1be557ee92a20f184922c7b6424e8ab6691788e6d86137c5d93c1a6ec1b8fb",
"sha256:bb4191dfc9306777bc594117aee052446b3fa88737cd13b7188d0e7aa8162185",
"sha256:bfb51918d4ff3d77c1c856a9699f8492c612cde32fd3bcd344af9be34999bfdc",
"sha256:c20cfa2d49991c8b4147af39859b167664f2ad4561704ee74c1de03318e898db",
"sha256:cb333c16912324fd5f769fff6bc5de372e9e7a202247b48870bc251ed40239aa",
"sha256:d2d9808ea7b4af864f35ea216be506ecec180628aced0704e34aca0b040ffe46",
"sha256:d483ad4e639292c90170eb6f7783ad19490e7a8defb3e46f97dfe4bacae89122",
"sha256:dd5de0646207f053eb0d6c74ae45ba98c3395a571a2891858e87df7c9b9bd51b",
"sha256:e1d4970ea66be07ae37a3c2e48b5ec63f7ba6804bdddfdbd3cfd954d25a82e63",
"sha256:e4fac90784481d221a8e4b1162afa7c47ed953be40d31ab4629ae917510051df",
"sha256:fa5ae20527d8e831e8230cbffd9f8fe952815b2b7dae6ffec25318803a7528fc",
"sha256:fd7f6999a8070df521b6384004ef42833b9bd62cfee11a09bda1079b4b704247",
"sha256:fdc842473cd33f45ff6bce46aea678a54e3d21f1b61a7750ce3c498eedfe25d6",
"sha256:fe69978f3f768926cfa37b867e3843918e012cf83f680806599ddce33c2c68b0"
],
"index": "pypi",
"version": "==5.4.1"
},
"wcmatch": {
"hashes": [
"sha256:4d54ddb506c90b5a5bba3a96a1cfb0bb07127909e19046a71d689ddfb18c3617",
"sha256:9146b1ab9354e0797ef6ef69bc89cb32cb9f46d1b9eeef69c559aeec8f3bffb6"
],
"index": "pypi",
"version": "==8.2"
}
},
"develop": {
@@ -69,6 +119,14 @@
],
"version": "==0.10.2"
},
"types-pyyaml": {
"hashes": [
"sha256:1d9e431e9f1f78a65ea957c558535a3b15ad67ea4912bce48a6c1b613dcf81ad",
"sha256:f1d1357168988e45fa20c65aecb3911462246a84809015dd889ebf8b1db74124"
],
"index": "pypi",
"version": "==5.4.10"
},
"typing-extensions": { "typing-extensions": {
"hashes": [ "hashes": [
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e", "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",


README.md (+2, -2)

@@ -6,7 +6,7 @@ Run on client:
     sudo git clone https://git.jim.sh/jim/borg-setup.git /opt/borg
     sudo /opt/borg/initial-setup.sh
 
-Customize `/opt/borg/backup.yaml` as desired.
+Customize `/opt/borg/config.yaml` as desired.

@@ -98,6 +98,6 @@ Design
     /etc/systemd/system/borg-backup.timer -> /opt/borg/borg-backup.timer
 
 - Backup script `/opt/borg/backup.py` uses configuration in
-  `/opt/borg/backup.yaml` to generate our own list of files, excluding
+  `/opt/borg/config.yaml` to generate our own list of files, excluding
   anything that's too large by default. This requires borg 1.2.0b1
   or newer, which is why the setup scripts download a specific version.
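
For illustration only, not part of this commit: a rough Python sketch of how the file list that backup.py (shown below) generates could be wired into borg. The repository URL is a placeholder, and `--paths-from-stdin` is assumed to be the borg >= 1.2.0b1 option behind the version requirement above; dry-run mode is used here because it emits newline-separated paths, matching borg's default stdin delimiter.

    import subprocess
    import sys

    from backup import Config, Backup   # assumes backup.py is importable as a module

    config = Config("/opt/borg/config.yaml")
    borg = subprocess.Popen(
        ["borg", "create", "--paths-from-stdin",
         "ssh://backup@example.net/./repo::{now}"],   # placeholder repository
        stdin=subprocess.PIPE)
    # dry_run=True makes Backup write one path per line to borg's stdin
    Backup(config, dry_run=True, out=borg.stdin).run()
    borg.stdin.close()
    sys.exit(borg.wait())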

backup.py (+162, -135)

@@ -1,62 +1,106 @@
 #!.venv/bin/python
 
+# Scan filesystem to generate a list of files to back up, based on a
+# configuration file. Pass this list to borg to actually create the
+# backup. Execute a notification script on the remote server to
+# report the backup status.
+
 import os
+import re
 import sys
 import stat
-from typing import Optional, Tuple
-import humanfriendly # type: ignore
-import wcmatch.glob # type: ignore
-import re
-import dataclasses
-import enum
-
-class MatchResult(enum.Enum):
-    INCLUDE_IF_SIZE_OK = 0
-    INCLUDE_ALWAYS = 1
-    EXCLUDE_ALWAYS = 2
-
-@dataclasses.dataclass
-class PatternRule:
-    re_inc: list[re.Pattern]
-    re_exc: list[re.Pattern]
-
-    def match(self, path: str) -> Tuple[bool, bool]:
-        if "big" in path:
-            print(self, file=sys.stderr)
-
-        for inc in self.re_inc:
-            if inc.match(path):
-                break
-        else:
-            return
-
-        for exc in self.re_exc:
-            if exc.match(path):
-                return False
-        return True
-
-class Lister:
-    def __init__(self, one_file_system: bool, max_size: bool):
-        self.one_file_system = one_file_system
-        self.max_size = max_size
-        if max_size is None:
-            max_size = float('inf')
-        self.stdout = os.fdopen(sys.stdout.fileno(), "wb", closefd=False)
+import pathlib
 
-        # Remember files we've skipped because they were too big, so that
-        # we can warn again at the end.
-        self.skipped_size: set[bytes] = set()
+import typing
 
-        # Remember errors
-        self.skipped_error: set[bytes] = set()
+import yaml
+import wcmatch.glob # type: ignore
+import humanfriendly # type: ignore
 
-    def __del__(self):
-        self.stdout.close()
+class Config:
+    root: str
+    max_file_size: typing.Optional[int]
+    one_file_system: bool
+    exclude: list[bytes]
+    force_include: list[bytes]
+    notify_email: typing.Optional[str]
+
+    def __init__(self, configfile: str):
+        # Read config
+        with open(configfile, 'r') as f:
+            config = yaml.safe_load(f)
+        self.root = config['root'].encode()
+        self.one_file_system = config.get('one-file-system', False)
+
+        if 'max-file-size' in config:
+            self.max_file_size = humanfriendly.parse_size(
+                config['max-file-size'])
+        else:
+            self.max_file_size = None
+
+        utf = config.get('exclude', '').encode()
+        self.exclude = list(filter(len, utf.split(b'\n')))
+
+        utf = config.get('force-include', '').encode()
+        self.force_include = list(filter(len, utf.split(b'\n')))
+
+        self.notify_email = config.get('notify-email', None)
+
+        # Compile patterns
+        flags = (wcmatch.glob.GLOBSTAR |
+                 wcmatch.glob.DOTGLOB |
+                 wcmatch.glob.NODOTDIR |
+                 wcmatch.glob.EXTGLOB |
+                 wcmatch.glob.BRACE)
+
+        # Path matches if it matches at least one regex in "a" and no
+        # regex in "b"
+        (a, b) = wcmatch.glob.translate(self.exclude, flags=flags)
+        self.exclude_re = ([ re.compile(x) for x in a ],
+                           [ re.compile(x) for x in b ])
+
+        (a, b) = wcmatch.glob.translate(self.force_include, flags=flags)
+        self.force_include_re = ([ re.compile(x) for x in a ],
+                                 [ re.compile(x) for x in b ])
+
+    def match_compiled(self, re: tuple[list[typing.Pattern],
+                                       list[typing.Pattern]],
+                       path: bytes):
+        # Path matches if it matches at least one regex in
+        # re[0] and no regex in re[1]
+        for a in re[0]:
+            if a.match(path):
+                for b in re[1]:
+                    if b.match(path):
+                        return False
+                return True
+        return False
+
+    def __str__(self):
+        d = { 'root': self.root }
+        if self.max_file_size:
+            d['max-file-size'] = self.max_file_size
+        if self.exclude:
+            utf = b'\n'.join(self.exclude)
+            d['exclude'] = utf.decode(errors='backslashreplace')
+        if self.force_include:
+            utf = b'\n'.join(self.force_include)
+            d['force-include'] = utf.decode(errors='backslashreplace')
+        if self.notify_email:
+            d['notify-email'] = self.notify_email
+        return yaml.dump(d, default_flow_style=False)
+
+class Backup:
+    def __init__(self, config: Config, dry_run: bool, out: typing.BinaryIO):
+        self.config = config
+        self.outfile = out
+        self.dry_run = dry_run
+
+        # All logged messages, with severity
+        self.logs: list[tuple[str, str]] = []
 
     def out(self, path: bytes):
-        # Use '\0\n' as a separator, so that we can both separate it
-        # cleanly in Borg, and also view it on stdout.
-        self.stdout.write(path + b'\0\n')
+        self.outfile.write(path + (b'\n' if self.dry_run else b'\0'))
 
     def log(self, letter: str, msg: str):
         colors = { 'E': 31, 'W': 33, 'I': 36 };
@@ -65,93 +109,78 @@ class Lister:
         else:
             c = 0
         sys.stderr.write(f"\033[1;{c}m{letter}:\033[22m {msg}\033[0m\n")
+        self.logs.append((letter, msg))
+
+    def run(self):
+        self.scan(self.config.root)
 
-    def scan(self, path: bytes,
-             parent_st: os.stat_result=None,
-             rules: list[PatternRule]=[]):
+    def scan(self, path: bytes, parent_st: os.stat_result=None):
         """If the given path should be backed up, print it. If it's
         a directory and its contents should be included, recurse."""
 
-        # Copy the path in string form, for logging and pathspec
-        # parsing. Otherwise, we use bytes directly.
+        # Copy the path in string form, for logging. Otherwise, we use
+        # bytes directly.
         pathstr = path.decode(errors='backslashreplace')
 
         try:
-            # See if we match any rules
-            for r in rules:
-                if r.match(pathstr):
-                    self.log('I', f"ignore {pathstr}")
-                    return
-
-            # Stat the path
-            st = os.lstat(path)
-            is_dir = stat.S_ISDIR(st.st_mode)
-
-            if is_dir:
-                # Skip if it crosses a mount point
-                if self.one_file_system:
-                    if parent_st is not None and st.st_dev != parent_st.st_dev:
-                        self.log('I', f"skipping {pathstr}: "
-                                 "on different filesystem")
-                        return
-
-                # Add contents of any .nobackup file to our
-                # parser rules
-                child_rules = rules
-
-                try:
-                    def prepend_base(regex):
-                        if regex[0] != '^':
-                            raise Exception(f'bad regex: {regex}')
-                        return '^' + os.path.join(pathstr, '') + regex[1:]
-                    with open(os.path.join(path, b".nobackup")) as f:
-                        rule = PatternRule([], [])
-                        for line in f:
-                            if line[0] == '#':
-                                continue
-                            (inc, exc) = wcmatch.glob.translate(
-                                [ line.rstrip('\r\n') ],
-                                flags=(wcmatch.glob.NEGATE |
-                                       wcmatch.glob.GLOBSTAR |
-                                       wcmatch.glob.DOTGLOB |
-                                       wcmatch.glob.EXTGLOB |
-                                       wcmatch.glob.BRACE))
-                            for x in inc:
-                                rule.re_inc.append(re.compile(prepend_base(x)))
-                            for x in exc:
-                                rule.re_exc.append(re.compile(prepend_base(x)))
-                        child_rules.append(rule)
-                except FileNotFoundError:
-                    pass
-
-            # Recurse and process each entry
+            # See if this path should be excluded or force-included
+
+            # Only stat the file when we need it
+            cached_st = None
+            def st():
+                nonlocal cached_st
+                if not cached_st:
+                    cached_st = os.lstat(path)
+                return cached_st
+
+            # See if there's a reason to exclude it
+            exclude_reason = None
+
+            if self.config.match_compiled(self.config.exclude_re, path):
+                # Config file says to exclude
+                exclude_reason = ('I', f"skipping, excluded by config file")
+
+            elif (stat.S_ISDIR(st().st_mode)
+                  and self.config.one_file_system
+                  and parent_st is not None
+                  and st().st_dev != parent_st.st_dev):
+                # Crosses a mount point
+                exclude_reason = ('I', "skipping, on different filesystem")
+
+            elif (stat.S_ISREG(st().st_mode)
+                  and self.config.max_file_size
+                  and st().st_size > self.config.max_file_size):
+                # Too big
+                def format_size(n):
+                    return humanfriendly.format_size(
+                        n, keep_width=True, binary=True)
+                a = format_size(st().st_size)
+                b = format_size(self.config.max_file_size)
+                exclude_reason = ('W', f"file size {a} exceeds limit {b}")
+
+            # If we have a reason to exclude it, stop now unless it's
+            # force-included
+            if (exclude_reason
+                and not self.config.match_compiled(
+                    self.config.force_include_re, path)):
+
+                self.log(exclude_reason[0], f"{exclude_reason[1]}: {pathstr}")
+                return
+
+            # Print name of this path
+            self.out(path)
+
+            # If it's a directory, recurse
+            if stat.S_ISDIR(st().st_mode):
                 with os.scandir(path) as it:
                     for entry in it:
-                        self.scan(entry.path, st, child_rules)
-
-            else:
-                # For regular files, ensure they're not too big
-                if stat.S_ISREG(st.st_mode) and st.st_size > self.max_size:
-                    def format_size(n):
-                        return humanfriendly.format_size(
-                            n, keep_width=True, binary=True)
-                    a = format_size(st.st_size)
-                    b = format_size(self.max_size)
-                    self.log('W', f"skipping {pathstr}: "
-                             + f"file size {a} exceeds limit {b}")
-                    self.skipped_size.add(path)
-                    return
-
-                # Every other filename gets printed; devices, symlinks, etc
-                # will get handled by Borg
-                self.out(path)
+                        self.scan(path=entry.path, parent_st=st())
 
         except PermissionError as e:
             self.log('E', f"can't read {pathstr}")
-            self.skipped_error.add(path)
             return
 
-def main(argv):
+def main(argv: list[str]):
     import argparse
 
     def humansize(string):
@@ -159,21 +188,19 @@ def main(argv):


     parser = argparse.ArgumentParser(
         prog=argv[0],
-        description="Build up a directory and file list for backups")
+        description="Back up the local system using borg",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
-    parser.add_argument('-s', '--max-size', type=humansize,
-                        help="Ignore files bigger than this, by default")
-    parser.add_argument('-x', '--one-file-system', action='store_true',
-                        help="Don't cross mount points when recursing")
-    parser.add_argument('dirs', metavar='DIR', nargs='+',
-                        help="Root directories to scan recursively")
+    default_config = str(pathlib.Path(__file__).parent / "config.yaml")
+    parser.add_argument('-c', '--config',
+                        help="Config file", default=default_config)
+    parser.add_argument('-n', '--dry-run', action="store_true",
+                        help="Just print filenames, don't run borg")
 
     args = parser.parse_args()
-
-    lister = Lister(one_file_system=args.one_file_system,
-                    max_size=args.max_size)
-    for p in args.dirs:
-        lister.scan(os.fsencode(p))
+    config = Config(args.config)
+    backup = Backup(config, args.dry_run, sys.stdout.buffer)
+    backup.run()
 
 if __name__ == "__main__":
     import sys
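
A side note on the pattern handling above: `wcmatch.glob.translate()` returns two lists of regex strings, and `Config.match_compiled` treats a path as matched when it hits at least one regex from the first list and none from the second. A small standalone sketch of that rule (not from this repository; the patterns are hypothetical, in the same style as the config file's exclude list):

    import re
    import wcmatch.glob

    flags = (wcmatch.glob.GLOBSTAR | wcmatch.glob.DOTGLOB |
             wcmatch.glob.NODOTDIR | wcmatch.glob.EXTGLOB | wcmatch.glob.BRACE)

    inc, exc = wcmatch.glob.translate(["**/Steam/steamapps", "/tmp/bigfile"],
                                      flags=flags)
    inc_re = [re.compile(x) for x in inc]
    exc_re = [re.compile(x) for x in exc]

    def matches(path: str) -> bool:
        # Same rule as Config.match_compiled: at least one inclusion regex
        # must match and no exclusion regex may match.
        return (any(r.match(path) for r in inc_re)
                and not any(r.match(path) for r in exc_re))

    print(matches("/tmp/bigfile"))    # True: listed explicitly
    print(matches("/tmp/otherfile"))  # False: no pattern covers it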


config.yaml (+23, -0)

@@ -0,0 +1,23 @@
root: "/tmp"
one-file-system: true

# Files larger than this are excluded. If a large file isn't
# explicitly mentioned in "excludes" below, it also generates a
# warning.
max-file-size: 500MiB

# Files/dirs to exclude from backup.
# Paths should be absolute, or start with **/
exclude: |
**/Steam/steamapps
**/Steam/ubuntu*
/tmp/bigfile
/tmp/out.ps

# Files that are always included, even if they would have been
# excluded due to file size or the "exclude" list.
# Paths should be absolute, or start with **/
force-include: |

# Email address for notification at end of backup
notify-email: jim@jim.sh
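
For reference, a short sketch (not part of the commit) of how backup.py's Config reads the values above: `max-file-size` goes through humanfriendly's size parsing, and the block-literal lists are split on newlines before being compiled with wcmatch.

    import yaml
    import humanfriendly

    with open("config.yaml") as f:
        cfg = yaml.safe_load(f)

    # "500MiB" is a binary size: 500 * 2**20 = 524288000 bytes
    max_size = humanfriendly.parse_size(cfg["max-file-size"])

    # The "exclude" block literal is one string; dropping empty lines
    # gives the pattern list that gets compiled into regexes
    exclude = [line for line in cfg["exclude"].split("\n") if line]
    print(max_size, exclude)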

initial-setup.sh (+1, -0)

@@ -192,6 +192,7 @@ EOF
 run_ssh_command "if cmp -s $backup $keys; then rm $backup ; fi"
 run_ssh_command "cat >> .ssh/authorized_keys" <<EOF
 command="$cmd --append-only",restrict $(cat "$SSH/id_ecdsa_appendonly.pub")
+command="borg/notify.sh",restrict $(cat "$SSH/id_ecdsa_appendonly.pub")
 command="$cmd",restrict $(cat "$SSH/id_ecdsa.pub")
 EOF



