Browse Source

Rework how exclude/include pattern matching works a bit

Jim Paris 1 month ago
2 changed files with 88 additions and 57 deletions
  1. +76
  2. +12

+ 76
- 49 View File

@@ -21,6 +21,7 @@ class Config:
root: bytes
max_file_size: typing.Optional[int]
one_file_system: bool
exclude_caches: bool
exclude: list[bytes]
force_include: list[bytes]
notify_email: typing.Optional[str]
@@ -31,6 +32,7 @@ class Config:
config = yaml.safe_load(f)
self.root = config['root'].encode()
self.one_file_system = config.get('one-file-system', False)
self.exclude_caches = config.get('exclude-caches', False)

if 'max-file-size' in config:
self.max_file_size = humanfriendly.parse_size(
@@ -38,11 +40,21 @@ class Config:
self.max_file_size = None

utf = config.get('exclude', '').encode()
self.exclude = list(filter(len, utf.split(b'\n')))

utf = config.get('force-include', '').encode()
self.force_include = list(filter(len, utf.split(b'\n')))
def process_match_list(config_name):
    """Return the match patterns stored under ``config_name`` as bytes.

    The configuration value is a newline-separated block of patterns
    (e.g. the ``exclude`` or ``force-include`` keys of the YAML file).
    Blank lines are dropped.  Patterns starting with ``/`` are kept
    unchanged; every other (relative) pattern gets ``**/`` prepended.

    Args:
        config_name: key to look up in the enclosing ``config`` mapping.

    Returns:
        A list of ``bytes`` patterns, in the order they appear in the
        configuration.  Empty when the key is missing or has no value.
    """
    # A key that is present but empty loads as None; treat it like ''.
    raw = (config.get(config_name) or '').encode().split(b'\n')
    pats = []
    for x in raw:
        # Blank lines (including the one after a trailing newline)
        # carry no pattern.
        if not x:
            continue
        # Use startswith rather than x[0]: indexing bytes yields an int,
        # which never compares equal to b'/'.
        if not x.startswith(b'/'):
            # Prepend '**/' to any relative patterns
            x = b'**/' + x
        pats.append(x)
    return pats

self.exclude = process_match_list('exclude')
self.force_include = process_match_list('force_include')

self.notify_email = config.get('notify-email', None)

@@ -63,11 +75,15 @@ class Config:
self.force_include_re = ([ re.compile(x) for x in a ],
[ re.compile(x) for x in b ])

def match_compiled(self, re: tuple[list[typing.Pattern],
path: bytes):
def match_re(self, re: tuple[list[typing.Pattern],
path: bytes, is_dir: bool):
# If it's a directory, try matching against a trailing slash
# first.
if is_dir and self.match_re(re, path + b'/', False):
return True
# Path matches if it matches at least one regex in
# re[0] and no regex in re[1]
# re[0] and no regex in re[1].
for a in re[0]:
if a.match(path):
for b in re[1]:
@@ -76,20 +92,6 @@ class Config:
return True
return False

def __str__(self):
    """Render the settings that are set as block-style YAML text.

    ``root`` is always emitted; ``max-file-size``, ``exclude``,
    ``force-include`` and ``notify-email`` appear only when truthy.
    Pattern lists are joined with newlines and decoded with
    ``backslashreplace`` so undecodable bytes stay visible.
    """
    def as_text(patterns):
        # One pattern per line, never failing on invalid UTF-8.
        return b'\n'.join(patterns).decode(errors='backslashreplace')

    summary = {'root': self.root}
    if self.max_file_size:
        summary['max-file-size'] = self.max_file_size
    pattern_lists = (
        ('exclude', self.exclude),
        ('force-include', self.force_include),
    )
    for key, patterns in pattern_lists:
        if patterns:
            summary[key] = as_text(patterns)
    if self.notify_email:
        summary['notify-email'] = self.notify_email
    return yaml.dump(summary, default_flow_style=False)

class Backup:
def __init__(self, config: Config, dry_run: bool):
self.config = config
@@ -112,69 +114,94 @@ class Backup:

def run(self, outfile: typing.BinaryIO):
self.outfile = outfile
# Base should not end with a slash, but full path should
if self.config.root.endswith(b'/'):
base = self.config.root[:-1]
path = self.config.root
base = self.config.root
path = self.config.root + b'/'
self.scan(base, path)

def scan(self, path: bytes, parent_st: os.stat_result=None):
def scan(self, base: bytes, path: bytes,
parent_st: os.stat_result=None):
"""If the given path should be backed up, print it. If it's
a directory and its contents should be included, recurse."""
a directory and its contents should be included, recurse.

if base.endswith(b'/'):
raise Exception("base must not end with /")
relpath = path[len(base):]
if not relpath.startswith(b'/'):
raise Exception(f"relative path (from {repr(base)}, {repr(path)})"
+ f" must start with /")

# Copy the path in string form, for logging. Otherwise, we use
# bytes directly.
pathstr = path.decode(errors='backslashreplace')

# See if this path should be excluded or force-included

# Only stat the file when we need it
cached_st = None
def st():
nonlocal cached_st
if not cached_st:
cached_st = os.lstat(path)
return cached_st
st = os.lstat(path)
is_dir = stat.S_ISDIR(st.st_mode)
is_reg = stat.S_ISREG(st.st_mode)

# See if there's a reason to exclude it
exclude_reason = None

if self.config.match_compiled(self.config.exclude_re, path):
if self.config.match_re(self.config.exclude_re, relpath, is_dir):
# Config file says to exclude
exclude_reason = ('I', f"skipping, excluded by config file")

elif (self.config.one_file_system
and parent_st is not None
and stat.S_ISDIR(st().st_mode)
and st().st_dev != parent_st.st_dev):
and is_dir
and st.st_dev != parent_st.st_dev):
# Crosses a mount point
exclude_reason = ('I', "skipping, on different filesystem")

elif (self.config.max_file_size
and stat.S_ISREG(st().st_mode)
and (st().st_blocks * 512) > self.config.max_file_size):
and is_reg
and (st.st_blocks * 512) > self.config.max_file_size):
# Too big
def format_size(n):
return humanfriendly.format_size(
n, keep_width=True, binary=True)
a = format_size(st().st_size)
a = format_size(st.st_size)
b = format_size(self.config.max_file_size)
exclude_reason = ('W', f"file size {a} exceeds limit {b}")

# If we have a reason to exclude it, stop now unless it's
# force-included
if (exclude_reason
and not self.config.match_compiled(
self.config.force_include_re, path)):

force = self.config.match_re(
self.config.force_include_re, relpath, is_dir)
if exclude_reason and not force:
self.log(exclude_reason[0], f"{exclude_reason[1]}: {pathstr}")

# Print name of this path
# Print path for Borg

# If it's a directory, recurse
if stat.S_ISDIR(st().st_mode):
# Process directories
if is_dir:

# Skip if it contains CACHEDIR.TAG
# (mirroring the --exclude-caches borg option)
if self.config.exclude_caches:
tag = b'Signature: 8a477f597d28d172789f06886806bc55'
with open(path + b'/CACHEDIR.TAG', 'rb') as f:
if == tag:
'I', f"skipping cache dir: {pathstr}")

# Recurse
with os.scandir(path) as it:
for entry in it:
self.scan(path=entry.path, parent_st=st())
self.scan(base=base, path=entry.path,

except PermissionError as e:
self.log('E', f"can't read {pathstr}")

+ 12
- 8
config.yaml View File

@@ -1,5 +1,6 @@
root: "/"
root: "/tmp/test"
one-file-system: true
exclude-caches: true

# Files larger than this are excluded. If a large file isn't
# explicitly mentioned in "exclude" below, it also generates a
@@ -8,17 +9,20 @@ one-file-system: true
max-file-size: 500MiB

# Files/dirs to exclude from backup.
# Paths should be absolute, or start with **/
# Absolute paths here are relative to the "root" directory set above.
# Relative paths are treated as if starting with **/
# Paths ending in / will only match directories.
exclude: |

# Files that are always included, even if they would have been
# excluded due to file size or the "exclude" list.
# Paths should be absolute, or start with **/
# Matching rules are the same as above.
force-include: |

# Email address for notification at end of backup