Issue
I'm having trouble with a regex (in Python 3.9); I've been digging for days and I'm still blocked.
Here's the expression:
^(?P<head>(?<=^).+?)??(?P<mid>(?:(?<=^)|[.])x+)?(?P<tail>[.][^/]+?)?(?<!^)$
This matches:
xx # works, goes in mid
a.xx # works, head "a", mid ".xx"
a.b.xx # FAILS, head "a", tail consumes the rest, why?
a.xx.c # works, head "a", mid ".xx", tail ".c"
a.b.xx.c # FAILS, head "a", tail consumes the rest, why?
- a.b.xx: I need "a.b" as head, ".xx" as mid, no tail
- a.b.xx.c: I need "a.b" as head, ".xx" as mid, ".c" as tail
I don't understand why in some cases the tail consumes the mid and in other cases it does not; any idea? How can I always have just ".c" in the tail? Many thanks!
Could you describe in words what your regex is supposed to match?
This is for file sequence patterns specific to visual effects, e.g.:
- description.detail.[UDIM token or tile number].[frame number].exr
- description.detail.[UDIM token or tile number].exr
- description.detail.[frame number].exr
- description.detail.u<udim>.@@@@.exr
- description.detail.u1001.@@@@.exr
- description.detail.u1001.1001.exr
- description.detail.u<udim>.exr
- description.detail.u1001.exr
- description.detail.@@@@.exr
- description.detail.1001.exr
- u<udim>.exr
- u1001.exr
- @@@@.exr
- 1001.exr
So we actually have those groups:
- an optional head, general description of the file sequence
- separators as dots but those can be included multiple times in the head
- a UDIM section, either as a token or a number, always with a "u" in front
- a frame number section either as padding token or a number
- a tail section, here the file extension but could be more
- we need to detect with either a UDIM, a frame, or both, they are always in this order
I simplified it to the minimum required to reproduce the problematic behaviour; the "mid" group could be either the UDIM or the frame.
More details, actual regex:
import os
import re

from aenum import LowerStrEnum, auto
class GroupName(LowerStrEnum):
    """Regex group name.

    Each member is interpolated into the pattern f-strings as the identifier
    of a named capture group, which is why ``__str__`` returns the raw value.
    With ``LowerStrEnum`` and ``auto()`` the values are presumably the
    lower-cased member names (e.g. ``udim_tile``) -- confirm against aenum.
    """

    FRAME_NUMBER = auto()
    FRAME_SEPARATOR = auto()
    FRAME_TOKEN = auto()
    UDIM_SEPARATOR = auto()
    UDIM_TILE = auto()
    UDIM_TOKEN = auto()

    def __str__(self) -> str:
        """Return the member's value so f-strings embed the bare group name."""
        return self.value
# Separator for pattern sections: a single underscore or dot.
SEPARATOR = r"[_.]"
# Start or separator: either the very start of the string (zero-width
# look-behind on '^') or one separator character.
# This is a bare alternation, so it must be interpolated inside a group.
START_OR_SEPARATOR = rf"(?<=^)|{SEPARATOR}"
# UDIM tile: exactly four characters, each a digit, '#' or '@', captured in
# the group named by GroupName.UDIM_TILE. The doubled braces are the f-string
# escape for the literal '{4}' quantifier.
UDIM_TILE = rf"(?P<{GroupName.UDIM_TILE}>[\d#@]{{4}})"
# UDIM token: the literal text '<udim>' with each letter in either case,
# captured in the group named by GroupName.UDIM_TOKEN.
UDIM_TOKEN = rf"(?P<{GroupName.UDIM_TOKEN}><[Uu][Dd][Ii][Mm]>)"
# Padding: '#', '###', '@', '@@', '%d', '%03d', '$F', '$F4'
PADDING = r"#+|@+|%(?:\d+)?d|\$F(?:\d+)?"
# Frame range: '1-2' '1,2' '1, 2' '1-2, 3' '1-2,3' '1,2-3'
# The optional trailing '(x\d)' is an unnamed capturing group matching 'x'
# followed by exactly one digit.
# NOTE(review): inside a character class '(?:, )' is not a group, so
# '[-,(?:, )]' matches any one of '-', ',', '(', '?', ':', ' ' or ')'.
# '[-, ]' looks like the intent -- confirm before changing, since the
# expanded pattern derived from this string would change too.
FRAME_RANGE = r"-?\d+(?:[-,(?:, )]+-?\d+)+(x\d)?"
# Frame token (range/padding) regex.
# Three alternatives, tried in order, all inside the GroupName.FRAME_TOKEN
# group:
#   1. a frame range or single (optionally negative) number immediately
#      followed by padding ('joined_embedded_frame_range' + 'joined_padding');
#   2. a frame range on its own ('embedded_frame_range');
#   3. padding on its own ('padding').
FRAME_TOKEN = (
rf"(?P<{GroupName.FRAME_TOKEN}>"
# Alternative 1: range-or-number joined with padding.
r"(?:"
r"(?P<joined_embedded_frame_range>"
rf"(?:{FRAME_RANGE})"
r"|(?:-?\d+)"
r")"
rf"(?P<joined_padding>{PADDING})"
r")"
# Alternative 2: bare frame range.
rf"|(?P<embedded_frame_range>{FRAME_RANGE})"
# Alternative 3: bare padding.
rf"|(?P<padding>{PADDING})"
r")"
)
# Base for all types of paths.
#
# Layout, in order: optional directory + head, optional UDIM section,
# optional frame section, optional tail + extension, optional bracketed
# frame range after a space. Every section is optional, so the final
# '(?<!^)$' is what rejects the empty string.
#
# The platform path separator is passed through re.escape() before being
# interpolated: on Windows it is a backslash, which would otherwise escape
# the following ')' or ']' and make the pattern fail to compile. On POSIX
# '/' is left unchanged by re.escape(), so the resulting pattern is the same
# as before.
BASE = (
    r"^"
    # Head group (lazy and optional, so later sections get first refusal).
    r"(?:"
    rf"(?P<directory>(?<=^).*{re.escape(os.path.sep)})?"
    rf"(?P<head>[^{re.escape(os.path.sep)}]+?)"
    r"(?!$)"  # Stop the head from consuming everything next.
    r")??"
    # UDIM group: start-or-separator, then 'u'/'U', then a tile or token.
    r"(?:"
    rf"(?P<{GroupName.UDIM_SEPARATOR}>{START_OR_SEPARATOR})"
    rf"[uU](?:{UDIM_TILE}|{UDIM_TOKEN})"
    r")?"
    # Frame sequence group: start-or-separator, then a frame token or a
    # plain (optionally negative) frame number.
    r"(?:"
    rf"(?P<{GroupName.FRAME_SEPARATOR}>{START_OR_SEPARATOR})"
    r"(?:"
    rf"{FRAME_TOKEN}"
    rf"|(?P<{GroupName.FRAME_NUMBER}>-?\d+)"
    r")"
    r")?"
    # Tail & ext group.
    # NOTE(review): the tail hard-codes '/' rather than the platform
    # separator used by the head -- confirm whether that is intentional.
    r"(?:"
    rf"(?P<tail>{SEPARATOR}[^/]+)?"
    rf"(?P<ext>{SEPARATOR}[a-zA-Z]+)"
    r")?"
    # Optional frame range after a space, optionally wrapped in brackets.
    rf"(?: \[*(?P<extended_frame_range>{FRAME_RANGE})\]*?)?"
    # The end of the string must not also be its start (non-empty input).
    r"(?<!^)$"
)
^(?:(?P<directory>(?<=^).*/)?(?P<head>[^/]+?)(?!$))??(?:(?P<udim_separator>(?<=^)|[_.])[uU](?:(?P<udim_tile>[\d#@]{4})|(?P<udim_token><[Uu][Dd][Ii][Mm]>)))?(?:(?P<frame_separator>(?<=^)|[_.])(?:(?P<frame_token>(?:(?P<joined_embedded_frame_range>(?:-?\d+(?:[-,(?:, )]+-?\d+)+(x\d)?)|(?:-?\d+))(?P<joined_padding>#+|@+|%(?:\d+)?d|\$F(?:\d+)?))|(?P<embedded_frame_range>-?\d+(?:[-,(?:, )]+-?\d+)+(x\d)?)|(?P<padding>#+|@+|%(?:\d+)?d|\$F(?:\d+)?))|(?P<frame_number>-?\d+)))?(?:(?P<tail>[_.][^/]+)?(?P<ext>[_.][a-zA-Z]+))?(?: \[*(?P<extended_frame_range>-?\d+(?:[-,(?:, )]+-?\d+)+(x\d)?)\]*?)?(?<!^)$
Printing the groupdict for the following strings:
tests/test_paths/udim_seq/head.1001
{'directory': 'tests/test_paths/udim_seq/',
'embedded_frame_range': None,
'ext': None,
'extended_frame_range': None,
'frame_number': '1001',
'frame_separator': '.',
'frame_token': None,
'head': 'head',
'joined_embedded_frame_range': None,
'joined_padding': None,
'padding': None,
'tail': None,
'udim_separator': None,
'udim_tile': None,
'udim_token': None}
tests/test_paths/udim_seq/book.open.mid.u1001.1-2#
{'directory': 'tests/test_paths/udim_seq/',
'embedded_frame_range': None,
'ext': None,
'extended_frame_range': None,
'frame_number': None,
'frame_separator': '.',
'frame_token': '1-2#',
'head': 'book.open.mid',
'joined_embedded_frame_range': '1-2',
'joined_padding': '#',
'padding': None,
'tail': None,
'udim_separator': '.',
'udim_tile': '1001',
'udim_token': None}
tests/test_paths/udim_seq/head.1001.exr
{'directory': 'tests/test_paths/udim_seq/',
'embedded_frame_range': None,
'ext': '.exr',
'extended_frame_range': None,
'frame_number': '1001',
'frame_separator': '.',
'frame_token': None,
'head': 'head',
'joined_embedded_frame_range': None,
'joined_padding': None,
'padding': None,
'tail': None,
'udim_separator': None,
'udim_tile': None,
'udim_token': None}
tests/test_paths/udim_seq/book.open.mid.1001.exr
{'directory': 'tests/test_paths/udim_seq/',
'embedded_frame_range': None,
'ext': '.exr',
'extended_frame_range': None,
'frame_number': None,
'frame_separator': None,
'frame_token': None,
'head': 'book',
'joined_embedded_frame_range': None,
'joined_padding': None,
'padding': None,
'tail': '.open.mid.1001',
'udim_separator': None,
'udim_tile': None,
'udim_token': None}
tests/test_paths/udim_seq/book.open.mid.u1001.1-2#.exr
{'directory': 'tests/test_paths/udim_seq/',
'embedded_frame_range': None,
'ext': '.exr',
'extended_frame_range': None,
'frame_number': None,
'frame_separator': None,
'frame_token': None,
'head': 'book',
'joined_embedded_frame_range': None,
'joined_padding': None,
'padding': None,
'tail': '.open.mid.u1001.1-2#',
'udim_separator': None,
'udim_tile': None,
'udim_token': None}
As soon as the head includes a separator, the tail consumes everything...
Solution
The issue with your regex is in this group:
(?P<head>[^/]+?)
Because it does a lazy match, when you process the string `tests/test_paths/udim_seq/book.open.mid.1001.exr` and the engine gets to the `.` after `book`, the optional UDIM and frame groups fail to match at that position, so it then proceeds to match the rest of the string with the `(?P<tail>[_.][^/]+)` capture group (followed by `ext`).
You can prevent this from happening by enforcing that there is either a UDIM pattern or a frame pattern (or both) in the input. This can be done with a regex of the form:
(?:A?B|A)
where A represents the UDIM pattern and B the frame pattern. Since this requires repeating the UDIM pattern, you'll need a PCRE-compatible regex engine so that you can refer back to an expression (using the `(?P>name)` notation) and avoid the problem of repeated group names (alternatively, change the group names in the repetition and merge the values in post-processing). In Python you can use the `regex` library, which supports this feature.
Writing out your regex with spacing for readability:
^
(?:(?P<directory>(?<=^).*/)?(?P<head>[^/]+?)(?!$))??
(?:
(?P<udim>
(?:(?P<udim_separator>(?<=^)|[_.])[uU](?:(?P<udim_tile>[\d#@]{4})|(?P<udim_token><[Uu][Dd][Ii][Mm]>)))
)?
(?:(?P<frame_separator>(?<=^)|[_.])(?:(?P<frame_token>(?:(?P<joined_embedded_frame_range>(?:-?\d+(?:[-,(?:, )]+-?\d+)+(x\d)?)|(?:-?\d+))(?P<joined_padding>\#+|@+|%(?:\d+)?d|\$F(?:\d+)?))|(?P<embedded_frame_range>-?\d+(?:[-,(?:, )]+-?\d+)+(x\d)?)|(?P<padding>\#+|@+|%(?:\d+)?d|\$F(?:\d+)?))|(?P<frame_number>-?\d+)))
|
(?P>udim)
)
(?:(?P<tail>[_.][^/]+)?(?P<ext>[_.][a-zA-Z]+))?
(?: \[*(?P<extended_frame_range>-?\d+(?:[-,(?:, )]+-?\d+)+(x\d)?)\]*?)?
(?<!^)
$
Regex demo on regex101
In python
import json

import regex

# Verbose-mode pattern. It is a raw string so that '\d', '\#', '\$' and '\['
# reach the regex engine untouched instead of being treated as (invalid)
# Python string escapes.
#
# In verbose mode unescaped whitespace outside a character class is ignored
# and '#' starts a comment, hence '\#' in the padding alternatives and '[ ]'
# for the literal space that must precede the extended frame range.
#
# '(?P>udim)' re-uses the 'udim' sub-pattern so the UDIM expression does not
# have to be written (and its group names duplicated) a second time; this
# needs the third-party 'regex' module rather than the stdlib 're'.
pat = r'''^
(?:(?P<directory>(?<=^).*/)?(?P<head>[^/]+?)(?!$))??
(?:
    (?P<udim>
        (?:(?P<udim_separator>(?<=^)|[_.])[uU](?:(?P<udim_tile>[\d#@]{4})|(?P<udim_token><[Uu][Dd][Ii][Mm]>)))
    )?
    (?:(?P<frame_separator>(?<=^)|[_.])(?:(?P<frame_token>(?:(?P<joined_embedded_frame_range>(?:-?\d+(?:[-,(?:, )]+-?\d+)+(x\d)?)|(?:-?\d+))(?P<joined_padding>\#+|@+|%(?:\d+)?d|\$F(?:\d+)?))|(?P<embedded_frame_range>-?\d+(?:[-,(?:, )]+-?\d+)+(x\d)?)|(?P<padding>\#+|@+|%(?:\d+)?d|\$F(?:\d+)?))|(?P<frame_number>-?\d+)))
    |
    (?P>udim)
)
(?:(?P<tail>[_.][^/]+)?(?P<ext>[_.][a-zA-Z]+))?
(?:[ ]\[*(?P<extended_frame_range>-?\d+(?:[-,(?:, )]+-?\d+)+(x\d)?)\]*?)?
(?<!^)
$'''
# Use the flag from the module that is actually imported ('re' is not).
rgx = regex.compile(pat, regex.VERBOSE)

files = [
    'tests/test_paths/udim_seq/head.1001',
    'tests/test_paths/udim_seq/book.open.mid.u1001.1-2#',
    'tests/test_paths/udim_seq/head.1001.exr',
    'tests/test_paths/udim_seq/book.open.mid.1001.exr',
    'tests/test_paths/udim_seq/book.open.mid.u1001.1-2#.exr',
]

for path in files:
    match = rgx.match(path)
    # A non-matching path prints 'null' instead of raising AttributeError.
    groups = match.groupdict() if match is not None else None
    print(path, json.dumps(groups, indent=4), sep='\n')
Output:
tests/test_paths/udim_seq/head.1001
{
"directory": "tests/test_paths/udim_seq/",
"head": "head",
"udim": null,
"udim_separator": null,
"udim_tile": null,
"udim_token": null,
"frame_separator": ".",
"frame_token": null,
"joined_embedded_frame_range": null,
"joined_padding": null,
"embedded_frame_range": null,
"padding": null,
"frame_number": "1001",
"tail": null,
"ext": null,
"extended_frame_range": null
}
tests/test_paths/udim_seq/book.open.mid.u1001.1-2#
{
"directory": "tests/test_paths/udim_seq/",
"head": "book.open.mid",
"udim": ".u1001",
"udim_separator": ".",
"udim_tile": "1001",
"udim_token": null,
"frame_separator": ".",
"frame_token": "1-2#",
"joined_embedded_frame_range": "1-2",
"joined_padding": "#",
"embedded_frame_range": null,
"padding": null,
"frame_number": null,
"tail": null,
"ext": null,
"extended_frame_range": null
}
tests/test_paths/udim_seq/head.1001.exr
{
"directory": "tests/test_paths/udim_seq/",
"head": "head",
"udim": null,
"udim_separator": null,
"udim_tile": null,
"udim_token": null,
"frame_separator": ".",
"frame_token": null,
"joined_embedded_frame_range": null,
"joined_padding": null,
"embedded_frame_range": null,
"padding": null,
"frame_number": "1001",
"tail": null,
"ext": ".exr",
"extended_frame_range": null
}
tests/test_paths/udim_seq/book.open.mid.1001.exr
{
"directory": "tests/test_paths/udim_seq/",
"head": "book.open.mid",
"udim": null,
"udim_separator": null,
"udim_tile": null,
"udim_token": null,
"frame_separator": ".",
"frame_token": null,
"joined_embedded_frame_range": null,
"joined_padding": null,
"embedded_frame_range": null,
"padding": null,
"frame_number": "1001",
"tail": null,
"ext": ".exr",
"extended_frame_range": null
}
tests/test_paths/udim_seq/book.open.mid.u1001.1-2#.exr
{
"directory": "tests/test_paths/udim_seq/",
"head": "book.open.mid",
"udim": ".u1001",
"udim_separator": ".",
"udim_tile": "1001",
"udim_token": null,
"frame_separator": ".",
"frame_token": "1-2#",
"joined_embedded_frame_range": "1-2",
"joined_padding": "#",
"embedded_frame_range": null,
"padding": null,
"frame_number": null,
"tail": null,
"ext": ".exr",
"extended_frame_range": null
}
Answered By - Nick
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.