# codon/test/stdlib/re_test.codon
#
# 448 lines
# 17 KiB
# Python

import re
import string
@test
def test_search_star_plus():
    """Spans reported by `search` and `match` for the `*` and `+` quantifiers."""
    # `search`: the starred pattern matches empty at offset 0, the plus
    # pattern has to find the run of 'x' characters.
    for pattern, expected in [('x*', (0, 0)), ('x+', (1, 3))]:
        found = re.search(pattern, 'axx')
        assert found.span(0) == expected
        assert found.span() == expected
    assert re.search('x', 'aaa') is None

    # `match`: only the start of the subject is considered.
    for pattern, subject, expected in [('a*', 'xxx', (0, 0)), ('x*', 'xxxa', (0, 3))]:
        found = re.match(pattern, subject)
        assert found.span(0) == expected
        assert found.span() == expected
    assert re.match('a+', 'xxx') is None
test_search_star_plus()
@test
def test_branching():
    """Alternation with the OR ('|') operator matches whichever branch applies."""
    cases = [
        ('(ab|ba)', 'ab', (0, 2)),
        ('(ab|ba)', 'ba', (0, 2)),
        ('(abc|bac|ca|cb)', 'abc', (0, 3)),
        ('(abc|bac|ca|cb)', 'bac', (0, 3)),
        ('(abc|bac|ca|cb)', 'ca', (0, 2)),
        ('(abc|bac|ca|cb)', 'cb', (0, 2)),
        ('((a)|(b)|(c))', 'a', (0, 1)),
        ('((a)|(b)|(c))', 'b', (0, 1)),
        ('((a)|(b)|(c))', 'c', (0, 1)),
    ]
    for pattern, subject, expected in cases:
        assert re.match(pattern, subject).span() == expected
test_branching()
@test
def test_basic_re_sub():
    """Exercise `re.sub` with literal, callable and template replacements."""
    def increment(found):
        # Replace a run of digits with that number plus one.
        return str(int(found.group(0)) + 1)

    numbers = '08.2 -2 23x99y'
    assert re.sub('y', 'a', 'xyz') == 'xaz'
    assert re.sub("(?i)b+", "x", "bbbb BBBB") == 'x x'
    assert re.sub(r'\d+', increment, numbers) == '9.3 -3 24x100y'
    assert re.sub(r'\d+', increment, numbers, 3) == '9.3 -3 23x99y'
    assert re.sub(r'\d+', increment, numbers, count=3) == '9.3 -3 23x99y'

    # A callable's result is inserted verbatim; a string template is unescaped.
    assert re.sub('.', lambda m: r"\n", 'x') == '\\n'
    assert re.sub('.', r"\n", 'x') == '\n'

    template = r"\1\1"
    assert re.sub('(.)', template, 'x') == 'xx'
    assert re.sub('(.)', template.replace('\\', r'\\'), 'x') == template
    assert re.sub('(.)', lambda m: template, 'x') == template

    # Named and numbered group references in the template.
    group_refs = [
        ('(?P<a>x)', r'\g<a>\g<a>'),
        ('(?P<a>x)', r'\g<a>\g<1>'),
        ('(?P<unk>x)', r'\g<unk>\g<unk>'),
        ('(?P<unk>x)', r'\g<1>\g<1>'),
        ('()x', r'\g<0>\g<0>'),
    ]
    for pattern, repl in group_refs:
        assert re.sub(pattern, repl, 'xx') == 'xxxx'

    control = '\t\n\v\r\f\a\b'
    assert re.sub('a', r'\t\n\v\r\f\a\b', 'a') == control
    assert re.sub('a', control, 'a') == control
    assert re.sub('a', control, 'a') == (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8))

    # Each of these letters after a backslash must be rejected with re.error.
    for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
        rejected = False
        try:
            re.sub('a', '\\' + c, 'a')
        except re.error:
            rejected = True
        assert rejected

    assert re.sub(r'^\s*', 'X', 'test') == 'Xtest'
test_basic_re_sub()
@test
def test_bug_449964():
    """A backslash-b following group references in a template yields a backspace."""
    replaced = re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx')
    assert replaced == 'xx\bxx\b'
test_bug_449964()
@test
def test_bug_449000():
    """`sub()` treats escaped and literal CR/LF forms of pattern and template alike."""
    # Test for sub() on escaped characters
    subject = 'abc\r\ndef\r\n'
    combos = [
        (r'\r\n', r'\n'),
        ('\r\n', r'\n'),
        (r'\r\n', '\n'),
        ('\r\n', '\n'),
    ]
    for pattern, repl in combos:
        assert re.sub(pattern, repl, subject) == 'abc\ndef\n'
test_bug_449000()
@test
def test_sub_template_numeric_escape():
    """Check how numeric escapes in a replacement template are interpreted.

    Covers octal escapes (a leading zero, or three octal digits) and numbered
    group references with more than nine groups.
    """
    # bug 776311 and friends
    assert re.sub('x', r'\0', 'x') == '\0'
    assert re.sub('x', r'\000', 'x') == '\000'
    assert re.sub('x', r'\001', 'x') == '\001'
    assert re.sub('x', r'\008', 'x') == '\0' + '8'
    assert re.sub('x', r'\009', 'x') == '\0' + '9'
    assert re.sub('x', r'\111', 'x') == '\111'
    assert re.sub('x', r'\117', 'x') == '\117'
    assert re.sub('x', r'\377', 'x') == '\377'

    # These two used `assert value, expected`, which only tests truthiness and
    # treats the expected string as the failure message; compare explicitly.
    assert re.sub('x', r'\1111', 'x') == '\1111'
    assert re.sub('x', r'\1111', 'x') == '\111' + '1'

    assert re.sub('x', r'\00', 'x') == '\x00'
    assert re.sub('x', r'\07', 'x') == '\x07'
    assert re.sub('x', r'\08', 'x') == '\0' + '8'
    assert re.sub('x', r'\09', 'x') == '\0' + '9'
    assert re.sub('x', r'\0a', 'x') == '\0' + 'a'

    # TODO: the template-error checks below are not ported yet. They are kept
    # as comments (not a string literal) so their backslashes are never parsed
    # as string escapes.
    # self.checkTemplateError('x', r'\400', 'x',
    #                         r'octal escape value \400 outside of '
    #                         r'range 0-0o377', 0)
    # self.checkTemplateError('x', r'\777', 'x',
    #                         r'octal escape value \777 outside of '
    #                         r'range 0-0o377', 0)
    # self.checkTemplateError('x', r'\1', 'x', 'invalid group reference 1', 1)
    # self.checkTemplateError('x', r'\8', 'x', 'invalid group reference 8', 1)
    # self.checkTemplateError('x', r'\9', 'x', 'invalid group reference 9', 1)
    # self.checkTemplateError('x', r'\11', 'x', 'invalid group reference 11', 1)
    # self.checkTemplateError('x', r'\18', 'x', 'invalid group reference 18', 1)
    # self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference 1', 1)
    # self.checkTemplateError('x', r'\90', 'x', 'invalid group reference 90', 1)
    # self.checkTemplateError('x', r'\99', 'x', 'invalid group reference 99', 1)
    # self.checkTemplateError('x', r'\118', 'x', 'invalid group reference 11', 1)
    # self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference 11', 1)
    # self.checkTemplateError('x', r'\181', 'x', 'invalid group reference 18', 1)
    # self.checkTemplateError('x', r'\800', 'x', 'invalid group reference 80', 1)
    # self.checkTemplateError('x', r'\8', '', 'invalid group reference 8', 1)

    # With eleven groups available, a two-digit reference selects group 11.
    assert re.sub('(((((((((((x)))))))))))', r'\11', 'x') == 'x'
    assert re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz') == 'xz8'
    assert re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz') == 'xza'
test_sub_template_numeric_escape()
@test
def test_qualified_re_sub():
    """`re.sub` honours `count` given positionally or by keyword."""
    subject = 'aaaaa'
    assert re.sub('a', 'b', subject) == 'bbbbb'
    # Limiting to one replacement only touches the first occurrence.
    assert re.sub('a', 'b', subject, 1) == 'baaaa'
    assert re.sub('a', 'b', subject, count=1) == 'baaaa'
test_qualified_re_sub()
@test
def test_bug_114660():
    """A run of whitespace between two words is collapsed to a single space."""
    # The subject deliberately contains TWO spaces: with only one, the
    # substitution would be a no-op and the test could never fail.
    assert re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello  there') == 'hello there'
test_bug_114660()
@test
def test_symbolic_refs():
    """References to groups that did not participate expand to the empty string."""
    either = '(?P<a>x)|(?P<b>y)'
    for repl in [r'\g<b>', r'\2']:
        assert re.sub(either, repl, 'xx') == ''

    # Support > 100 groups.
    alternatives = [f'x(?P<a{i}>{hex(i)[2:]})y' for i in range(1, 200 + 1)]
    pat = '|'.join(alternatives)
    assert re.sub(pat, r'\g<200>', 'xc8yzxc8y') == 'c8zc8'
test_symbolic_refs()
@test
def test_re_subn():
    """`re.subn` returns the new string together with the substitution count."""
    unlimited = [
        ("(?i)b+", "bbbb BBBB", ('x x', 2)),
        ("b+", "bbbb BBBB", ('x BBBB', 1)),
        ("b+", "xyz", ('xyz', 0)),
        ("b*", "xyz", ('xxxyxzx', 4)),
    ]
    for pattern, subject, expected in unlimited:
        assert re.subn(pattern, "x", subject) == expected

    # `count` may be passed positionally or by keyword.
    assert re.subn("b*", "x", "xyz", 2) == ('xxxyz', 2)
    assert re.subn("b*", "x", "xyz", count=2) == ('xxxyz', 2)
test_re_subn()
# TODO: The current version does not allow `None == None`,
# so use this helper as a workaround.
def cmp_opt(x, y):
    """Compare two possibly-absent values for equality.

    Returns True when both arguments are None, False when exactly one of
    them is None, and otherwise the result of `unwrap(x) == unwrap(y)`.
    """
    if x is None:
        return y is None
    elif y is None:
        # x is not None on this branch, so this evaluates to False.
        return x is None
    else:
        return unwrap(x) == unwrap(y)
def cmp_list(a, b):
    """Return True when `a` and `b` have equal length and pairwise-equal items.

    Items are compared with `cmp_opt`, so None entries are allowed on either
    side.
    """
    if len(a) != len(b):
        return False
    # Lengths are equal here, so zip visits every index exactly once.
    for left, right in zip(a, b):
        if not cmp_opt(left, right):
            return False
    return True
def cmp_tuple(a, b):
    """Compare two tuples element by element using `cmp_opt`.

    Tuples whose `staticlen` differs compare unequal and two empty tuples
    compare equal. Otherwise the first elements are compared and the
    function recurses on the remaining slices `a[1:]` and `b[1:]`.
    """
    if staticlen(a) != staticlen(b):
        return False
    elif staticlen(a) == 0:
        return True
    else:
        if not cmp_opt(a[0], b[0]):
            return False
        # Recurse on the tails; each step shortens both tuples by one element.
        return cmp_tuple(a[1:], b[1:])
@test
def test_re_split():
    """`re.split` with plain, capturing, non-capturing and empty-matching separators."""
    subject = ":a:b::c"
    assert cmp_list(re.split(":", subject), ['', 'a', 'b', '', 'c'])
    assert cmp_list(re.split(":+", subject), ['', 'a', 'b', 'c'])
    assert cmp_list(re.split("(:+)", subject), ['', ':', 'a', ':', 'b', '::', 'c'])
    assert cmp_list(re.split("(?::+)", subject), ['', 'a', 'b', 'c'])
    assert cmp_list(re.split("(:)+", subject), ['', ':', 'a', ':', 'b', ':', 'c'])
    assert cmp_list(re.split("([b:]+)", subject), ['', ':', 'a', ':b::', 'c'])
    # The leading None followed by [1:] gives the expected list optional items.
    assert cmp_list(re.split("(b)|(:+)", subject), [None, '', None, ':', 'a', None, ':', '', 'b', None, '', None, '::', 'c'][1:])
    assert cmp_list(re.split("(?:b)|(?::+)", subject), ['', 'a', '', '', 'c'])

    # Separators that can match the empty string.
    optional_cases = [
        (':*', [None, '', '', 'a', '', 'b', '', 'c', ''][1:]),
        ('(?::*)', [None, '', '', 'a', '', 'b', '', 'c', ''][1:]),
        ('(:*)', [None, '', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', ''][1:]),
        ('(:)*', [None, '', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, ''][1:]),
    ]
    for separator, pieces in optional_cases:
        assert cmp_list(re.split(separator, ':a:b::c'), pieces)

    plain_cases = [
        ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']),
        # (r'\b', [':', 'a', ':', 'b', '::', 'c', '']),  # TODO: this fails; re2 difference maybe?
    ]
    for separator, pieces in plain_cases:
        assert cmp_list(re.split(separator, ':a:b::c'), pieces)
test_re_split()
@test
def test_qualified_re_split():
    """`re.split` stops after `maxsplit` splits and returns the rest unsplit."""
    subject = ":a:b::c"
    assert cmp_list(re.split(":", subject, 2), ['', 'a', 'b::c'])
    assert cmp_list(re.split(":", subject, maxsplit=2), ['', 'a', 'b::c'])
    assert cmp_list(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])

    # Capturing separators are included in the output.
    assert cmp_list(re.split("(:)", subject, maxsplit=2), ['', ':', 'a', ':', 'b::c'])
    assert cmp_list(re.split("(:+)", subject, maxsplit=2), ['', ':', 'a', ':', 'b::c'])
    assert cmp_list(re.split("(:*)", subject, maxsplit=2), ['', ':', '', '', 'a:b::c'])
test_qualified_re_split()
@test
def test_re_findall():
    """`re.findall` returns every non-overlapping match as a list of strings."""
    assert re.findall(":+", "abc") == []

    # (!) Note: the last pattern behaves differently in Codon, as we always
    # return the full match even if there are capturing groups.
    for pattern in [":+", "(:+)", "(:)(:*)"]:
        assert re.findall(pattern, "a:b::c:::d") == [":", "::", ":::"]

    x = "\xe0"
    xx = x * 2
    xxx = x * 3
    text = f"a{x}b{xx}c{xxx}d"
    # (!) Note: same as above for the pattern with two groups.
    for pattern in [f"{x}+", f"({x}+)", f"({x})({x}*)"]:
        assert re.findall(pattern, text) == [x, xx, xxx]
test_re_findall()
@test
def test_re_match():
    """Group access on match objects: `groups()`, `group(i)` and `group(i, j, ...)`."""
    no_groups = List[Optional[str]]()

    # ASCII subject.
    assert cmp_list(re.match('a', 'a').groups(), no_groups)
    single = re.match('(a)', 'a')
    assert cmp_list(single.groups(), [None, 'a',][1:])
    assert single.group(0).__val__() == 'a'
    assert single.group(1).__val__() == 'a'
    assert single.group(1, 1) == (Optional('a'), Optional('a'))

    # Non-ASCII byte subject.
    assert cmp_list(re.match('\xe0', '\xe0').groups(), no_groups)
    high = re.match('(\xe0)', '\xe0')
    assert cmp_list(high.groups(), ['\xe0'])
    assert unwrap(high.group(0)) == '\xe0'
    assert unwrap(high.group(1)) == '\xe0'
    assert high.group(1, 1) == (Optional('\xe0'), Optional('\xe0'))

    # Groups that did not participate are reported as None (or the default).
    pat = re.compile('((a)|(b))(c)?')
    assert cmp_list(pat.match('a').groups(), [None, 'a', 'a', None, None][1:])
    assert cmp_list(pat.match('b').groups(), [None, 'b', None, 'b', None][1:])
    assert cmp_list(pat.match('ac').groups(), [None, 'a', 'a', None, 'c'][1:])
    assert cmp_list(pat.match('bc').groups(), [None, 'b', None, 'b', 'c'][1:])
    assert cmp_list(pat.match('bc').groups(""), [None, 'b', "", 'b', 'c'][1:])

    # Mixed numeric and named group lookups.
    named = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    assert cmp_tuple(named.match('a').group(1, 2, 3), (Optional('a'), Optional[str](), Optional[str]()))
    assert cmp_tuple(named.match('b').group('a1', 'b2', 'c3'), (Optional[str](), Optional('b'), Optional[str]()))
    assert cmp_tuple(named.match('ac').group(1, 'b2', 3), (Optional('a'), Optional[str](), Optional('c')))
test_re_match()
def raises(exception: type, function, *args, **kwargs):
    """Report whether `function(*args, **kwargs)` raises `exception`.

    Returns True only when an exception matching `exception` is raised. A
    normal return, or any other exception (which is swallowed), gives False.
    """
    outcome = False
    try:
        function(*args, **kwargs)
    except exception:
        outcome = True
    except:
        # Deliberately ignored: an unrelated exception counts as "did not
        # raise the expected one".
        outcome = False
    return outcome
@test
def test_group():
    """`Match.group` accepts ints and `__index__` objects and rejects bad indices."""
    class Index:
        # Minimal integer-like wrapper exposing its value through __index__.
        value: int

        def __init__(self, value):
            self.value = value

        def __index__(self):
            return self.value

    pair = re.match('(a)(b)', 'ab')

    # A single group
    assert pair.group() == 'ab'
    assert unwrap(pair.group(0)) == 'ab'
    assert unwrap(pair.group(1)) == 'a'
    assert unwrap(pair.group(Index(1))) == 'a'

    # Out-of-range or unknown group identifiers
    assert raises(IndexError, pair.group, -1)
    assert raises(IndexError, pair.group, 3)
    assert raises(IndexError, pair.group, 1<<1000)
    assert raises(IndexError, pair.group, Index(1<<1000))
    assert raises(IndexError, pair.group, 'x')

    # Multiple groups
    assert cmp_tuple(pair.group(2, 1), ('b', 'a'))
    assert cmp_tuple(pair.group(Index(2), Index(1)), ('b', 'a'))
test_group()
@test
def test_match_getitem():
    """Subscripting a match object by group name or number."""
    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')

    # Only the first alternative participates.
    only_a = pat.match('a')
    assert unwrap(only_a['a1']) == 'a'
    assert only_a['b2'] is None
    assert only_a['c3'] is None
    assert unwrap(only_a[0]) == 'a'
    assert unwrap(only_a[1]) == 'a'
    assert only_a[2] is None
    assert only_a[3] is None
    # Unknown names and out-of-range numbers raise IndexError.
    assert raises(IndexError, lambda i: only_a[i], 'X')
    assert raises(IndexError, lambda i: only_a[i], -1)
    assert raises(IndexError, lambda i: only_a[i], 4)

    # The optional trailing group participates as well.
    a_then_c = pat.match('ac')
    assert unwrap(a_then_c['a1']) == 'a'
    assert a_then_c['b2'] is None
    assert unwrap(a_then_c['c3']) == 'c'
    assert unwrap(a_then_c[0]) == 'ac'
    assert unwrap(a_then_c[1]) == 'a'
    assert a_then_c[2] is None
    assert unwrap(a_then_c[3]) == 'c'
test_match_getitem()
@test
def test_re_fullmatch():
    """`fullmatch` succeeds only when the pattern covers the entire subject."""
    matching = [
        (r"a", "a", (0, 1)),
        (r"a|ab", "ab", (0, 2)),
        ("\xe0|\xe0\xdf", "\xe0\xdf", (0, 2)),
        (r".*?$", "abc", (0, 3)),
        (r".*?", "abc", (0, 3)),
        (r"a.*?b", "ab", (0, 2)),
        (r"a.*?b", "abb", (0, 3)),
        (r"a.*?b", "axxb", (0, 4)),
    ]
    for pattern, subject, expected in matching:
        assert re.fullmatch(pattern, subject).span() == expected

    failing = [
        (r"a+", "ab"),
        (r"abc$", "abc\n"),
        (r"(?m)abc$", "abc\n"),
    ]
    for pattern, subject in failing:
        assert re.fullmatch(pattern, subject) is None

    # With pos/endpos, only that window of the subject has to be covered.
    assert re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span() == (1, 3)
    # TODO: following fails; $ does not respect endpos in re2?
    #assert re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span() == (1, 3)
    assert re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span() == (1, 3)
test_re_fullmatch()
@test
def test_finditer():
    """`finditer` yields successive matches, optionally within pos/endpos."""
    subject = "a:b::c:::d"
    every_run = [":", "::", ":::"]

    matches = re.finditer(r":+", subject)
    assert cmp_list([found.group(0) for found in matches], every_run)

    colons = re.compile(r":+")

    # Bounds given positionally, by keyword, and by keyword in reverse order.
    matches = colons.finditer(subject, 1, 10)
    assert cmp_list([found.group(0) for found in matches], every_run)

    matches = colons.finditer(subject, pos=1, endpos=10)
    assert cmp_list([found.group(0) for found in matches], every_run)

    matches = colons.finditer(subject, endpos=10, pos=1)
    assert cmp_list([found.group(0) for found in matches], every_run)

    # A narrower window truncates the final run of colons.
    matches = colons.finditer(subject, pos=3, endpos=8)
    assert cmp_list([found.group(0) for found in matches], ["::", "::"])
test_finditer()
@test
def test_constants():
    """Each one-letter flag is equal to its long-named counterpart."""
    aliases = [
        (re.I, re.IGNORECASE),
        (re.L, re.LOCALE),
        (re.M, re.MULTILINE),
        (re.S, re.DOTALL),
        (re.X, re.VERBOSE),
    ]
    for short_flag, long_flag in aliases:
        assert short_flag == long_flag
test_constants()
@test
def test_anyall():
    """With DOTALL, '.' also matches newline characters."""
    for pattern, subject in [("a.b", "a\nb"), ("a.*b", "a\n\nb")]:
        # The whole subject, newlines included, is the expected match.
        assert unwrap(re.match(pattern, subject, re.DOTALL).group(0)) == subject
test_anyall()
@test
def test_groupdict():
    """`groupdict()` maps each named group to the text it matched."""
    named = re.match('(?P<first>first) (?P<second>second)', 'first second').groupdict()
    # Kept as one short-circuiting condition so lookups only run if the size is right.
    assert (
        len(named) == 2
        and unwrap(named['first']) == 'first'
        and unwrap(named['second']) == 'second'
    )
test_groupdict()
@test
def test_expand():
    """`Match.expand` substitutes numbered and named group references."""
    both = re.match("(?P<first>first) (?P<second>second)", "first second")
    assert both.expand(r"\2 \1 \g<second> \g<first>") == "second first second first"

    # A group that did not participate expands to the empty string.
    first_only = re.match("(?P<first>first)|(?P<second>second)", "first")
    assert first_only.expand(r"\2 \g<second>") == " "
test_expand()
@test
def test_getattr():
    """Attributes exposed by compiled patterns and by match objects."""
    unnamed = re.compile("(?i)(a)(b)")
    assert unnamed.pattern == "(?i)(a)(b)"
    # TODO: Codon does not support flags like this
    # self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
    assert unnamed.groups == 2
    assert unnamed.groupindex == {}
    assert re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex == {'first': 1, 'other': 2}

    found = re.match("(a)", "a")
    assert found.pos == 0
    assert found.endpos == 1
    assert found.string == "a"
    assert found.re
test_getattr()
def check_match(pattern, text, match: Optional[str] = None, span: Optional[re.Span] = None, matcher = re.fullmatch):
    """Return True when `matcher(pattern, text)` gives the expected match and span.

    When neither `match` nor `span` is given, the pattern is expected to
    match the whole of `text`. Supplying exactly one of them raises
    ValueError.
    """
    # Exactly one of the two being None is a usage error.
    if (match is None) != (span is None):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if match is None:
        # the pattern matches the whole text
        match = text
        span = (0, len(text))
    found = matcher(pattern, text)
    if not found:
        return False
    return found.group() == match and found.span() == span
LITERAL_CHARS = string.ascii_letters + string.digits
@test
def test_re_escape():
    """`re.escape` output matches its input literally and leaves alphanumerics alone."""
    every_char = ''.join(chr(code) for code in range(256))
    for ch in every_char:
        # Escaped on its own, and escaped inside a character class.
        assert check_match(re.escape(ch), ch)
        assert check_match('[' + re.escape(ch) + ']', ch)
    assert check_match(re.escape(every_char), every_char)

    # These punctuation characters must come back backslash-prefixed.
    for ch in '-.]{}':
        assert re.escape(ch)[:1] == '\\'

    assert re.escape(LITERAL_CHARS) == LITERAL_CHARS
test_re_escape()