codon/stdlib/internal/str.codon

1485 lines
43 KiB
Python

# Copyright (C) 2022-2023 Exaloop Inc. <https://exaloop.io>
# Largest signed 64-bit integer (2**63 - 1). Used below as the "no limit"
# value when a caller passes a negative maxsplit/maxcount, and as the bound
# in the length-overflow checks.
_MAX: Static[int] = 0x7FFFFFFFFFFFFFFF
@extend
class str:
# Magic methods
def __hash__(self) -> int:
    """
    Return a polynomial hash of the string's bytes.

    Starting from zero, each byte is folded in left to right as
    acc = acc * 31 + byte. The empty string hashes to 0.
    """
    acc = 0
    data = self.ptr
    for k in range(self.len):
        acc = acc * 31 + int(data[k])
    return acc
def __lt__(self, other: str) -> bool:
    """Return True if self orders strictly before other according to _cmp()."""
    order = self._cmp(other)
    return order < 0
def __le__(self, other: str) -> bool:
    """Return True if self orders before or equal to other according to _cmp()."""
    order = self._cmp(other)
    return order <= 0
def __gt__(self, other: str) -> bool:
    """Return True if self orders strictly after other according to _cmp()."""
    order = self._cmp(other)
    return order > 0
def __ge__(self, other: str) -> bool:
    """Return True if self orders after or equal to other according to _cmp()."""
    order = self._cmp(other)
    return order >= 0
def __repr__(self) -> str:
    """
    Return a quoted, escaped representation of the string.

    Single quotes are used unless the string contains a single quote and
    no double quote, in which case double quotes are used. Newline,
    carriage return, tab, backslash and the chosen quote character are
    backslash-escaped; any other byte outside 32..126 is emitted as a
    two-digit lowercase hex escape (backslash, 'x', two digits).
    """
    out = _strbuf(len(self) + 2)
    # First pass: decide which quote character to wrap the result in.
    has_single = False
    has_double = False
    for ch in self:
        if ch == "'":
            has_single = True
        elif ch == '"':
            has_double = True
    quote, quote_esc = "'", "\\'"
    if has_single and not has_double:
        quote, quote_esc = '"', '\\"'
    hexdigits = "0123456789abcdef"
    # Second pass: emit each byte, escaped where needed.
    out.append(quote)
    for ch in self:
        if ch == "\n":
            out.append("\\n")
        elif ch == "\r":
            out.append("\\r")
        elif ch == "\t":
            out.append("\\t")
        elif ch == "\\":
            out.append("\\\\")
        elif ch == quote:
            out.append(quote_esc)
        else:
            code = int(ch.ptr[0])
            if 32 <= code <= 126:
                out.append(ch)
            else:
                out.append("\\x")
                out.append(hexdigits[code // 16])
                out.append(hexdigits[code % 16])
    out.append(quote)
    return out.__str__()
def __getitem__(self, idx: int) -> str:
    """
    Return the one-byte string at position idx.

    A negative idx counts from the end. Raises IndexError when the
    resulting position is outside [0, len(self)).
    """
    n = len(self)
    pos = idx + n if idx < 0 else idx
    if pos < 0 or pos >= n:
        raise IndexError("string index out of range")
    return str(self.ptr + pos, 1)
def __getitem__(self, s: Slice) -> str:
    """
    Return the substring selected by slice s.

    A slice with no start, stop or step returns self.__copy__(). A slice
    without a step returns a string built directly from a pointer into
    this string's buffer; a slice with a step gathers the selected bytes
    into a new buffer via _make_from_range().
    """
    if s.step is None:
        if s.start is None and s.stop is None:
            return self.__copy__()
        start, stop, step, length = s.adjust_indices(len(self))
        return str(self.ptr + start, length)
    start, stop, step, length = s.adjust_indices(len(self))
    return self._make_from_range(start, stop, step, length)
def _make_from_range(self, start: int, stop: int, step: int, length: int) -> str:
    """
    Copy the bytes at indices range(start, stop, step) into a new buffer
    of `length` bytes and return it as a string. The indices are not
    validated here.
    """
    out = Ptr[byte](length)
    src = self.ptr
    w = 0
    for r in range(start, stop, step):
        out[w] = src[r]
        w += 1
    return str(out, length)
def __iter__(self) -> Generator[str]:
    """Yield each byte of the string, first to last, as a one-byte string."""
    base = self.ptr
    for k in range(len(self)):
        yield str(base + k, 1)
def __reversed__(self) -> Generator[str]:
    """Yield each byte of the string, last to first, as a one-byte string."""
    base = self.ptr
    for k in range(len(self) - 1, -1, -1):
        yield str(base + k, 1)
def __mul__(self, x: int) -> str:
    """
    Return the string repeated x times.

    A count of zero or less yields the empty string, as in Python.
    Without this guard a negative x would request a negative-sized
    buffer and produce a string with a negative length.
    """
    if x <= 0 or self.len == 0:
        return ""
    total = x * self.len
    p = Ptr[byte](total)
    n = 0
    for _ in range(x):
        str.memcpy(p + n, self.ptr, self.len)
        n += self.len
    return str(p, total)
def _cmp(self, other: str) -> int:
    """
    Three-way bytewise comparison.

    Returns the difference of the first pair of differing bytes, or the
    difference in lengths when one string is a prefix of the other
    (zero when the strings are equal).
    """
    a, b = self.ptr, other.ptr
    for k in range(min(self.len, other.len)):
        if a[k] != b[k]:
            return int(a[k]) - int(b[k])
    return self.len - other.len
import algorithms.strings as algorithms
@extend
class str:
def __contains__(self, pattern: str) -> bool:
    """Return True if find() locates pattern somewhere in this string."""
    idx = self.find(pattern)
    return idx >= 0
# Helper methods
def _isdigit(a: byte) -> bool:
    """Return True if _C.isdigit reports a non-zero result for byte a."""
    code = i32(int(a))
    return _C.isdigit(code) != i32(0)
def _isspace(a: byte) -> bool:
    """Return True if _C.isspace reports a non-zero result for byte a."""
    code = i32(int(a))
    return _C.isspace(code) != i32(0)
def _isupper(a: byte) -> bool:
    """Return True if _C.isupper reports a non-zero result for byte a."""
    code = i32(int(a))
    return _C.isupper(code) != i32(0)
def _islower(a: byte) -> bool:
    """Return True if _C.islower reports a non-zero result for byte a."""
    code = i32(int(a))
    return _C.islower(code) != i32(0)
def _isalpha(a: byte) -> bool:
    """Return True if _C.isalpha reports a non-zero result for byte a."""
    code = i32(int(a))
    return _C.isalpha(code) != i32(0)
def _isalnum(a: byte) -> bool:
    """Return True if _C.isalnum reports a non-zero result for byte a."""
    code = i32(int(a))
    return _C.isalnum(code) != i32(0)
def _toupper(a: byte) -> byte:
    """Return byte a mapped through _C.toupper."""
    mapped = _C.toupper(i32(int(a)))
    return byte(int(mapped))
def _tolower(a: byte) -> byte:
    """Return byte a mapped through _C.tolower."""
    mapped = _C.tolower(i32(int(a)))
    return byte(int(mapped))
def _slice(self, i: int, j: int) -> str:
    """
    Return the substring [i, j), built from a pointer into this string's
    buffer. No bounds checking or index normalisation is done here.
    """
    length = j - i
    return str(self.ptr + i, length)
def _at(self, i: int) -> str:
    """Return the one-byte string at index i; i is not bounds-checked."""
    return self._slice(i, i + 1)
def join(self, l: Generator[str]) -> str:
    """
    Concatenate the strings produced by generator l, inserting this
    string between consecutive items. An empty generator yields "".
    """
    out = _strbuf()
    use_sep = len(self) != 0
    started = False
    for item in l:
        # The separator goes before every item except the first.
        if started and use_sep:
            out.append(self)
        out.append(item)
        started = True
    return out.__str__()
def join(self, l: List[str]) -> str:
    """
    Concatenate the strings in list l, inserting this string between
    consecutive items.

    An empty list gives "", a single-item list gives that item itself,
    and an empty separator delegates to str.cat().
    """
    count = len(l)
    if count == 0:
        return ""
    if count == 1:
        return l[0]
    sep_len = len(self)
    if sep_len == 0:
        return str.cat(l)
    # Total size: every item plus one separator between adjacent items.
    total = 0
    for k in range(count):
        total += len(l[k])
        if k < count - 1:
            total += sep_len
    out = Ptr[byte](total)
    pos = 0
    for k in range(count):
        if k > 0:
            str.memcpy(out + pos, self.ptr, sep_len)
            pos += sep_len
        piece = l[k]
        str.memcpy(out + pos, piece.ptr, len(piece))
        pos += len(piece)
    return str(out, total)
def isdigit(self) -> bool:
    """
    str.isdigit() -> bool
    Return True if the string is non-empty and every byte is a digit
    according to str._isdigit(), False otherwise.
    """
    n = len(self)
    i = 0
    while i < n:
        if not str._isdigit(self.ptr[i]):
            return False
        i += 1
    # An empty string has no failing byte but must still report False.
    return n > 0
def islower(self) -> bool:
    """
    str.islower() -> bool
    Return True if the string contains at least one lowercase byte and
    no uppercase byte, False otherwise (including for the empty string).
    """
    n = len(self)
    # Single-byte strings are answered directly.
    if n == 1:
        return str._islower(self.ptr[0])
    seen_lower = False
    i = 0
    while i < n:
        b = self.ptr[i]
        if str._isupper(b):
            return False
        if not seen_lower and str._islower(b):
            seen_lower = True
        i += 1
    return seen_lower
def isupper(self) -> bool:
    """
    str.isupper() -> bool
    Return True if the string contains at least one uppercase byte and
    no lowercase byte, False otherwise (including for the empty string).
    """
    n = len(self)
    # Single-byte strings are answered directly.
    if n == 1:
        return str._isupper(self.ptr[0])
    seen_upper = False
    i = 0
    while i < n:
        b = self.ptr[i]
        if str._islower(b):
            return False
        if not seen_upper and str._isupper(b):
            seen_upper = True
        i += 1
    return seen_upper
def isalnum(self) -> bool:
    """
    str.isalnum() -> bool
    Return True if the string is non-empty and every byte is
    alphanumeric according to str._isalnum(), False otherwise.
    """
    n = len(self)
    i = 0
    while i < n:
        if not str._isalnum(self.ptr[i]):
            return False
        i += 1
    return n > 0
def isalpha(self) -> bool:
    """
    str.isalpha() -> bool
    Return True if the string is non-empty and every byte is alphabetic
    according to str._isalpha(), False otherwise.
    """
    n = len(self)
    i = 0
    while i < n:
        if not str._isalpha(self.ptr[i]):
            return False
        i += 1
    return n > 0
def isspace(self) -> bool:
    """
    str.isspace() -> bool
    Return True if the string is non-empty and every byte is whitespace
    according to str._isspace(), False otherwise.
    """
    n = len(self)
    i = 0
    while i < n:
        if not str._isspace(self.ptr[i]):
            return False
        i += 1
    return n > 0
def istitle(self) -> bool:
    """
    str.istitle() -> bool
    Return True if the string is titlecased and contains at least one
    cased byte: an uppercase byte may only follow an uncased byte (or
    start the string) and a lowercase byte may only follow a cased one.
    Return False otherwise, including for the empty string.
    """
    seen_cased = False
    in_word = False  # True when the previous byte was cased
    i = 0
    n = len(self)
    while i < n:
        b = self.ptr[i]
        if str._isupper(b):
            # Uppercase in the middle of a word breaks title case.
            if in_word:
                return False
            in_word = True
            seen_cased = True
        elif str._islower(b):
            # Lowercase at the start of a word breaks title case.
            if not in_word:
                return False
            in_word = True
            seen_cased = True
        else:
            in_word = False
        i += 1
    return seen_cased
def capitalize(self) -> str:
    """
    str.capitalize() -> copy of str
    Return a new string whose first byte is upper-cased and whose
    remaining bytes are lower-cased (via str._toupper / str._tolower).
    The empty string maps to "".
    """
    n = len(self)
    if n == 0:
        return ""
    out = Ptr[byte](n)
    out[0] = str._toupper(self.ptr[0])
    i = 1
    while i < n:
        out[i] = str._tolower(self.ptr[i])
        i += 1
    return str(out, n)
def isdecimal(self) -> bool:
    """
    str.isdecimal() -> bool
    Return True if the string is non-empty and every byte is one of the
    ASCII digits '0'..'9' (byte values 48..57), False otherwise.
    """
    n = len(self)
    if n == 0:
        return False
    i = 0
    while i < n:
        code = int(self.ptr[i])
        if code < 48 or code > 57:
            return False
        i += 1
    return True
def lower(self) -> str:
    """
    str.lower() -> copy of str
    Return a new string with every byte mapped through str._tolower().
    The empty string maps to "".
    """
    n = len(self)
    if n == 0:
        return ""
    out = Ptr[byte](n)
    i = 0
    while i < n:
        out[i] = str._tolower(self.ptr[i])
        i += 1
    return str(out, n)
def upper(self) -> str:
    """
    str.upper() -> copy of str
    Return a new string with every byte mapped through str._toupper().
    The empty string maps to "".
    """
    n = len(self)
    if n == 0:
        return ""
    out = Ptr[byte](n)
    i = 0
    while i < n:
        out[i] = str._toupper(self.ptr[i])
        i += 1
    return str(out, n)
def isascii(self) -> bool:
    """
    str.isascii() -> bool
    Return True if the string is empty or every byte value is below 128,
    False otherwise.
    """
    i = 0
    n = len(self)
    while i < n:
        if int(self.ptr[i]) > 127:
            return False
        i += 1
    return True
def casefold(self) -> str:
    """
    str.casefold() -> copy of str
    Return a version of the string for caseless comparison. Unlike
    Python, this is simply lower(), so only ASCII letters are affected.
    """
    folded = self.lower()
    return folded
def swapcase(self) -> str:
    """
    str.swapcase() -> copy of str
    Return a new string in which lowercase bytes are upper-cased,
    uppercase bytes are lower-cased, and all other bytes are unchanged.
    """
    n = len(self)
    if n == 0:
        return ""
    out = Ptr[byte](n)
    for i in range(n):
        b = self.ptr[i]
        swapped = b
        if str._islower(b):
            swapped = str._toupper(b)
        elif str._isupper(b):
            swapped = str._tolower(b)
        out[i] = swapped
    return str(out, n)
def title(self) -> str:
    """
    str.title() -> copy of str
    Return a titlecased copy: a cased byte that follows an uncased byte
    (or starts the string) is upper-cased, and a cased byte that follows
    another cased byte is lower-cased. Uncased bytes are copied as-is.
    """
    n = len(self)
    if n == 0:
        return ""
    out = Ptr[byte](n)
    in_word = False  # True when the previous byte was cased
    for i in range(n):
        b = self.ptr[i]
        mapped = b
        if str._islower(b):
            if not in_word:
                mapped = str._toupper(b)
            in_word = True
        elif str._isupper(b):
            if in_word:
                mapped = str._tolower(b)
            in_word = True
        else:
            in_word = False
        out[i] = mapped
    return str(out, n)
def isnumeric(self) -> bool:
    """
    str.isnumeric() -> bool
    Return True if the string is a numeric string, False otherwise.
    Unlike Python, this is simply isdecimal(): the string must be
    non-empty and consist only of the ASCII digits '0'..'9'.
    """
    result = self.isdecimal()
    return result
def _build(*args):
    """
    Concatenate the given pieces into one newly allocated string.

    Each argument is either a str, copied once, or a (str, count) tuple,
    copied `count` times in a row.
    """
    # Pass 1: total size of the result.
    size = 0
    for part in args:
        if isinstance(part, str):
            size += len(part)
        else:
            size += len(part[0]) * part[1]
    # Pass 2: copy every piece into place.
    buf = Ptr[byte](size)
    off = 0
    for part in args:
        if isinstance(part, str):
            str.memcpy(buf + off, part.ptr, part.len)
            off += part.len
        else:
            text, reps = part
            for _ in range(reps):
                str.memcpy(buf + off, text.ptr, text.len)
                off += text.len
    return str(buf, size)
def ljust(self, width: int, fillchar: str = " ") -> str:
    """
    ljust(width[, fillchar]) -> string
    Return the string left-justified in a field of `width` bytes, padded
    on the right with fillchar (default: a space). The string itself is
    returned when it is already at least `width` long.
    Raises ValueError if fillchar is not exactly one byte long.
    """
    if len(fillchar) != 1:
        raise ValueError("The fill character must be exactly one character long")
    pad = width - len(self)
    if pad <= 0:
        return self
    return str._build(self, (fillchar, pad))
def rjust(self, width: int, fillchar: str = " ") -> str:
    """
    rjust(width[, fillchar]) -> string
    Return the string right-justified in a field of `width` bytes,
    padded on the left with fillchar (default: a space). The string
    itself is returned when it is already at least `width` long.
    Raises ValueError if fillchar is not exactly one byte long.
    """
    if len(fillchar) != 1:
        raise ValueError("The fill character must be exactly one character long")
    pad = width - len(self)
    if pad <= 0:
        return self
    return str._build((fillchar, pad), self)
def center(self, width: int, fillchar: str = " ") -> str:
    """
    str.center(width[, fillchar]) -> string
    Return the string centered in a field of `width` bytes, padded on
    both sides with fillchar (default: a space). The string itself is
    returned when it is already at least `width` long.
    Raises ValueError if fillchar is not exactly one byte long.
    """
    if len(fillchar) != 1:
        raise ValueError("The fill character must be exactly one character long")
    if width <= len(self):
        return self
    pad = width - len(self)
    # Same rounding as CPython: when both the padding and the width are
    # odd, the extra fill byte goes on the left ("ab".center(5) is
    # "  ab "); otherwise it goes on the right.
    left_pad = pad // 2 + (pad & width & 1)
    right_pad = pad - left_pad
    return str._build((fillchar, left_pad), self, (fillchar, right_pad))
def zfill(self, width: int) -> str:
    """
    str.zfill(width) -> string
    Pad the string on the left with '0' bytes to a total length of
    `width`. A leading '+' or '-' stays in front of the padding. The
    string is never truncated; it is returned as-is when already at
    least `width` long.
    """
    n = len(self)
    if n >= width:
        return self
    padded = self.rjust(width, '0')
    shift = width - n  # index of the original first byte inside `padded`
    buf = padded.ptr
    if n > 0:
        sign = buf[shift]
        if sign == byte(43) or sign == byte(45):  # '+' or '-'
            # Move the sign to the front and put a zero where it was.
            buf[0] = sign
            buf[shift] = byte(48)  # '0'
    return padded
def count(self, sub: str, start: int = 0, end: Optional[int] = None) -> int:
    """
    str.count(sub[, start[, end]]) -> int
    Return the number of occurrences of sub in str[start:end], as
    counted by algorithms.count(). start and end are interpreted as in
    slice notation; end defaults to the length of the string.
    """
    end: int = end if end is not None else len(self)
    lo, hi = self._correct_indices(start, end)
    # A window shorter than sub cannot contain it.
    if hi - lo < len(sub):
        return 0
    window = self._slice(lo, hi)
    return algorithms.count(window, sub)
def find(self, sub: str, start: int = 0, end: Optional[int] = None) -> int:
    """
    str.find(sub [,start [,end]]) -> int
    Return the lowest index at which sub occurs within str[start:end],
    or -1 if the window is too short to hold it. start and end are
    interpreted as in slice notation. A negative result from the
    underlying search is returned unchanged.
    """
    end: int = end if end is not None else len(self)
    lo, hi = self._correct_indices(start, end)
    if hi - lo < len(sub):
        return -1
    rel = algorithms.find(self._slice(lo, hi), sub)
    if rel < 0:
        return rel
    # Convert the window-relative position back to an index into self.
    return rel + lo
def rfind(self, sub: str, start: int = 0, end: Optional[int] = None) -> int:
    """
    str.rfind(sub [,start [,end]]) -> int
    Return the highest index at which sub occurs within str[start:end],
    or -1 if the window is too short to hold it. start and end are
    interpreted as in slice notation. A negative result from the
    underlying search is returned unchanged.
    """
    end: int = end if end is not None else len(self)
    lo, hi = self._correct_indices(start, end)
    if hi - lo < len(sub):
        return -1
    rel = algorithms.rfind(self._slice(lo, hi), sub)
    if rel < 0:
        return rel
    # Convert the window-relative position back to an index into self.
    return rel + lo
def isidentifier(self) -> bool:
    """
    str.isidentifier() -> bool
    Return True if the string is a valid identifier, False otherwise:
    it must be non-empty, start with a letter or '_', and continue with
    letters, decimal digits or '_' only.
    Unlike Python, isidentifier() deals with just ASCII characters.
    """
    n = len(self)
    if n == 0:
        return False
    head = self._at(0)
    if not (head.isalpha() or head == "_"):
        return False
    i = 1
    while i < n:
        ch = self._at(i)
        if not (ch.isalpha() or ch.isdecimal() or ch == "_"):
            return False
        i += 1
    return True
def isprintable(self) -> bool:
    """
    str.isprintable() -> bool
    Return True if the string is empty or every byte is a printable
    ASCII character (byte values 32..126), False otherwise.
    Unlike Python, isprintable() deals with just ASCII characters.
    """
    for i in range(len(self)):
        # 127 (DEL) is a control character and is not printable; this is
        # the same 32..126 range that __repr__ treats as printable.
        if not (32 <= int(self.ptr[i]) <= 126):
            return False
    return True
def _has_char(self, chars: str) -> bool:
    """
    Test the first byte of this string against a strip set.

    With a non-empty `chars`, return True if the first byte equals any
    byte of `chars`; with an empty `chars`, return True if it is
    whitespace. The string is expected to be non-empty (index 0 is read
    without a length check).
    """
    head = self._at(0)
    if not chars:
        return head.isspace()
    for cand in chars:
        if head == cand:
            return True
    return False
def lstrip(self, chars: str = "") -> str:
    """
    str.lstrip([chars]) -> string
    Return the string with leading whitespace removed. If a non-empty
    chars is given, leading bytes found in chars are removed instead.
    Unlike Python, lstrip() deals with just ASCII characters.
    """
    n = len(self)
    first = 0
    while first < n and self._at(first)._has_char(chars):
        first += 1
    return self._slice(first, n)
def rstrip(self, chars: str = "") -> str:
    """
    str.rstrip([chars]) -> string
    Return the string with trailing whitespace removed. If a non-empty
    chars is given, trailing bytes found in chars are removed instead.
    Unlike Python, rstrip() deals with just ASCII characters.
    """
    stop = len(self)  # exclusive end of the part that is kept
    while stop > 0 and self._at(stop - 1)._has_char(chars):
        stop -= 1
    return self._slice(0, stop)
def strip(self, chars: str = "") -> str:
    """
    str.strip([chars]) -> string
    Return the string with leading and trailing whitespace removed, by
    applying lstrip() and then rstrip(). If a non-empty chars is given,
    bytes found in chars are removed instead.
    Unlike Python, strip() deals with just ASCII characters.
    """
    left_trimmed = self.lstrip(chars)
    return left_trimmed.rstrip(chars)
def partition(self, sep: str) -> Tuple[str, str, str]:
    """
    Split the string at the first occurrence of sep and return
    (before, sep, after). If sep is not found, return the string itself
    followed by two empty strings.
    Raises ValueError if sep is empty.
    """
    if not sep:
        raise ValueError("empty separator")
    idx = algorithms.find(self, sep)
    if idx < 0:
        return self, "", ""
    head = self._slice(0, idx)
    tail = self._slice(idx + len(sep), len(self))
    return head, sep, tail
def rpartition(self, sep: str) -> Tuple[str, str, str]: # XXX
    """
    Split the string at the last occurrence of sep and return
    (before, sep, after). If sep is not found, return two empty strings
    followed by the string itself.
    Raises ValueError if sep is empty.
    """
    if not sep:
        raise ValueError("empty separator")
    idx = algorithms.rfind(self, sep)
    if idx < 0:
        return "", "", self
    head = self._slice(0, idx)
    tail = self._slice(idx + len(sep), len(self))
    return head, sep, tail
def split(self, sep: Optional[str] = None, maxsplit: int = -1) -> List[str]:
    """
    str.split([sep [,maxsplit]]) -> list of strings
    Return the pieces of the string separated by sep, scanning from the
    left. At most maxsplit splits are made; a negative maxsplit means no
    limit. With sep omitted, runs of whitespace separate the pieces and
    empty pieces are dropped.
    Raises ValueError if sep is the empty string.
    """
    limit = maxsplit if maxsplit >= 0 else _MAX
    if sep is None:
        return self._split_whitespace(limit)
    sep: str = sep
    sep_len = len(sep)
    if sep_len == 0:
        raise ValueError("empty separator")
    # Single-byte separators take the dedicated fast path.
    if sep_len == 1:
        return self._split_char(sep.ptr[0], limit)
    MAX_PREALLOC = 12
    capacity = MAX_PREALLOC if limit >= MAX_PREALLOC else limit + 1
    parts = List[str](capacity=capacity)
    n = len(self)
    pos = 0  # start of the piece currently being scanned
    remaining = limit
    while remaining > 0:
        remaining -= 1
        rel = algorithms.find(self._slice(pos, n), sep)
        if rel < 0:
            break
        cut = pos + rel
        parts.append(self._slice(pos, cut))
        pos = cut + sep_len
    # Whatever is left after the last split is the final piece.
    parts.append(self._slice(pos, n))
    return parts
def rsplit(self, sep: Optional[str] = None, maxsplit: int = -1) -> List[str]:
    """
    str.rsplit([sep [,maxsplit]]) -> list of strings
    Return the pieces of the string separated by sep, scanning from the
    right. At most maxsplit splits are made; a negative maxsplit means
    no limit. With sep omitted, runs of whitespace separate the pieces.
    Raises ValueError if sep is the empty string.
    """
    limit = maxsplit if maxsplit >= 0 else _MAX
    if sep is None:
        return self._rsplit_whitespace(limit)
    sep: str = sep
    sep_len = len(sep)
    if sep_len == 0:
        raise ValueError("empty separator")
    # Single-byte separators take the dedicated fast path.
    if sep_len == 1:
        return self._rsplit_char(sep.ptr[0], limit)
    MAX_PREALLOC = 12
    capacity = MAX_PREALLOC if limit >= MAX_PREALLOC else limit + 1
    parts = List[str](capacity=capacity)
    hi = len(self)  # exclusive end of the part not yet split
    remaining = limit
    while remaining > 0:
        remaining -= 1
        idx = algorithms.rfind(self._slice(0, hi), sep)
        if idx < 0:
            break
        parts.append(self._slice(idx + sep_len, hi))
        hi = idx
    parts.append(self._slice(0, hi))
    # Pieces were collected right to left; restore left-to-right order.
    parts.reverse()
    return parts
def splitlines(self, keepends: bool = False) -> List[str]:
    """
    str.splitlines([keepends]) -> list of strings
    Return the lines of the string, breaking at "\\n", "\\r" and "\\r\\n".
    Line breaks are included in the returned lines only when keepends
    is true.
    """
    lines = []
    n = len(self)
    cr = byte(13)  # \r
    lf = byte(10)  # \n
    pos = 0    # scan cursor
    begin = 0  # start of the line currently being scanned
    while pos < n:
        # Advance to the next line break (or the end of the string).
        while pos < n:
            b = self.ptr[pos]
            if b == cr or b == lf:
                break
            pos += 1
        eol = pos
        if pos < n:
            # Treat "\r\n" as a single two-byte line break.
            if self.ptr[pos] == cr and pos + 1 < n and self.ptr[pos + 1] == lf:
                pos += 2
            else:
                pos += 1
            if keepends:
                eol = pos
        # The whole string is a single line: reuse it instead of slicing.
        if begin == 0 and eol == n:
            lines.append(self)
            break
        lines.append(self._slice(begin, eol))
        begin = pos
    return lines
def startswith(
    self, prefix: str, start: int = 0, end: Optional[int] = None
) -> bool:
    """
    str.startswith(prefix[, start[, end]]) -> bool
    Return True if str[start:end] starts with prefix, False otherwise.
    start and end are interpreted as in slice notation; end defaults to
    the length of the string.
    """
    end: int = end if end is not None else len(self)
    # Normalise both bounds the same way count()/find() do. Previously a
    # negative start was left untouched whenever end was also negative,
    # and neither bound was clamped, so _slice() could be asked to read
    # before the start of the buffer.
    start, end = self._correct_indices(start, end)
    # prefix would run past the end of the string
    if start + len(prefix) > len(self):
        return False
    # prefix is longer than the window str[start:end]
    if end - start < len(prefix):
        return False
    if not prefix:
        return True
    return prefix == self._slice(start, start + len(prefix))
def endswith(self, suffix: str, start: int = 0, end: Optional[int] = None) -> bool:
    """
    str.endswith(suffix[, start[, end]]) -> bool
    Return True if str[start:end] ends with suffix, False otherwise.
    start and end are interpreted as in slice notation; end defaults to
    the length of the string.
    """
    end: int = end if end is not None else len(self)
    # Normalise both bounds the same way count()/find() do. Previously a
    # negative start was left untouched whenever end was also negative,
    # which made the window look larger than it is and could report a
    # match that lies outside str[start:end].
    start, end = self._correct_indices(start, end)
    # suffix is longer than the window, or the window starts past the end
    if end - start < len(suffix) or start > len(self):
        return False
    if not suffix:
        return True
    # Compare against the last len(suffix) bytes of the window.
    tail_start = end - len(suffix)
    return suffix == self._slice(tail_start, end)
def index(self, sub: str, start: int = 0, end: Optional[int] = None) -> int:
    """
    str.index(sub [,start [,end]]) -> int
    Like str.find(), but raise ValueError when find() returns -1.
    """
    pos = self.find(sub, start, end)
    if pos != -1:
        return pos
    raise ValueError("substring not found")
def rindex(self, sub: str, start: int = 0, end: Optional[int] = None) -> int:
    """
    str.rindex(sub [,start [,end]]) -> int
    Like str.rfind(), but raise ValueError when rfind() returns -1.
    """
    pos = self.rfind(sub, start, end)
    if pos != -1:
        return pos
    raise ValueError("substring not found")
def replace(self, old: str, new: str, maxcount: int = -1) -> str:
    """
    str.replace(old, new[, count]) -> string
    Return the string with occurrences of old replaced by new. If
    maxcount is given and non-negative, only the first maxcount
    occurrences are replaced. The work is delegated to _replace().
    """
    replaced = self._replace(old, new, maxcount)
    return replaced
def expandtabs(self, tabsize: int = 8) -> str:
    """
    str.expandtabs([tabsize]) -> string
    Return a new string in which each tab is replaced by spaces up to
    the next multiple of tabsize (default 8). The column count restarts
    after every "\\n" or "\\r". With tabsize <= 0, tabs are removed.
    Raises OverflowError if the result length would exceed _MAX.
    """
    tab = byte(9)     # \t
    lf = byte(10)     # \n
    cr = byte(13)     # \r
    space = byte(32)  # ' '
    src = self.ptr
    n = len(self)

    # Pass 1: compute the length of the expanded string.
    done = 0  # total length of the lines completed so far
    col = 0   # column within the current line
    for k in range(n):
        b = src[k]
        if b == tab:
            if tabsize > 0:
                incr = tabsize - (col % tabsize)
                if col > _MAX - incr:
                    raise OverflowError("result too long")
                col += incr
        else:
            if col > _MAX - 1:
                raise OverflowError("result too long")
            col += 1
            if b == lf or b == cr:
                if done > _MAX - col:
                    raise OverflowError("result too long")
                done += col
                col = 0
    if done > _MAX - col:
        raise OverflowError("result too long")
    out_len = done + col
    out = Ptr[byte](out_len)

    # Pass 2: write the expanded bytes.
    w = 0
    col = 0
    for k in range(n):
        b = src[k]
        if b == tab:
            if tabsize > 0:
                fill = tabsize - (col % tabsize)
                col += fill
                for _ in range(fill):
                    out[w] = space
                    w += 1
        else:
            col += 1
            out[w] = b
            w += 1
            if b == lf or b == cr:
                col = 0
    return str(out, out_len)
def translate(self, map) -> str:
    """
    Return a new string with each byte mapped through the given table.

    `map` is looked up by the integer value of each byte. A byte that
    is not in `map` is copied unchanged; a byte mapped to None is
    dropped; otherwise the mapped string is written in its place.
    """
    n = len(self)
    # Pass 1: size of the translated string.
    out_len = 0
    for i in range(n):
        key = int(self.ptr[i])
        if key not in map:
            out_len += 1
        else:
            val = map[key]
            if val is not None:
                out_len += len(val)
    # Pass 2: write the translated bytes.
    out = Ptr[byte](out_len)
    w = out
    for i in range(n):
        key = int(self.ptr[i])
        if key not in map:
            w[0] = self.ptr[i]
            w += 1
        else:
            val = map[key]
            if val is not None:
                str.memcpy(w, val.ptr, len(val))
                w += len(val)
    return str(out, out_len)
# Internal helpers
def _correct_indices(self, start: int, end: int) -> Tuple[int, int]:
    """
    Normalise a (start, end) pair against the length of this string.

    end is capped at len(self); a negative start or end is offset by
    len(self) and then floored at 0. start is not capped from above.
    """
    n = len(self)
    lo = start
    hi = end
    if hi > n:
        hi = n
    elif hi < 0:
        hi += n
        if hi < 0:
            hi = 0
    if lo < 0:
        lo += n
        if lo < 0:
            lo = 0
    return lo, hi
def _split_whitespace(self, maxcount: int) -> List[str]:
    """
    Split on runs of whitespace from the left, making at most maxcount
    splits. Leading whitespace is skipped; if the limit is reached, the
    rest of the string (minus its leading whitespace) becomes the final
    piece.
    """
    PREALLOC_MAX = 12
    words = List[str](PREALLOC_MAX if maxcount >= PREALLOC_MAX else maxcount + 1)
    n = len(self)
    pos = 0
    left = maxcount
    while left > 0:
        left -= 1
        while pos < n and str._isspace(self.ptr[pos]):
            pos += 1
        if pos == n:
            break
        begin = pos
        pos += 1
        while pos < n and not str._isspace(self.ptr[pos]):
            pos += 1
        words.append(self._slice(begin, pos))
    if pos < n:
        # Skip whitespace, then keep whatever remains as one piece.
        while pos < n and str._isspace(self.ptr[pos]):
            pos += 1
        if pos != n:
            words.append(self._slice(pos, n))
    return words
def _rsplit_whitespace(self, maxcount: int) -> List[str]:
    """
    Split on runs of whitespace from the right, making at most maxcount
    splits. Trailing whitespace is skipped; if the limit is reached, the
    rest of the string (minus its trailing whitespace) becomes the first
    piece. Pieces are returned in left-to-right order.
    """
    PREALLOC_MAX = 12
    words = List[str](PREALLOC_MAX if maxcount >= PREALLOC_MAX else maxcount + 1)
    pos = len(self) - 1
    left = maxcount
    while left > 0:
        left -= 1
        while pos >= 0 and str._isspace(self.ptr[pos]):
            pos -= 1
        if pos < 0:
            break
        last = pos  # index of the last byte of this word
        pos -= 1
        while pos >= 0 and not str._isspace(self.ptr[pos]):
            pos -= 1
        words.append(self._slice(pos + 1, last + 1))
    if pos >= 0:
        # Skip whitespace, then keep whatever remains as one piece.
        while pos >= 0 and str._isspace(self.ptr[pos]):
            pos -= 1
        if pos >= 0:
            words.append(self._slice(0, pos + 1))
    words.reverse()
    return words
def _split_char(self, char: byte, maxcount: int) -> List[str]:
    """
    Split on the single byte `char` from the left, making at most
    maxcount splits. Empty pieces are kept; the remainder of the string
    is always appended as the final piece.
    """
    PREALLOC_MAX = 12
    pieces = List[str](PREALLOC_MAX if maxcount >= PREALLOC_MAX else maxcount + 1)
    n = len(self)
    begin = 0  # start of the current piece
    pos = 0
    left = maxcount
    while pos < n and left > 0:
        if self.ptr[pos] == char:
            pieces.append(self._slice(begin, pos))
            begin = pos + 1
            left -= 1
        pos += 1
    pieces.append(self._slice(begin, n))
    return pieces
def _rsplit_char(self, char: byte, maxcount: int) -> List[str]:
    """
    Split on the single byte `char` from the right, making at most
    maxcount splits. Empty pieces are kept; the remaining head of the
    string is always included. Pieces are returned in left-to-right
    order.
    """
    PREALLOC_MAX = 12
    pieces = List[str](PREALLOC_MAX if maxcount >= PREALLOC_MAX else maxcount + 1)
    hi = len(self)  # exclusive end of the current piece
    pos = hi - 1
    left = maxcount
    while pos >= 0 and left > 0:
        if self.ptr[pos] == char:
            pieces.append(self._slice(pos + 1, hi))
            hi = pos
            left -= 1
        pos -= 1
    pieces.append(self._slice(0, hi))
    pieces.reverse()
    return pieces
def _findchar(self, c: byte):
    """
    Return the result of _C.memchr for byte c over this string's
    buffer. Callers in this file treat a falsy result as "not found"
    and otherwise use it as a pointer into the buffer.
    """
    needle = i32(int(c))
    return _C.memchr(self.ptr, needle, len(self))
def _countchar(self, c: byte, maxcount: int):
    """
    Count occurrences of byte c in this string, stopping as soon as the
    count reaches maxcount.
    """
    found = 0
    cur = self.ptr
    stop = cur + len(self)
    while True:
        hit = str(cur, stop - cur)._findchar(c)
        if not hit:
            break
        found += 1
        if found >= maxcount:
            break
        # Resume the search just past this occurrence.
        cur = hit + 1
    return found
def _replace_interleave(self, to: str, maxcount: int):
    """
    Insert `to` before the first byte and then after each following
    byte of this string, up to min(maxcount, len(self) + 1) insertions
    in total. This is the "replace the empty string" case.
    Raises OverflowError if the result length would exceed _MAX.
    """
    src = self.ptr
    n = len(self)
    ins = to.ptr
    ins_len = len(to)
    count = 0
    if maxcount <= n:
        count = maxcount
    else:
        count = n + 1
    # assert count > 0
    if ins_len > (_MAX - n) // count:
        raise OverflowError("replace bytes is too long")
    out_len = count * ins_len + n
    out = Ptr[byte](out_len)
    w = out
    # The first insertion goes in front; each remaining one follows a
    # copied source byte.
    pairs = count - 1
    k = 0
    if ins_len > 1:
        str.memcpy(w, ins, ins_len)
        w += ins_len
        while k < pairs:
            w[0] = src[k]
            w += 1
            str.memcpy(w, ins, ins_len)
            w += ins_len
            k += 1
    else:
        # Single-byte insert: plain byte stores instead of memcpy.
        w[0] = ins[0]
        w += ins_len
        while k < pairs:
            w[0] = src[k]
            w += 1
            w[0] = ins[0]
            w += ins_len
            k += 1
    # Copy the source bytes that were not interleaved.
    str.memcpy(w, src + k, n - k)
    return str(out, out_len)
def _replace_delete_single_character(self, from_c: byte, maxcount: int):
    """
    Return the string with up to maxcount occurrences of byte from_c
    removed. If from_c does not occur, the string itself is returned.
    """
    n = len(self)
    hits = self._countchar(from_c, maxcount)
    if hits == 0:
        return self
    out_len = n - hits
    # assert out_len >= 0
    out = Ptr[byte](out_len)
    w = out
    cur = self.ptr
    stop = cur + n
    for _ in range(hits):
        hit = str(cur, stop - cur)._findchar(from_c)
        if not hit:
            break
        # Copy the run before the occurrence, then skip the occurrence.
        run = hit - cur
        str.memcpy(w, cur, run)
        w += run
        cur = hit + 1
    # Copy whatever follows the last removed byte.
    str.memcpy(w, cur, stop - cur)
    return str(out, out_len)
def _replace_delete_substring(self, from_s: str, maxcount: int):
    """
    Return the string with up to maxcount occurrences of from_s
    removed. If from_s does not occur, the string itself is returned.
    """
    n = len(self)
    from_len = len(from_s)
    hits = algorithms.count_with_max(self, from_s, maxcount)
    if hits == 0:
        return self
    out_len = n - (hits * from_len)
    # assert out_len >= 0
    out = Ptr[byte](out_len)
    w = out
    cur = self.ptr
    stop = cur + n
    for _ in range(hits):
        offset = algorithms.find(str(cur, stop - cur), from_s)
        if offset == -1:
            break
        # Copy the run before the occurrence, then skip the occurrence.
        hit = cur + offset
        str.memcpy(w, cur, hit - cur)
        w += hit - cur
        cur = hit + from_len
    # Copy whatever follows the last removed occurrence.
    str.memcpy(w, cur, stop - cur)
    return str(out, out_len)
def _replace_single_character_in_place(self, from_c: byte, to_c: byte, maxcount: int):
    """
    Return a copy of the string with up to maxcount occurrences of byte
    from_c overwritten by byte to_c. If from_c does not occur, the
    string itself is returned. The first occurrence is always replaced.
    """
    n = len(self)
    first = self._findchar(from_c)
    if not first:
        return self
    out = Ptr[byte](n)
    str.memcpy(out, self.ptr, n)
    # Patch the first occurrence at the same offset inside the copy.
    cur = out + (first - self.ptr)
    cur[0] = to_c
    cur += 1
    stop = out + n
    # Keep patching the copy until the budget or the matches run out.
    for _ in range(maxcount - 1):
        hit = str(cur, stop - cur)._findchar(from_c)
        if not hit:
            break
        hit[0] = to_c
        cur = hit + 1
    return str(out, n)
def _replace_substring_in_place(self, from_s: str, to: str, maxcount: int):
    """
    Return a copy of the string with up to maxcount occurrences of
    from_s overwritten by `to`; len(from_s) bytes of `to` are copied per
    occurrence, so the two are expected to be the same length. If
    from_s does not occur, the string itself is returned. The first
    occurrence is always replaced.
    """
    n = len(self)
    from_len = len(from_s)
    repl = to.ptr
    offset = algorithms.find(self, from_s)
    if offset == -1:
        return self
    out = Ptr[byte](n)
    str.memcpy(out, self.ptr, n)
    # Patch the first occurrence inside the copy.
    cur = out + offset
    str.memcpy(cur, repl, from_len)
    cur += from_len
    stop = out + n
    # Keep patching the copy until the budget or the matches run out.
    for _ in range(maxcount - 1):
        offset = algorithms.find(str(cur, stop - cur), from_s)
        if offset == -1:
            break
        str.memcpy(cur + offset, repl, from_len)
        cur += offset + from_len
    return str(out, n)
def _replace_single_character(self, from_c: byte, to_s: str, maxcount: int):
    """
    Return the string with up to maxcount occurrences of byte from_c
    replaced by the string to_s. If from_c does not occur, the string
    itself is returned.
    Raises OverflowError if the result length would exceed _MAX.
    """
    n = len(self)
    to_len = len(to_s)
    hits = self._countchar(from_c, maxcount)
    if hits == 0:
        return self
    # assert hits > 0
    if to_len - 1 > (_MAX - n) // hits:
        raise OverflowError("replace bytes is too long")
    out_len = n + hits * (to_len - 1)
    out = Ptr[byte](out_len)
    w = out
    cur = self.ptr
    stop = cur + n
    for _ in range(hits):
        hit = str(cur, stop - cur)._findchar(from_c)
        if not hit:
            break
        # Copy the (possibly empty) run before the match, then to_s.
        gap = hit - cur
        str.memcpy(w, cur, gap)
        w += gap
        str.memcpy(w, to_s.ptr, to_len)
        w += to_len
        cur = hit + 1
    # Copy whatever follows the last replaced byte.
    str.memcpy(w, cur, stop - cur)
    return str(out, out_len)
def _replace_substring(self, from_s: str, to_s: str, maxcount: int):
    """
    Return the string with up to maxcount occurrences of from_s
    replaced by to_s. If from_s does not occur, the string itself is
    returned.
    Raises OverflowError if the result length would exceed _MAX.
    """
    n = len(self)
    from_len = len(from_s)
    to_len = len(to_s)
    hits = algorithms.count_with_max(self, from_s, maxcount)
    if hits == 0:
        return self
    # assert hits > 0
    if to_len - from_len > (_MAX - n) // hits:
        raise OverflowError("replace bytes is too long")
    out_len = n + hits * (to_len - from_len)
    out = Ptr[byte](out_len)
    w = out
    cur = self.ptr
    stop = cur + n
    for _ in range(hits):
        offset = algorithms.find(str(cur, stop - cur), from_s)
        if offset == -1:
            break
        # Copy the (possibly empty) run before the match, then to_s.
        str.memcpy(w, cur, offset)
        w += offset
        str.memcpy(w, to_s.ptr, to_len)
        w += to_len
        cur = cur + offset + from_len
    # Copy whatever follows the last replaced occurrence.
    str.memcpy(w, cur, stop - cur)
    return str(out, out_len)
def _replace(self, from_s: str, to_s: str, maxcount: int):
    """
    Dispatch a replace request to the specialised helper for its shape.

    A negative maxcount means "no limit". The string itself is returned
    when from_s is longer than the string, when maxcount is 0, or when
    both from_s and to_s are empty. Otherwise the helper is chosen by
    whether from_s is empty, whether to_s is empty (deletion), whether
    both have the same length (in-place overwrite), and whether from_s
    is a single byte.
    """
    from_len = len(from_s)
    to_len = len(to_s)
    if len(self) < from_len or maxcount == 0:
        return self
    limit = _MAX if maxcount < 0 else maxcount
    # Empty pattern: interleave to_s between the bytes of the string.
    if from_len == 0:
        if to_len == 0:
            return self
        return self._replace_interleave(to_s, limit)
    single = from_len == 1
    # Empty replacement: pure deletion.
    if to_len == 0:
        if single:
            return self._replace_delete_single_character(from_s.ptr[0], limit)
        return self._replace_delete_substring(from_s, limit)
    # Same length: overwrite inside a copy, no resizing needed.
    if from_len == to_len:
        if single:
            return self._replace_single_character_in_place(from_s.ptr[0], to_s.ptr[0], limit)
        return self._replace_substring_in_place(from_s, to_s, limit)
    # Different lengths: build a resized result.
    if single:
        return self._replace_single_character(from_s.ptr[0], to_s, limit)
    return self._replace_substring(from_s, to_s, limit)